llama_core/metadata/

//! Defines metadata for ggml models.

use super::BaseMetadata;
use chat_prompts::PromptTemplateType;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};

/// Builder for creating a [`GgmlMetadata`] instance.
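///
/// A minimal usage sketch (the model name and alias below are illustrative;
/// the doc test is marked `ignore` because it assumes the public module path
/// `llama_core::metadata::ggml`):
///
/// ```ignore
/// use chat_prompts::PromptTemplateType;
/// use llama_core::metadata::ggml::GgmlMetadataBuilder;
///
/// let metadata = GgmlMetadataBuilder::new(
///     "llama-2-7b-chat",
///     "default",
///     PromptTemplateType::Llama2Chat,
/// )
/// .with_ctx_size(4096)
/// .with_temperature(0.7)
/// .enable_plugin_log(true)
/// .build();
///
/// assert_eq!(metadata.ctx_size, 4096);
/// ```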
#[derive(Debug)]
pub struct GgmlMetadataBuilder {
    metadata: GgmlMetadata,
}
impl GgmlMetadataBuilder {
    pub fn new<S: Into<String>>(model_name: S, model_alias: S, pt: PromptTemplateType) -> Self {
        let metadata = GgmlMetadata {
            model_name: model_name.into(),
            model_alias: model_alias.into(),
            prompt_template: pt,
            ..Default::default()
        };

        Self { metadata }
    }

    pub fn with_prompt_template(mut self, template: PromptTemplateType) -> Self {
        self.metadata.prompt_template = template;
        self
    }

    pub fn enable_plugin_log(mut self, enable: bool) -> Self {
        self.metadata.log_enable = enable;
        self
    }

    pub fn enable_debug_log(mut self, enable: bool) -> Self {
        self.metadata.debug_log = enable;
        self
    }

    pub fn enable_prompts_log(mut self, enable: bool) -> Self {
        self.metadata.log_prompts = enable;
        self
    }

    pub fn enable_embeddings(mut self, enable: bool) -> Self {
        self.metadata.embeddings = enable;
        self
    }

    pub fn with_n_predict(mut self, n: i32) -> Self {
        self.metadata.n_predict = n;
        self
    }

    pub fn with_main_gpu(mut self, gpu: Option<u64>) -> Self {
        self.metadata.main_gpu = gpu;
        self
    }

    pub fn with_tensor_split(mut self, split: Option<String>) -> Self {
        self.metadata.tensor_split = split;
        self
    }

    pub fn with_threads(mut self, threads: u64) -> Self {
        self.metadata.threads = threads;
        self
    }

    pub fn with_reverse_prompt(mut self, prompt: Option<String>) -> Self {
        self.metadata.reverse_prompt = prompt;
        self
    }

    pub fn with_mmproj(mut self, path: Option<String>) -> Self {
        self.metadata.mmproj = path;
        self
    }

    pub fn with_image(mut self, path: impl Into<String>) -> Self {
        self.metadata.image = Some(path.into());
        self
    }

    pub fn with_n_gpu_layers(mut self, n: u64) -> Self {
        self.metadata.n_gpu_layers = n;
        self
    }

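    /// Disable memory-mapped model loading. Note the inversion: passing
    /// `Some(true)` stores `use_mmap = Some(false)`, passing `Some(false)`
    /// stores `Some(true)`, and `None` clears the field so it is skipped
    /// during serialization.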
    pub fn disable_mmap(mut self, disable: Option<bool>) -> Self {
        self.metadata.use_mmap = disable.map(|v| !v);
        self
    }

    pub fn with_split_mode(mut self, mode: String) -> Self {
        self.metadata.split_mode = mode;
        self
    }

    pub fn with_ctx_size(mut self, size: u64) -> Self {
        self.metadata.ctx_size = size;
        self
    }

    pub fn with_batch_size(mut self, size: u64) -> Self {
        self.metadata.batch_size = size;
        self
    }

    pub fn with_ubatch_size(mut self, size: u64) -> Self {
        self.metadata.ubatch_size = size;
        self
    }

    pub fn with_temperature(mut self, temp: f64) -> Self {
        self.metadata.temperature = temp;
        self
    }

    pub fn with_top_p(mut self, top_p: f64) -> Self {
        self.metadata.top_p = top_p;
        self
    }

    pub fn with_repeat_penalty(mut self, penalty: f64) -> Self {
        self.metadata.repeat_penalty = penalty;
        self
    }

    pub fn with_presence_penalty(mut self, penalty: f64) -> Self {
        self.metadata.presence_penalty = penalty;
        self
    }

    pub fn with_frequency_penalty(mut self, penalty: f64) -> Self {
        self.metadata.frequency_penalty = penalty;
        self
    }

    pub fn with_grammar(mut self, grammar: impl Into<String>) -> Self {
        self.metadata.grammar = grammar.into();
        self
    }

    pub fn with_json_schema(mut self, schema: Option<String>) -> Self {
        self.metadata.json_schema = schema;
        self
    }

    pub fn include_usage(mut self, include: bool) -> Self {
        self.metadata.include_usage = include;
        self
    }

    pub fn build(self) -> GgmlMetadata {
        self.metadata
    }
}

/// Metadata for chat and embeddings models.
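///
/// Serialization targets the backend plugin's option names: front-end-only
/// fields are skipped, and the remaining fields are renamed to kebab-case.
/// A minimal sketch of the resulting JSON (ignored doc test; it assumes
/// `serde_json` is available):
///
/// ```ignore
/// let metadata = GgmlMetadata::default();
/// let json = serde_json::to_string(&metadata).unwrap();
///
/// // Front-end fields such as `model_name` are skipped entirely.
/// assert!(!json.contains("model_name"));
/// // Plugin options appear under their renamed keys with default values.
/// assert!(json.contains("\"ctx-size\":4096"));
/// assert!(json.contains("\"temp\":0.8"));
/// ```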
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct GgmlMetadata {
    // this field is not defined for the backend plugin
    #[serde(skip_serializing)]
    pub model_name: String,
    // this field is not defined for the backend plugin
    #[serde(skip_serializing)]
    pub model_alias: String,
    // this field is not defined for the backend plugin
    #[serde(skip_serializing)]
    pub log_prompts: bool,
    // this field is not defined for the backend plugin
    #[serde(skip_serializing)]
    pub prompt_template: PromptTemplateType,

    // * Plugin parameters (used by this plugin):
    #[serde(rename = "enable-log")]
    pub log_enable: bool,
    #[serde(rename = "enable-debug-log")]
    pub debug_log: bool,
    // #[serde(rename = "stream-stdout")]
    // pub stream_stdout: bool,
    #[serde(rename = "embedding")]
    pub embeddings: bool,
    /// Number of tokens to predict, -1 = infinity, -2 = until context filled. Defaults to -1.
    #[serde(rename = "n-predict")]
    pub n_predict: i32,
    /// Halt generation at PROMPT, return control in interactive mode.
    #[serde(skip_serializing_if = "Option::is_none", rename = "reverse-prompt")]
    pub reverse_prompt: Option<String>,
    /// Path to the multimodal projector file for LLaVA.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub mmproj: Option<String>,
    /// Path to the image file for LLaVA.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image: Option<String>,

    // * Model parameters (need to reload the model if updated):
    #[serde(rename = "n-gpu-layers")]
    pub n_gpu_layers: u64,
    /// The main GPU to use. Defaults to `None`.
    #[serde(rename = "main-gpu")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub main_gpu: Option<u64>,
    /// How split tensors should be distributed across GPUs. If `None`, the model is not split; otherwise, a comma-separated list of non-negative values, e.g., "3,2" assigns 60% of the data to GPU 0 and 40% to GPU 1. Defaults to `None`.
    #[serde(rename = "tensor-split")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tensor_split: Option<String>,
    /// Whether to use memory-mapped files for the model. Defaults to `true`.
    #[serde(skip_serializing_if = "Option::is_none", rename = "use-mmap")]
    pub use_mmap: Option<bool>,
    /// How to split the model across multiple GPUs. Possible values:
    /// - `none`: use one GPU only
    /// - `layer`: split layers and KV across GPUs (default)
    /// - `row`: split rows across GPUs
    #[serde(rename = "split-mode")]
    pub split_mode: String,

    // * Context parameters (used by the llama context):
    /// Size of the prompt context. A value of 0 means the size is loaded from the model. Defaults to 4096.
    #[serde(rename = "ctx-size")]
    pub ctx_size: u64,
    /// Logical maximum batch size. Defaults to 2048.
    #[serde(rename = "batch-size")]
    pub batch_size: u64,
    /// Physical maximum batch size. Defaults to 512.
    #[serde(rename = "ubatch-size")]
    pub ubatch_size: u64,
    /// Number of threads to use during generation. Defaults to 2.
    #[serde(rename = "threads")]
    pub threads: u64,

    // * Sampling parameters (used by the llama sampling context).
    /// Adjust the randomness of the generated text. Between 0.0 and 2.0. Defaults to 0.8.
    #[serde(rename = "temp")]
    pub temperature: f64,
    /// Top-p sampling. Between 0.0 and 1.0. Defaults to 0.9.
    #[serde(rename = "top-p")]
    pub top_p: f64,
    /// Penalize repeat sequence of tokens. Defaults to 1.0.
    #[serde(rename = "repeat-penalty")]
    pub repeat_penalty: f64,
    /// Repeat alpha presence penalty. Defaults to 0.0.
    #[serde(rename = "presence-penalty")]
    pub presence_penalty: f64,
    /// Repeat alpha frequency penalty. Defaults to 0.0.
    #[serde(rename = "frequency-penalty")]
    pub frequency_penalty: f64,

    // * grammar parameters
    /// BNF-like grammar to constrain generations (see samples in grammars/ dir). Defaults to empty string.
    pub grammar: String,
    /// JSON schema to constrain generations (<https://json-schema.org/>), e.g. `{}` for any JSON object. For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub json_schema: Option<String>,

    /// Whether to include usage in the stream response. Defaults to false.
    pub include_usage: bool,
}
impl Default for GgmlMetadata {
    fn default() -> Self {
        Self {
            model_name: String::new(),
            model_alias: String::new(),
            log_prompts: false,
            debug_log: false,
            prompt_template: PromptTemplateType::Llama2Chat,
            log_enable: false,
            embeddings: false,
            n_predict: -1,
            reverse_prompt: None,
            mmproj: None,
            image: None,
            n_gpu_layers: 100,
            main_gpu: None,
            tensor_split: None,
            use_mmap: Some(true),
            split_mode: "layer".to_string(),
            ctx_size: 4096,
            batch_size: 2048,
            ubatch_size: 512,
            threads: 2,
            temperature: 0.8,
            top_p: 0.9,
            repeat_penalty: 1.0,
            presence_penalty: 0.0,
            frequency_penalty: 0.0,
            grammar: String::new(),
            json_schema: None,
            include_usage: false,
        }
    }
}
impl BaseMetadata for GgmlMetadata {
    fn model_name(&self) -> &str {
        &self.model_name
    }

    fn model_alias(&self) -> &str {
        &self.model_alias
    }
}
impl GgmlMetadata {
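    /// Returns the prompt template type used to format chat messages for this model.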
    pub fn prompt_template(&self) -> PromptTemplateType {
        self.prompt_template
    }
}

/// Builder for creating a [`GgmlTtsMetadata`] instance.
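///
/// A minimal usage sketch (the model name, alias, and codec model path are
/// illustrative; the doc test is marked `ignore` because it assumes the
/// public module path `llama_core::metadata::ggml`):
///
/// ```ignore
/// use llama_core::metadata::ggml::GgmlTtsMetadataBuilder;
///
/// let metadata = GgmlTtsMetadataBuilder::new("outetts", "tts", "wavtokenizer.gguf")
///     .enable_tts(true)
///     .with_n_predict(4096)
///     .build();
///
/// assert!(metadata.enable_tts);
/// ```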
#[derive(Debug)]
pub struct GgmlTtsMetadataBuilder {
    metadata: GgmlTtsMetadata,
}
impl GgmlTtsMetadataBuilder {
    pub fn new<S: Into<String>, P: AsRef<Path>>(
        model_name: S,
        model_alias: S,
        codec_model: P,
    ) -> Self {
        let metadata = GgmlTtsMetadata {
            model_name: model_name.into(),
            model_alias: model_alias.into(),
            codec_model: codec_model.as_ref().to_path_buf(),
            ..Default::default()
        };

        Self { metadata }
    }

    pub fn enable_tts(mut self, enable: bool) -> Self {
        self.metadata.enable_tts = enable;
        self
    }

    pub fn with_speaker_file(mut self, speaker_file: Option<PathBuf>) -> Self {
        self.metadata.speaker_file = speaker_file;
        self
    }

    pub fn with_ctx_size(mut self, size: u64) -> Self {
        self.metadata.ctx_size = size;
        self
    }

    pub fn with_batch_size(mut self, size: u64) -> Self {
        self.metadata.batch_size = size;
        self
    }

    pub fn with_ubatch_size(mut self, size: u64) -> Self {
        self.metadata.ubatch_size = size;
        self
    }

    pub fn with_n_predict(mut self, n: i32) -> Self {
        self.metadata.n_predict = n;
        self
    }

    pub fn with_n_gpu_layers(mut self, n: u64) -> Self {
        self.metadata.n_gpu_layers = n;
        self
    }

    pub fn with_temperature(mut self, temp: f64) -> Self {
        self.metadata.temperature = temp;
        self
    }

    pub fn enable_plugin_log(mut self, enable: bool) -> Self {
        self.metadata.log_enable = enable;
        self
    }

    pub fn enable_debug_log(mut self, enable: bool) -> Self {
        self.metadata.debug_log = enable;
        self
    }

    pub fn build(self) -> GgmlTtsMetadata {
        self.metadata
    }
}

/// Metadata for TTS models.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct GgmlTtsMetadata {
    pub model_name: String,
    pub model_alias: String,
    #[serde(rename = "tts")]
    pub enable_tts: bool,
    #[serde(rename = "model-vocoder")]
    pub codec_model: PathBuf,
    #[serde(rename = "tts-speaker-file", skip_serializing_if = "Option::is_none")]
    pub speaker_file: Option<PathBuf>,
    #[serde(rename = "ctx-size")]
    pub ctx_size: u64,
    #[serde(rename = "batch-size")]
    pub batch_size: u64,
    #[serde(rename = "ubatch-size")]
    pub ubatch_size: u64,
    // kebab-case renames to match the plugin option names used by `GgmlMetadata` above
    #[serde(rename = "n-predict")]
    pub n_predict: i32,
    #[serde(rename = "n-gpu-layers")]
    pub n_gpu_layers: u64,
    #[serde(rename = "temp")]
    pub temperature: f64,
    #[serde(rename = "enable-log")]
    pub log_enable: bool,
    #[serde(rename = "enable-debug-log")]
    pub debug_log: bool,
}
impl Default for GgmlTtsMetadata {
    fn default() -> Self {
        Self {
            model_name: "tts".to_string(),
            model_alias: "tts".to_string(),
            enable_tts: false,
            codec_model: PathBuf::new(),
            speaker_file: None,
            ctx_size: 8192,
            batch_size: 8192,
            ubatch_size: 8192,
            n_predict: 4096,
            n_gpu_layers: 100,
            temperature: 0.8,
            log_enable: false,
            debug_log: false,
        }
    }
}
impl BaseMetadata for GgmlTtsMetadata {
    fn model_name(&self) -> &str {
        &self.model_name
    }

    fn model_alias(&self) -> &str {
        &self.model_alias
    }
}
impl GgmlTtsMetadata {
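    /// Returns the prompt template type for TTS models; this is always [`PromptTemplateType::Tts`].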
    pub fn prompt_template(&self) -> PromptTemplateType {
        PromptTemplateType::Tts
    }
}