Ashx098 committed on
Commit a433a25 · verified · 1 Parent(s): 718421a

Upload folder using huggingface_hub
Tokenizer/BPE/special_tokens_map.json ADDED
@@ -0,0 +1,53 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "<user>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<assistant>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<system>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
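A quick way to confirm these special-token definitions take effect after loading: the sketch below (assuming the files above are loadable from `Tokenizer/BPE` at the repo root) checks that each chat-role marker resolves to a single known ID rather than `<unk>`.

```python
from transformers import AutoTokenizer

# Assumption: run from the repo root, with the files above under Tokenizer/BPE.
tok = AutoTokenizer.from_pretrained("Tokenizer/BPE")

# Each role marker should map to one dedicated ID, never to <unk>.
for marker in ["<user>", "<assistant>", "<system>"]:
    tid = tok.convert_tokens_to_ids(marker)
    assert tid != tok.unk_token_id, f"{marker} was not registered as a special token"
    print(marker, "->", tid)
```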
Tokenizer/BPE/spm.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:debb3ad91c0745bb304129ee9a0332a33c2bb1fffe7313d573608e5014ab69bb
+ size 747364
Tokenizer/BPE/spm.vocab ADDED
The diff for this file is too large to render. See raw diff
 
Tokenizer/BPE/test_tokenizer.py ADDED
@@ -0,0 +1,51 @@
+ from transformers import AutoTokenizer
+ tok = AutoTokenizer.from_pretrained("./Tokenizer/BPE")
+
+
+ text1 = "Hello world! <user> write code </s>"
+ text2 = "myHTTPRequestHandler is calling process_payment_v2"
+ text3 = "methylphenidate hydrochloride dopamine reuptake modulation"
+ text4 = "hello 🔥🔥🔥💀💀"
+ text5 = "https://github.com/Avinash-MiniLLM?tab=repos"
+
+
+ print(text1)
+ print(text2)
+ print(text3)
+ print(text4)
+ print(text5)
+
+ print(tok.tokenize(text1))
+ print(tok.tokenize(text2))
+ print(tok.tokenize(text3))
+ print(tok.tokenize(text4))
+ print(tok.tokenize(text5))
+
+
+ ids1 = tok.encode(text1)
+ ids2 = tok.encode(text2)
+ ids3 = tok.encode(text3)
+ ids4 = tok.encode(text4)
+ ids5 = tok.encode(text5)
+
+ print(ids1)
+ print(tok.decode(ids1))
+ print(tok.decode(ids1, skip_special_tokens=True))
+
+ print(ids2)
+ print(tok.decode(ids2))
+ print(tok.decode(ids2, skip_special_tokens=True))
+
+ print(ids3)
+ print(tok.decode(ids3))
+ print(tok.decode(ids3, skip_special_tokens=True))
+
+ ids4 = tok.encode(text4)
+ print(ids4)
+ print(tok.decode(ids4))
+ print(tok.decode(ids4, skip_special_tokens=True))
+
+ ids5 = tok.encode(text5)
+ print(ids5)
+ print(tok.decode(ids5))
+ print(tok.decode(ids5, skip_special_tokens=True))
Tokenizer/BPE/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Tokenizer/BPE/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:debb3ad91c0745bb304129ee9a0332a33c2bb1fffe7313d573608e5014ab69bb
+ size 747364
Tokenizer/BPE/tokenizer_config.json ADDED
@@ -0,0 +1,80 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": null,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<user>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "<assistant>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "<system>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<user>",
+     "<assistant>",
+     "<system>"
+   ],
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "extra_special_tokens": {},
+   "legacy": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
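The `added_tokens_decoder` block above pins IDs 0–6 to the special tokens. A minimal sketch to verify that layout after loading (paths assumed relative to the repo root):

```python
from transformers import AutoTokenizer

# Assumption: Tokenizer/BPE contains the config shown above.
tok = AutoTokenizer.from_pretrained("Tokenizer/BPE")

# IDs 0-6 are fixed by added_tokens_decoder in tokenizer_config.json.
expected = ["<unk>", "<s>", "</s>", "<pad>", "<user>", "<assistant>", "<system>"]
assert tok.convert_ids_to_tokens(list(range(7))) == expected
print("Special-token ID layout:", expected)
```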
Tokenizer/README.md ADDED
@@ -0,0 +1,160 @@
+ # Tokenizer Module
+
+ This module handles all tokenization tasks for the Mini-LLM project, converting raw text into numerical tokens that the model can process.
+
+ ## Overview
+
+ The tokenizer uses **SentencePiece** with **Byte Pair Encoding (BPE)** to create a 32,000 token vocabulary. BPE is the same algorithm used by GPT-3, GPT-4, and LLaMA models.
+
+ ## Directory Structure
+
+ ```
+ Tokenizer/
+ ├── BPE/                     # BPE tokenizer artifacts
+ │   ├── spm.model            # Trained SentencePiece model
+ │   ├── spm.vocab            # Vocabulary file
+ │   ├── tokenizer.json       # HuggingFace format
+ │   ├── tokenizer_config.json
+ │   └── special_tokens_map.json
+ ├── Unigram/                 # Unigram tokenizer (baseline)
+ │   └── ...
+ ├── train_spm_bpe.py         # Train BPE tokenizer
+ ├── train_spm_unigram.py     # Train Unigram tokenizer
+ └── convert_to_hf.py         # Convert to HuggingFace format
+ ```
+
+ ## How It Works
+
+ ### 1. Training the Tokenizer
+
+ **Script**: `train_spm_bpe.py`
+
+ ```python
+ import sentencepiece as spm
+
+ spm.SentencePieceTrainer.Train(
+     input="data/raw/merged_text/corpus.txt",
+     model_prefix="Tokenizer/BPE/spm",
+     vocab_size=32000,
+     model_type="bpe",
+     byte_fallback=True,        # Handles emojis, special chars
+     character_coverage=1.0,
+     user_defined_symbols=["<user>", "<assistant>", "<system>"]
+ )
+ ```
+
+ **What happens:**
+ 1. Reads raw text corpus
+ 2. Learns byte-pair merges (e.g., "th" + "e" → "the")
+ 3. Builds 32,000 most frequent tokens
+ 4. Saves model to `spm.model`
+
+ ### 2. Example: Tokenization Process
+
+ **Input Text:**
+ ```
+ "Hello world! <user> write code </s>"
+ ```
+
+ **Tokenization Steps:**
+
+ ```
+ ┌─────────────────────────────────────────┐
+ │ 1. Text Input                           │
+ │ "Hello world! <user> write code"        │
+ └─────────────────────────────────────────┘
+
+ ┌─────────────────────────────────────────┐
+ │ 2. BPE Segmentation                     │
+ │ ['H', 'ello', '▁world', '!',            │
+ │  '▁', '<user>', '▁write', '▁code']      │
+ └─────────────────────────────────────────┘
+
+ ┌─────────────────────────────────────────┐
+ │ 3. Token IDs                            │
+ │ [334, 3855, 288, 267, 2959,             │
+ │  354, 267, 12397]                       │
+ └─────────────────────────────────────────┘
+ ```
+
+ **Key Features:**
+ - `▁` represents space (SentencePiece convention)
+ - Special tokens like `<user>` are preserved
+ - Byte fallback handles emojis: 🔥 → `<0xF0><0x9F><0x94><0xA5>`
+
+ ### 3. Converting to HuggingFace Format
+
+ **Script**: `convert_to_hf.py`
+
+ ```python
+ from transformers import LlamaTokenizerFast
+
+ tokenizer = LlamaTokenizerFast(vocab_file="Tokenizer/BPE/spm.model")
+ tokenizer.add_special_tokens({
+     'bos_token': '<s>',
+     'eos_token': '</s>',
+     'unk_token': '<unk>',
+     'pad_token': '<pad>'
+ })
+ tokenizer.save_pretrained("Tokenizer/BPE")
+ ```
+
+ This creates `tokenizer.json` and config files compatible with HuggingFace Transformers.
+
+ ## Usage
+
+ ### Load Tokenizer
+
+ ```python
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("Tokenizer/BPE")
+ ```
+
+ ### Encode Text
+
+ ```python
+ text = "Hello world!"
+ ids = tokenizer.encode(text)
+ # Output: [1, 334, 3855, 288, 267, 2]
+ #         [<s>, H, ello, ▁world, !, </s>]
+ ```
+
+ ### Decode IDs
+
+ ```python
+ decoded = tokenizer.decode(ids)
+ # Output: "<s> Hello world! </s>"
+
+ decoded = tokenizer.decode(ids, skip_special_tokens=True)
+ # Output: "Hello world!"
+ ```
+
+ ## BPE vs Unigram
+
+ | Feature | BPE | Unigram |
+ |---------|-----|---------|
+ | **Algorithm** | Merge frequent pairs | Probabilistic segmentation |
+ | **Emoji Handling** | ✅ Byte fallback | ❌ Creates `<unk>` |
+ | **URL Handling** | ✅ Clean splits | ⚠️ Unstable |
+ | **Used By** | GPT-3, GPT-4, LLaMA | BERT, T5 |
+ | **Recommendation** | ✅ **Primary** | Baseline only |
+
+ ## Vocabulary Statistics
+
+ - **Total Tokens**: 32,000
+ - **Special Tokens**: 4 (`<s>`, `</s>`, `<unk>`, `<pad>`)
+ - **User-Defined**: 3 (`<user>`, `<assistant>`, `<system>`)
+ - **Coverage**: 100% (byte fallback ensures no `<unk>`)
+
+ ## Performance
+
+ - **Compression Ratio**: ~3.5 bytes/token (English text)
+ - **Tokenization Speed**: ~1M tokens/second
+ - **Vocab Usage**: ~70% of tokens used in typical corpus
+
+ ## References
+
+ - [SentencePiece Documentation](https://github.com/google/sentencepiece)
+ - [BPE Paper (Sennrich et al., 2016)](https://arxiv.org/abs/1508.07909)
+ - [Tokenizer Comparison Report](../tokenizer_report.md)
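The README documents the chat-role markers but does not fix a chat template, so the snippet below is only an illustrative sketch of how a prompt might be assembled with them; the exact layout is an assumption, not part of this commit.

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Tokenizer/BPE")

# Hypothetical prompt layout using the role markers; the repo defines no chat template.
prompt = "<system> You are a helpful assistant. <user> write code <assistant>"
ids = tok.encode(prompt)  # add_bos_token=True prepends <s> (ID 1)
print(ids)
print(tok.decode(ids, skip_special_tokens=True))
```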
Tokenizer/Unigram/special_tokens_map.json ADDED
@@ -0,0 +1,53 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "<user>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<assistant>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<system>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
Tokenizer/Unigram/spm.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:247ddb0f3561179d04614a59d7eb594da59ad881575a6d3860f859be9b709508
+ size 768238
Tokenizer/Unigram/spm.vocab ADDED
The diff for this file is too large to render. See raw diff
 
Tokenizer/Unigram/test_tokenizer.py ADDED
@@ -0,0 +1,51 @@
+ from transformers import AutoTokenizer
+ tok = AutoTokenizer.from_pretrained("./Tokenizer/Unigram")
+
+
+ text1 = "Hello world! <user> write code </s>"
+ text2 = "myHTTPRequestHandler is calling process_payment_v2"
+ text3 = "methylphenidate hydrochloride dopamine reuptake modulation"
+ text4 = "hello 🔥🔥🔥💀💀"
+ text5 = "https://github.com/Avinash-MiniLLM?tab=repos"
+
+
+ print(text1)
+ print(text2)
+ print(text3)
+ print(text4)
+ print(text5)
+
+ print(tok.tokenize(text1))
+ print(tok.tokenize(text2))
+ print(tok.tokenize(text3))
+ print(tok.tokenize(text4))
+ print(tok.tokenize(text5))
+
+
+ ids1 = tok.encode(text1)
+ ids2 = tok.encode(text2)
+ ids3 = tok.encode(text3)
+ ids4 = tok.encode(text4)
+ ids5 = tok.encode(text5)
+
+ print(ids1)
+ print(tok.decode(ids1))
+ print(tok.decode(ids1, skip_special_tokens=True))
+
+ print(ids2)
+ print(tok.decode(ids2))
+ print(tok.decode(ids2, skip_special_tokens=True))
+
+ print(ids3)
+ print(tok.decode(ids3))
+ print(tok.decode(ids3, skip_special_tokens=True))
+
+ ids4 = tok.encode(text4)
+ print(ids4)
+ print(tok.decode(ids4))
+ print(tok.decode(ids4, skip_special_tokens=True))
+
+ ids5 = tok.encode(text5)
+ print(ids5)
+ print(tok.decode(ids5))
+ print(tok.decode(ids5, skip_special_tokens=True))
Tokenizer/Unigram/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
Tokenizer/Unigram/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:247ddb0f3561179d04614a59d7eb594da59ad881575a6d3860f859be9b709508
+ size 768238
Tokenizer/Unigram/tokenizer_config.json ADDED
@@ -0,0 +1,80 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": null,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<user>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "<assistant>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "<system>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<user>",
+     "<assistant>",
+     "<system>"
+   ],
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "extra_special_tokens": {},
+   "legacy": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
Tokenizer/convert_to_hf.py ADDED
@@ -0,0 +1,18 @@
+ from transformers import LlamaTokenizerFast
+
+ # Load the raw spm model
+ tokenizer = LlamaTokenizerFast(vocab_file="/home/aviinashh/projects/Mini-LLM/Tokenizer/BPE/spm.model")
+
+ # Add your special tokens manually to the HF config part
+ tokenizer.add_special_tokens({
+     "bos_token": "<s>",
+     "eos_token": "</s>",
+     "unk_token": "<unk>",
+     "pad_token": "<pad>",
+     "additional_special_tokens": ["<user>", "<assistant>", "<system>"]
+ })
+
+ # Save the json version
+ tokenizer.save_pretrained("Tokenizer/")
+
+ print("Converted to tokenizer.json successfully!")
Tokenizer/test_tokenizer.py ADDED
@@ -0,0 +1,9 @@
+ from transformers import AutoTokenizer
+ tok = AutoTokenizer.from_pretrained(".")
+ print(tok.tokenize("Hello world! <user> write code </s>"))
+
+ text = "Hello world! <user> write code </s>"
+ ids = tok.encode(text)
+ print(ids)
+ print(tok.decode(ids))
+ print(tok.decode(ids, skip_special_tokens=True))
Tokenizer/train_spm_bpe.py ADDED
@@ -0,0 +1,18 @@
+ import sentencepiece as spm
+
+ spm.SentencePieceTrainer.Train(
+     input="/home/aviinashh/projects/Mini-LLM/data/raw/merged_text/corpus.txt",
+     model_prefix="/home/aviinashh/projects/Mini-LLM/Tokenizer/BPE/spm",
+     vocab_size=32000,
+     model_type="bpe",
+     byte_fallback=True,
+     character_coverage=1.0,
+     unk_id=0,
+     bos_id=1,
+     eos_id=2,
+     pad_id=3,
+     user_defined_symbols=["<user>", "<assistant>", "<system>"],
+ )
+
+ print("Tokenizer trained!")
+ # Model and vocab will be saved as spm.model and spm.vocab in the specified path
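After training, the resulting `spm.model` can be inspected directly with the SentencePiece Python API. A minimal sketch, with the absolute paths above shortened to repo-relative ones (an assumption about where it is run):

```python
import sentencepiece as spm

# Assumption: run from the repo root; the trainer above wrote Tokenizer/BPE/spm.model.
sp = spm.SentencePieceProcessor(model_file="Tokenizer/BPE/spm.model")

print(sp.get_piece_size())                  # expected: 32000
print(sp.piece_to_id("<user>"))             # user-defined symbol gets a fixed ID
print(sp.encode("hello 🔥", out_type=str))  # emoji falls back to byte pieces
```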
Tokenizer/train_spm_unigram.py ADDED
@@ -0,0 +1,17 @@
+ import sentencepiece as spm
+
+ spm.SentencePieceTrainer.Train(
+     input="/home/aviinashh/projects/Mini-LLM/data/raw/merged_text/corpus.txt",
+     model_prefix="/home/aviinashh/projects/Mini-LLM/Tokenizer/spm",
+     vocab_size=32000,
+     model_type="unigram",
+     character_coverage=1.0,
+     unk_id=0,
+     bos_id=1,
+     eos_id=2,
+     pad_id=3,
+     user_defined_symbols=["<user>", "<assistant>", "<system>"],
+ )
+
+ print("Tokenizer trained!")
+ # Model and vocab will be saved as spm.model and spm.vocab in the specified path
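To see the difference the README's comparison table describes, both tokenizers can be run on the same emoji-heavy string. A minimal sketch, assuming both HF-format tokenizers are present under `Tokenizer/BPE` and `Tokenizer/Unigram`:

```python
from transformers import AutoTokenizer

bpe = AutoTokenizer.from_pretrained("Tokenizer/BPE")
uni = AutoTokenizer.from_pretrained("Tokenizer/Unigram")

text = "hello 🔥🔥🔥💀💀"
print("BPE:    ", bpe.tokenize(text))   # byte-fallback pieces like <0xF0><0x9F>...
print("Unigram:", uni.tokenize(text))   # typically degrades to <unk> pieces
```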