Run 02 - Unified stem frequency table (all IDs tokenized)
Source code: model_glossary_run_02_stems.py
Raw output: model_glossary_run_02_output.txt
Code
import json, re
from collections import Counter
DATA = ".../data"


def _load_json(name):
    """Parse ``{DATA}/{name}`` as UTF-8 JSON and close the file afterwards."""
    with open(f"{DATA}/{name}", encoding="utf-8") as fh:
        return json.load(fh)


# Every identifier string harvested from the three catalogue files, in the
# order encountered. Duplicates are kept on purpose: the later counters
# measure how often each stem occurs across all identifiers.
strings = []

# Ollama catalogue: collect the 'full_tag' of every entry found under each
# model's 'tags' and 'detailed_tags' lists.
oll = _load_json("ollama_models.json")['models']
for mdl in oll:
    for key in ('tags', 'detailed_tags'):
        # `or []` also covers a key that is present but null.
        for t in mdl.get(key) or []:
            if t.get('full_tag'):
                strings.append(t['full_tag'])

# OpenRouter catalogue: each model may contribute up to three identifiers.
orr = _load_json("openrouter_models.json")['data']
for m in orr:
    for k in ('id', 'canonical_slug', 'hugging_face_id'):
        if m.get(k):
            strings.append(m[k])

# Artificial Analysis benchmark data: one slug per entry, when non-empty.
aa = _load_json("artificialanalysis_benchmark_data.json")['data']
for m in aa:
    if m.get('slug'):
        strings.append(m['slug'])
# drop CSS scraping-artifact tags (contain ':' style props baked into full_tag)
# NOTE(review): CSS is not referenced by is_css() or by any other line of the
# visible script; it is kept unchanged in case something else reads it.
CSS=('padding','margin','font','border','width','px','weight','collapse','align','center','left','bottom','top','size','100','family','apple','system','mailto')

# Substrings that is_css() actually screens for (compared case-insensitively).
_CSS_MARKERS = ('padding', 'margin', 'font-', 'border-', '-width', 'px;', 'weight:', 'collapse')
# A pixel length such as "12px" anywhere in the string.
_CSS_PX = re.compile(r'\d+px')


def is_css(s):
    """Return True when *s* looks like an inline-CSS fragment, else False.

    A string is flagged when, ignoring case, it contains one of the
    ``_CSS_MARKERS`` substrings or a ``<digits>px`` length. The result is
    always a real ``bool`` (never a match object or ``None``).

    NOTE(review): the recorded run output still lists tokens such as
    'align', 'center', 'left', 'width' and 'mailto', so some artifacts
    presumably pass this filter -- confirm against the raw strings before
    widening the markers.
    """
    lowered = s.lower()  # lower-case once; both checks below reuse it
    if any(marker in lowered for marker in _CSS_MARKERS):
        return True
    return _CSS_PX.search(lowered) is not None
# Identifiers that survive the CSS-artifact filter.
clean = [ident for ident in strings if not is_css(ident)]

# Separator runs: '/', ':', '-', '_', '.' and whitespace.
_SPLIT_RE = re.compile(r'[\/:\-_\.\s]+')
# Quantisation / precision / format / architecture markers matched as whole
# words, case-insensitively.
_SPECIAL_RE = re.compile(r'(?i)\b(?:Q\d+_?K?(?:_[SMLX]+)?|Q\d+_\d+|FP\d+|INT\d+|NVFP\d+|MXFP\d+|BF\d+|F\d+|MLX|GGUF|GPTQ|AWQ|QAT|MTP|MOE|A\d+B|\d+x\d+b)\b')

# Frequency of every non-empty, lower-cased piece obtained by splitting each
# identifier on the separators above.
tok = Counter(
    piece.lower()
    for ident in clean
    for piece in _SPLIT_RE.split(ident)
    if piece
)

# Frequency of the special markers, upper-cased, found in the unsplit
# identifiers (so multi-part names such as Q4_K_M stay in one piece).
special = Counter(
    hit.upper()
    for ident in clean
    for hit in _SPECIAL_RE.findall(ident)
)
# --- Report -----------------------------------------------------------------
# Header: how many identifiers were collected and how many passed the filter.
print(f"total identifier strings: {len(strings)} (after CSS filter: {len(clean)})")
print(f"distinct tokens: {len(tok)}")
print()

# Tokens seen at least twice, most frequent first.
print("== TOKENS BY FREQUENCY (count>=2) ==")
for t, c in tok.most_common():
    if c < 2:
        # most_common() yields counts in descending order, so nothing after
        # the first sub-2 count can qualify.
        break
    print(f"{c:6d} {t}")
print()

# Tokens seen exactly once, alphabetically, on a single comma-separated line.
print("== SINGLETON TOKENS (count==1) ==")
singletons = sorted(t for t, c in tok.items() if c == 1)
print(", ".join(singletons))
print()

# Special markers, most frequent first.
print("== SPECIAL / ARCHITECTURAL TOKENS ==")
for t, c in special.most_common():
    print(f"{c:6d} {t}")
Output
total identifier strings: 9548 (after CSS filter: 9526)
distinct tokens: 719
== TOKENS BY FREQUENCY (count>=2) ==
3626 k
2102 1
1867 7b
1735 q4
1731 instruct
1573 0
1552 q5
1468 5
1317 m
1174 s
1153 q3
788 v1
746 chat
740 8b
716 v2
622 text
613 2
612 13b
566 q8
560 qwen
541 mistral
538 qwen2
509 fp16
508 70b
503 coder
502 llama3
495 3
488 qwen3
461 deepseek
446 latest
426 base
390 llama2
389 q2
389 v0
387 l
384 q6
378 3b
355 code
303 6
294 5b
272 mini
271 dolphin
263 yi
258 34b
257 codellama
247 llama
228 4
223 2b
217 vicuna
207 32b
200 14b
198 granite
196 v3
187 gpt
186 9b
185 1b
182 granite3
174 pro
174 wizardlm
171 mixtral
168 72b
150 gemma
146 30b
143 orca
142 6b
141 4b
140 wizard
130 python
129 openai
126 reasoning
124 math
123 nous
120 15b
119 a3b
119 8x7b
118 nemotron
116 27b
114 llava
112 vl
111 uncensored
107 yarn
105 flash
105 starcoder
104 starcoder2
103 phi3
101 preview
100 128k
100 granite4
99 gemma2
98 smollm
97 it
96 12b
93 thinking
92 35b
91 command
91 stable
89 codegemma
87 20b
87 hermes
87 stablelm2
86 cloud
83 xwinlm
81 7
80 bf16
79 zephyr
78 solar
76 r1
76 120b
76 33b
75 lite
75 gemini
74 large
74 moe
74 lm
73 aya
72 openhermes
72 dense
72 claude
70 hermes3
70 internlm2
69 glm
69 small
69 wizardcoder
68 medium
68 llm
67 plus
66 8x22b
65 9
63 r
62 405b
61 google
60 gemma4
60 24b
57 2024
56 236b
56 hermes2
55 minimax
54 smollm2
54 v4
53 openchat
53 beluga
53 shieldgemma
52 neural
52 ai
51 sqlcoder
51 10
51 samantha
51 phind
51 nano
50 minicpm
50 22b
50 alpha
50 16k
49 235b
49 135m
49 360m
49 16b
49 nvidia
49 mistralai
47 phi
47 64k
45 olmo
43 falcon
43 meta
42 gemma3
41 mlx
41 08
40 123b
40 2025
40 non
39 a22b
39 k2
39 chinese
38 tinyllama
38 kimi
38 cogito
38 8
38 chatqa
38 dolphincoder
38 gradient
38 starling
37 2507
36 671b
36 embed
36 oss
36 next
36 guard3
36 groq
36 tool
36 use
36 expanse
36 orca2
36 reader
35 ministral
34 110b
34 2407
34 beta
34 glm4
34 nexusraven
33 phi4
33 lfm2
33 devstral
33 goliath
32 distill
32 4k
32 dpo
32 67b
32 codeqwen
32 anthropic
32 opus
31 23
31 1048k
30 guardian
29 2512
28 all
28 arctic
28 grok
27 snowflake
26 11b
26 m2
26 wizardlm2
26 think
25 embedding
25 exaone
25 meditron
24 vision
24 super
24 h
24 40b
24 deep
24 free
24 nova
24 4o
23 e4b
22 31b
22 nemo
22 falcon3
22 5vl
22 80b
21 codestral
21 104b
21 codeup
21 megadolphin
20 moondream
20 tinydolphin
20 everythinglm
20 magicoder
20 notus
20 notux
20 duckdb
20 nsql
20 open
20 sonnet
20 z
20 turbo
20 seed
19 26b
19 e2b
19 mxfp8
19 nvfp4
19 v
19 xs
19 bakllava
19 codegeex4
19 openorca
19 reflection
19 athene
19 medllama2
19 mathstral
19 falcon2
19 stablelm
19 nuextract
19 bespoke
19 minicheck
19 mistrallite
19 firefunction
19 platypus2
19 laguna
18 256k
18 openthinker
18 codebooga
18 moonshotai
18 mimo
17 translategemma
17 1m
17 exaone3
17 sailor2
17 max
17 codex
16 qat
16 llama4
16 laser
16 200k
16 2411
16 0106
16 1210
16 cl
16 r7b
16 aion
15 2409
15 8k
15 o1
15 04
15 nousresearch
14 small3
14 sonar
14 o3
13 coding
13 minilm
13 qwq
13 1776
13 tulu3
13 fast
13 07
12 a4b
12 480b
12 olmo2
12 gemma3n
12 deepcoder
12 opencoder
12 rnj
12 a
12 03
12 medgemma
12 openrouter
12 omni
12 haiku
12 image
12 org
12 thedrummer
12 sao10k
12 l3
12 low
11 270m
11 m3
11 ocr
11 a12b
11 magistral
11 micro
11 ultra
11 bytedance
10 bge
10 17b
10 dbrx
10 1t
10 step
10 arcee
10 20
10 amazon
10 perplexity
10 06
10 jamba
9 0528
9 122b
9 335m
9 scout
9 maverick
9 alfred
9 safeguard
9 x
9 ling
9 zai
9 microsoft
9 28
8 nomic
8 a35b
8 2506
8 350m
8 terminus
8 02
8 ~anthropic
8 reka
8 labs
8 relace
8 search
8 morph
8 05
8 high
8 cohere
8 inflection
7 mtp
7 397b
7 a10b
7 dolphin3
7 embeddinggemma
7 deepscaler
7 a1b
7 141b
7 12
7 smallthinker
7 marco
7 arabic
7 medgemma1
7 550b
7 a55b
7 trinity
7 20260224
7 lfm
7 o4
7 49b
7 25
6 mxbai
6 90b
6 1124
6 33m
6 22m
6 2501
6 10b
6 180b
6 278m
6 132b
6 functiongemma
6 cascade
6 nex
6 minimaxai
6 stepfun
6 inclusionai
6 ibm
6 tencent
6 xiaomi
6 rekaai
6 liquid
6 intellect
6 research
6 09
6 5v
6 3n
6 01
6 euryale
6 l2
5 137m
5 128e
5 16e
5 300m
5 a2b
5 paraphrase
5 multilingual
5 2503
5 nemotron3
5 embed2
5 1023
5 111b
5 128b
5 fable
5 ring
5 poolside
5 20260423
5 hy3
5 2603
5 a17b
5 exp
5 2509
5 ernie
4 2505
4 30m
4 perceptron
4 20260421
4 ~openai
4 ~google
4 20260420
4 lyria
4 kat
4 audio
4 6v
4 20251113
4 0905
4 air
4 a47b
4 m1
4 0324
4 11
4 36b
4 minimal
4 35
3 567m
3 l12
3 l6
3 110m
3 tiny
3 568m
3 en
3 agi
3 n2
3 content
3 safety
3 20260604
3 20260528
3 20260422
3 xiaomimimo
3 20260309
3 edge
3 fp8
3 mercury
3 liquidai
3 20251211
3 essentialai
3 allenai
3 premier
3 voxtral
3 26
3 cydonia
3 ai21
3 ui
3 tars
3 venice
3 hunyuan
3 a13b
3 baidu
3 424b
3 guard
3 16
3 14
3 skyfall
3 saba
3 hanami
3 x1
3 unslopnemo
3 anthracite
3 magnum
3 rocinante
3 lunaris
3 13
3 0613
3 undi95
3 remm
3 slerp
3 gryphe
3 mythomax
3 sarvam
3 chatgpt
2 align
2 int4
2 int8
2 long
2 16x17b
2 128x17b
2 a9b
2 675b
2 f16
2 fusion
2 20260520
2 build
2 20260512
2 mk1
2 20260430
2 owl
2 ~moonshotai
2 pareto
2 20260403
2 20260402
2 multi
2 agent
2 20260330
2 clip
2 kwaipilot
2 20260317
2 20230311
2 20260305
2 inception
2 20260303
2 customtools
2 20260219
2 20260216
2 20260211
2 20260123
2 upstage
2 her
2 writer
2 palmyra
2 x5
2 20260120
2 20250625
2 20251208
2 bodybuilder
2 20251201
2 prime
2 deepcogito
2 apply
2 2508
2 switchpoint
2 router
2 cognitivecomputations
2 edition
2 20250522
2 virtuoso
2 31
2 rp
2 productivity
2 pi
2 18
2 auto
2 mancer
2 weaver
2 instant
2 adaptive
2 deephermes
2 tri
2 21b
2 apertus
2 minicpm5
2 jt
2 apriel
2 thinker
2 realtime
2 dec
2 experimental
2 may
2 0925
2 0309
== SINGLETON TOKENS (count==1) ==
001, 0120, 0121, 0127, 0202, 0206, 0327, 0424, 0520, 100, 100b, 105b, 119b, 1212, 1219, 15, 17, 20250428, 20250805, 20250929, 20251001, 20251106, 20251118, 20251120, 20251121, 20251124, 20251126, 20251204, 20251210, 20251217, 20251222, 20260114, 20260119, 20260205, 20260217, 20260223, 20260226, 20260304, 20260310, 20260312, 20260315, 20260318, 20260327, 20260406, 20260407, 20260415, 20260416, 20260428, 20260429, 20260505, 20260507, 20260508, 20260519, 20260531, 20260602, 20260609, 20260612, 21, 24, 2402, 250, 2502, 253b, 300b, 40k, 48b, 65b, 80k, ai21labs, c4ai, center, cohereforai, d, dec28, deepseekr1, dm, doubao, effort, fuseai, fuseo1, global, h1r, hyperclova, june, left, linear, longcat, mailto, mi, midm, molmo, molmo2, motif, mpnet, multimodal, muse, nanbeige4, north, palm, pixtral, primeintellect, pt, qwen1, rsnsft, security, sep, skyt1, spark, speciale, width
== SPECIAL / ARCHITECTURAL TOKENS ==
565 Q8_0
544 Q4_K_M
509 FP16
410 Q4_0
394 Q4_1
394 Q5_0
392 Q5_1
389 Q2_K
387 Q4_K_S
385 Q3_K_S
384 Q3_K_M
384 Q3_K_L
384 Q5_K_S
384 Q6_K
382 Q5_K_M
119 A3B
119 8X7B
80 BF16
74 MOE
66 8X22B
41 MLX
39 A22B
19 MXFP8
19 NVFP4
16 QAT
12 A4B
11 A12B
8 A35B
7 MTP
7 A10B
7 A1B
7 A55B
5 A2B
5 A17B
4 A47B
3 FP8
3 A13B
2 INT4
2 INT8
2 16X17B
2 128X17B
2 A9B
2 F16
1 Q8
Discussion