← All posts

Model glossary run 02 - Unified stem frequency table

Run 02 - Unified stem frequency table (all IDs tokenized)

Source code: model_glossary_run_02_stems.py
Raw output: model_glossary_run_02_output.txt

Code

import json, re
from collections import Counter
# Directory that holds the three catalogue dumps read below.
DATA=".../data"


def _load_json(filename):
    """Return the parsed JSON document stored at ``{DATA}/{filename}``.

    The file is opened explicitly as UTF-8 and closed as soon as parsing
    finishes, instead of leaving the handle open until garbage collection
    and depending on the platform's default text encoding.
    """
    with open(f"{DATA}/{filename}", encoding="utf-8") as fh:
        return json.load(fh)


# Every identifier string gathered from the three sources, in load order.
strings=[]

# Ollama dump: the non-empty 'full_tag' of each entry found under a model's
# 'tags' and 'detailed_tags' keys.
oll=_load_json("ollama_models.json")['models']
for mdl in oll:
    for key in ('tags','detailed_tags'):
        # `or []` also covers a key that is present but null.
        for t in mdl.get(key) or []:
            if t.get('full_tag'):
                strings.append(t['full_tag'])

# OpenRouter dump: up to three identifier fields per model.
orr=_load_json("openrouter_models.json")['data']
for m in orr:
    for k in ('id','canonical_slug','hugging_face_id'):
        if m.get(k):
            strings.append(m[k])

# Artificial Analysis dump: one 'slug' per entry.
aa=_load_json("artificialanalysis_benchmark_data.json")['data']
for m in aa:
    if m.get('slug'):
        strings.append(m['slug'])

# drop CSS scraping-artifact tags (contain ':' style props baked into full_tag)

# Broad vocabulary of CSS-looking words. NOTE(review): nothing in this script
# reads CSS -- is_css() deliberately matches the narrower marker list below.
# Substring-matching these words directly would presumably also hit real model
# identifiers (e.g. '100', 'top', 'system'), so it is kept only for reference.
CSS=('padding','margin','font','border','width','px','weight','collapse','align','center','left','bottom','top','size','100','family','apple','system','mailto')

# Substrings that mark an identifier as CSS residue; compared against the
# lower-cased string.
_CSS_MARKERS = ('padding','margin','font-','border-','-width','px;','weight:','collapse')
# A pixel length such as "12px"; applied to the string as-is (case-sensitive).
_PX_LENGTH = re.compile(r'\d+px')


def is_css(s):
    """Return True if ``s`` looks like a CSS fragment rather than a model ID.

    A string is flagged when its lower-cased form contains any of
    ``_CSS_MARKERS`` or when it contains digits immediately followed by
    ``px``. The result is always a real ``bool`` (never a match object or
    ``None``), so callers can store or compare it safely.

    NOTE(review): this run's output still lists tokens such as 'center',
    'left', 'mailto' and 'width', so some artifacts presumably get past
    these markers -- confirm against the raw strings before widening them.
    """
    lowered = s.lower()
    if any(marker in lowered for marker in _CSS_MARKERS):
        return True
    return _PX_LENGTH.search(s) is not None
# Identifiers that survive the CSS-artifact filter, original order preserved.
clean=[ident for ident in strings if not is_css(ident)]

# Frequency of every lower-cased fragment produced by splitting each surviving
# identifier on runs of '/', ':', '-', '_', '.' or whitespace. Empty fragments
# (from a leading or trailing separator) are skipped.
tok=Counter(
    piece.lower()
    for ident in clean
    for piece in re.split(r'[\/:\-_\.\s]+', ident)
    if piece
)

# Second pass over the surviving identifiers: count whole-word,
# case-insensitive matches of the pattern below, keyed by the upper-cased
# match. The pattern covers Q<digits> names with optional _K / _S/_M/_L/_X
# suffixes or a _<digits> suffix, FP/INT/NVFP/MXFP/BF/F followed by digits,
# the literals MLX, GGUF, GPTQ, AWQ, QAT, MTP and MOE, A<digits>B, and
# <digits>x<digits>b.
special=Counter(
    hit.upper()
    for ident in clean
    for hit in re.findall(r'(?i)\b(?:Q\d+_?K?(?:_[SMLX]+)?|Q\d+_\d+|FP\d+|INT\d+|NVFP\d+|MXFP\d+|BF\d+|F\d+|MLX|GGUF|GPTQ|AWQ|QAT|MTP|MOE|A\d+B|\d+x\d+b)\b', ident)
)

def _print_counts(counter, floor=1):
    """Print ``count  token`` rows, most frequent first, for counts >= floor."""
    for t, c in counter.most_common():
        if c < floor:
            continue
        print(f"{c:6d}  {t}")


# Header: how many identifiers were collected and how many passed the filter.
print(f"total identifier strings: {len(strings)}  (after CSS filter: {len(clean)})")
print(f"distinct tokens: {len(tok)}")
print()

# Section 1: tokens seen at least twice, one per row.
print("== TOKENS BY FREQUENCY (count>=2) ==")
_print_counts(tok, floor=2)
print()

# Section 2: tokens seen exactly once, alphabetised on a single line.
print("== SINGLETON TOKENS (count==1) ==")
singletons = sorted(t for t, c in tok.items() if c == 1)
print(", ".join(singletons))
print()

# Section 3: every entry of the special-pattern tally.
print("== SPECIAL / ARCHITECTURAL TOKENS ==")
_print_counts(special)

Output

total identifier strings: 9548  (after CSS filter: 9526)
distinct tokens: 719

== TOKENS BY FREQUENCY (count>=2) ==
  3626  k
  2102  1
  1867  7b
  1735  q4
  1731  instruct
  1573  0
  1552  q5
  1468  5
  1317  m
  1174  s
  1153  q3
   788  v1
   746  chat
   740  8b
   716  v2
   622  text
   613  2
   612  13b
   566  q8
   560  qwen
   541  mistral
   538  qwen2
   509  fp16
   508  70b
   503  coder
   502  llama3
   495  3
   488  qwen3
   461  deepseek
   446  latest
   426  base
   390  llama2
   389  q2
   389  v0
   387  l
   384  q6
   378  3b
   355  code
   303  6
   294  5b
   272  mini
   271  dolphin
   263  yi
   258  34b
   257  codellama
   247  llama
   228  4
   223  2b
   217  vicuna
   207  32b
   200  14b
   198  granite
   196  v3
   187  gpt
   186  9b
   185  1b
   182  granite3
   174  pro
   174  wizardlm
   171  mixtral
   168  72b
   150  gemma
   146  30b
   143  orca
   142  6b
   141  4b
   140  wizard
   130  python
   129  openai
   126  reasoning
   124  math
   123  nous
   120  15b
   119  a3b
   119  8x7b
   118  nemotron
   116  27b
   114  llava
   112  vl
   111  uncensored
   107  yarn
   105  flash
   105  starcoder
   104  starcoder2
   103  phi3
   101  preview
   100  128k
   100  granite4
    99  gemma2
    98  smollm
    97  it
    96  12b
    93  thinking
    92  35b
    91  command
    91  stable
    89  codegemma
    87  20b
    87  hermes
    87  stablelm2
    86  cloud
    83  xwinlm
    81  7
    80  bf16
    79  zephyr
    78  solar
    76  r1
    76  120b
    76  33b
    75  lite
    75  gemini
    74  large
    74  moe
    74  lm
    73  aya
    72  openhermes
    72  dense
    72  claude
    70  hermes3
    70  internlm2
    69  glm
    69  small
    69  wizardcoder
    68  medium
    68  llm
    67  plus
    66  8x22b
    65  9
    63  r
    62  405b
    61  google
    60  gemma4
    60  24b
    57  2024
    56  236b
    56  hermes2
    55  minimax
    54  smollm2
    54  v4
    53  openchat
    53  beluga
    53  shieldgemma
    52  neural
    52  ai
    51  sqlcoder
    51  10
    51  samantha
    51  phind
    51  nano
    50  minicpm
    50  22b
    50  alpha
    50  16k
    49  235b
    49  135m
    49  360m
    49  16b
    49  nvidia
    49  mistralai
    47  phi
    47  64k
    45  olmo
    43  falcon
    43  meta
    42  gemma3
    41  mlx
    41  08
    40  123b
    40  2025
    40  non
    39  a22b
    39  k2
    39  chinese
    38  tinyllama
    38  kimi
    38  cogito
    38  8
    38  chatqa
    38  dolphincoder
    38  gradient
    38  starling
    37  2507
    36  671b
    36  embed
    36  oss
    36  next
    36  guard3
    36  groq
    36  tool
    36  use
    36  expanse
    36  orca2
    36  reader
    35  ministral
    34  110b
    34  2407
    34  beta
    34  glm4
    34  nexusraven
    33  phi4
    33  lfm2
    33  devstral
    33  goliath
    32  distill
    32  4k
    32  dpo
    32  67b
    32  codeqwen
    32  anthropic
    32  opus
    31  23
    31  1048k
    30  guardian
    29  2512
    28  all
    28  arctic
    28  grok
    27  snowflake
    26  11b
    26  m2
    26  wizardlm2
    26  think
    25  embedding
    25  exaone
    25  meditron
    24  vision
    24  super
    24  h
    24  40b
    24  deep
    24  free
    24  nova
    24  4o
    23  e4b
    22  31b
    22  nemo
    22  falcon3
    22  5vl
    22  80b
    21  codestral
    21  104b
    21  codeup
    21  megadolphin
    20  moondream
    20  tinydolphin
    20  everythinglm
    20  magicoder
    20  notus
    20  notux
    20  duckdb
    20  nsql
    20  open
    20  sonnet
    20  z
    20  turbo
    20  seed
    19  26b
    19  e2b
    19  mxfp8
    19  nvfp4
    19  v
    19  xs
    19  bakllava
    19  codegeex4
    19  openorca
    19  reflection
    19  athene
    19  medllama2
    19  mathstral
    19  falcon2
    19  stablelm
    19  nuextract
    19  bespoke
    19  minicheck
    19  mistrallite
    19  firefunction
    19  platypus2
    19  laguna
    18  256k
    18  openthinker
    18  codebooga
    18  moonshotai
    18  mimo
    17  translategemma
    17  1m
    17  exaone3
    17  sailor2
    17  max
    17  codex
    16  qat
    16  llama4
    16  laser
    16  200k
    16  2411
    16  0106
    16  1210
    16  cl
    16  r7b
    16  aion
    15  2409
    15  8k
    15  o1
    15  04
    15  nousresearch
    14  small3
    14  sonar
    14  o3
    13  coding
    13  minilm
    13  qwq
    13  1776
    13  tulu3
    13  fast
    13  07
    12  a4b
    12  480b
    12  olmo2
    12  gemma3n
    12  deepcoder
    12  opencoder
    12  rnj
    12  a
    12  03
    12  medgemma
    12  openrouter
    12  omni
    12  haiku
    12  image
    12  org
    12  thedrummer
    12  sao10k
    12  l3
    12  low
    11  270m
    11  m3
    11  ocr
    11  a12b
    11  magistral
    11  micro
    11  ultra
    11  bytedance
    10  bge
    10  17b
    10  dbrx
    10  1t
    10  step
    10  arcee
    10  20
    10  amazon
    10  perplexity
    10  06
    10  jamba
     9  0528
     9  122b
     9  335m
     9  scout
     9  maverick
     9  alfred
     9  safeguard
     9  x
     9  ling
     9  zai
     9  microsoft
     9  28
     8  nomic
     8  a35b
     8  2506
     8  350m
     8  terminus
     8  02
     8  ~anthropic
     8  reka
     8  labs
     8  relace
     8  search
     8  morph
     8  05
     8  high
     8  cohere
     8  inflection
     7  mtp
     7  397b
     7  a10b
     7  dolphin3
     7  embeddinggemma
     7  deepscaler
     7  a1b
     7  141b
     7  12
     7  smallthinker
     7  marco
     7  arabic
     7  medgemma1
     7  550b
     7  a55b
     7  trinity
     7  20260224
     7  lfm
     7  o4
     7  49b
     7  25
     6  mxbai
     6  90b
     6  1124
     6  33m
     6  22m
     6  2501
     6  10b
     6  180b
     6  278m
     6  132b
     6  functiongemma
     6  cascade
     6  nex
     6  minimaxai
     6  stepfun
     6  inclusionai
     6  ibm
     6  tencent
     6  xiaomi
     6  rekaai
     6  liquid
     6  intellect
     6  research
     6  09
     6  5v
     6  3n
     6  01
     6  euryale
     6  l2
     5  137m
     5  128e
     5  16e
     5  300m
     5  a2b
     5  paraphrase
     5  multilingual
     5  2503
     5  nemotron3
     5  embed2
     5  1023
     5  111b
     5  128b
     5  fable
     5  ring
     5  poolside
     5  20260423
     5  hy3
     5  2603
     5  a17b
     5  exp
     5  2509
     5  ernie
     4  2505
     4  30m
     4  perceptron
     4  20260421
     4  ~openai
     4  ~google
     4  20260420
     4  lyria
     4  kat
     4  audio
     4  6v
     4  20251113
     4  0905
     4  air
     4  a47b
     4  m1
     4  0324
     4  11
     4  36b
     4  minimal
     4  35
     3  567m
     3  l12
     3  l6
     3  110m
     3  tiny
     3  568m
     3  en
     3  agi
     3  n2
     3  content
     3  safety
     3  20260604
     3  20260528
     3  20260422
     3  xiaomimimo
     3  20260309
     3  edge
     3  fp8
     3  mercury
     3  liquidai
     3  20251211
     3  essentialai
     3  allenai
     3  premier
     3  voxtral
     3  26
     3  cydonia
     3  ai21
     3  ui
     3  tars
     3  venice
     3  hunyuan
     3  a13b
     3  baidu
     3  424b
     3  guard
     3  16
     3  14
     3  skyfall
     3  saba
     3  hanami
     3  x1
     3  unslopnemo
     3  anthracite
     3  magnum
     3  rocinante
     3  lunaris
     3  13
     3  0613
     3  undi95
     3  remm
     3  slerp
     3  gryphe
     3  mythomax
     3  sarvam
     3  chatgpt
     2  align
     2  int4
     2  int8
     2  long
     2  16x17b
     2  128x17b
     2  a9b
     2  675b
     2  f16
     2  fusion
     2  20260520
     2  build
     2  20260512
     2  mk1
     2  20260430
     2  owl
     2  ~moonshotai
     2  pareto
     2  20260403
     2  20260402
     2  multi
     2  agent
     2  20260330
     2  clip
     2  kwaipilot
     2  20260317
     2  20230311
     2  20260305
     2  inception
     2  20260303
     2  customtools
     2  20260219
     2  20260216
     2  20260211
     2  20260123
     2  upstage
     2  her
     2  writer
     2  palmyra
     2  x5
     2  20260120
     2  20250625
     2  20251208
     2  bodybuilder
     2  20251201
     2  prime
     2  deepcogito
     2  apply
     2  2508
     2  switchpoint
     2  router
     2  cognitivecomputations
     2  edition
     2  20250522
     2  virtuoso
     2  31
     2  rp
     2  productivity
     2  pi
     2  18
     2  auto
     2  mancer
     2  weaver
     2  instant
     2  adaptive
     2  deephermes
     2  tri
     2  21b
     2  apertus
     2  minicpm5
     2  jt
     2  apriel
     2  thinker
     2  realtime
     2  dec
     2  experimental
     2  may
     2  0925
     2  0309

== SINGLETON TOKENS (count==1) ==
001, 0120, 0121, 0127, 0202, 0206, 0327, 0424, 0520, 100, 100b, 105b, 119b, 1212, 1219, 15, 17, 20250428, 20250805, 20250929, 20251001, 20251106, 20251118, 20251120, 20251121, 20251124, 20251126, 20251204, 20251210, 20251217, 20251222, 20260114, 20260119, 20260205, 20260217, 20260223, 20260226, 20260304, 20260310, 20260312, 20260315, 20260318, 20260327, 20260406, 20260407, 20260415, 20260416, 20260428, 20260429, 20260505, 20260507, 20260508, 20260519, 20260531, 20260602, 20260609, 20260612, 21, 24, 2402, 250, 2502, 253b, 300b, 40k, 48b, 65b, 80k, ai21labs, c4ai, center, cohereforai, d, dec28, deepseekr1, dm, doubao, effort, fuseai, fuseo1, global, h1r, hyperclova, june, left, linear, longcat, mailto, mi, midm, molmo, molmo2, motif, mpnet, multimodal, muse, nanbeige4, north, palm, pixtral, primeintellect, pt, qwen1, rsnsft, security, sep, skyt1, spark, speciale, width

== SPECIAL / ARCHITECTURAL TOKENS ==
   565  Q8_0
   544  Q4_K_M
   509  FP16
   410  Q4_0
   394  Q4_1
   394  Q5_0
   392  Q5_1
   389  Q2_K
   387  Q4_K_S
   385  Q3_K_S
   384  Q3_K_M
   384  Q3_K_L
   384  Q5_K_S
   384  Q6_K
   382  Q5_K_M
   119  A3B
   119  8X7B
    80  BF16
    74  MOE
    66  8X22B
    41  MLX
    39  A22B
    19  MXFP8
    19  NVFP4
    16  QAT
    12  A4B
    11  A12B
     8  A35B
     7  MTP
     7  A10B
     7  A1B
     7  A55B
     5  A2B
     5  A17B
     4  A47B
     3  FP8
     3  A13B
     2  INT4
     2  INT8
     2  16X17B
     2  128X17B
     2  A9B
     2  F16
     1  Q8

Discussion

← All posts