# /// script
# requires-python = ">=3.12"
# dependencies = [
# "torch==2.7.1",
# "transformers",
# ]
# ///
import torch
from torch.nn import functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-Base", clean_up_tokenization_spaces=False)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B-Base").to("cpu")
toks = tokenizer("Shakespeare was a", return_tensors="pt").input_ids
toks
tensor([[ 2016, 36871, 572, 264]])
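To sanity-check the split, the IDs can be mapped back to their token strings (convert_ids_to_tokens wants plain ints, hence the tolist()):
# See how the prompt was split into subword tokens.
tokenizer.convert_ids_to_tokens(toks[0].tolist())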
logits = model(toks).logits
dist = torch.softmax(logits, dim=2)
dist
tensor([[[1.7424e-06, 3.4825e-07, 7.9486e-06, ..., 1.2353e-10,
1.2353e-10, 1.2353e-10],
[8.6864e-05, 1.0547e-04, 5.8930e-06, ..., 3.1927e-10,
3.1927e-10, 3.1927e-10],
[8.9280e-06, 1.4695e-05, 3.0290e-07, ..., 8.7199e-11,
8.7199e-11, 8.7199e-11],
[7.2593e-07, 4.1453e-05, 1.0159e-07, ..., 1.3909e-10,
1.3909e-10, 1.3909e-10]]], grad_fn=<SoftmaxBackward0>)
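Each position of dist holds a full distribution over the 151,936-token vocabulary, so each should sum to 1 up to floating-point error. A quick check using only objects already in the session:
# One distribution per input position: (batch, seq_len, vocab_size).
dist.shape          # torch.Size([1, 4, 151936])
dist[0, -1].sum()   # ~1.0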
torch.argmax(dist, dim=2)
tensor([[5059, 594, 9223, 2244]])
tokenizer.decode(5059), tokenizer.decode(594), tokenizer.decode(9223), tokenizer.decode(2244)
('ara', "'s", ' born', ' great')
model
Qwen3ForCausalLM(
(model): Qwen3Model(
(embed_tokens): Embedding(151936, 1024)
(layers): ModuleList(
(0-27): 28 x Qwen3DecoderLayer(
(self_attn): Qwen3Attention(
(q_proj): Linear(in_features=1024, out_features=2048, bias=False)
(k_proj): Linear(in_features=1024, out_features=1024, bias=False)
(v_proj): Linear(in_features=1024, out_features=1024, bias=False)
(o_proj): Linear(in_features=2048, out_features=1024, bias=False)
(q_norm): Qwen3RMSNorm((128,), eps=1e-06)
(k_norm): Qwen3RMSNorm((128,), eps=1e-06)
)
(mlp): Qwen3MLP(
(gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
(up_proj): Linear(in_features=1024, out_features=3072, bias=False)
(down_proj): Linear(in_features=3072, out_features=1024, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
(post_attention_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
)
)
(norm): Qwen3RMSNorm((1024,), eps=1e-06)
(rotary_emb): Qwen3RotaryEmbedding()
)
(lm_head): Linear(in_features=1024, out_features=151936, bias=False)
)
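A quick way to tie this printout back to the model's name is to count parameters; the embedding matrix alone is 151936 × 1024 ≈ 156M of them. A minimal sketch:
# Total parameter count; roughly 0.6B for this checkpoint.
sum(p.numel() for p in model.parameters())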
model.model.embed_tokens(toks)
tensor([[[ 0.0344, 0.0121, -0.0186, ..., -0.0427, -0.0057, 0.0283],
[-0.0222, -0.0203, 0.0359, ..., 0.0311, -0.0052, 0.0056],
[ 0.0115, -0.0002, -0.0344, ..., -0.0059, 0.0221, -0.0106],
[-0.0240, 0.0259, -0.0601, ..., 0.0250, 0.0088, 0.0332]]],
grad_fn=<EmbeddingBackward0>)
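Worth confirming the shape: one 1024-dimensional vector per input token, matching the Embedding(151936, 1024) module in the printout above.
# One hidden-size vector per token: (batch, seq_len, hidden_size).
model.model.embed_tokens(toks).shape  # torch.Size([1, 4, 1024])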
tokenizer(["Shakespeare was a", "Shakespeare was born"], return_tensors="pt").input_ids
tensor([[ 2016, 36871, 572, 264],
[ 2016, 36871, 572, 9223]])
tokenizer(["Shakespeare was a", "Shakespeare was born in"], return_tensors="pt", padding=True).input_ids
tensor([[ 2016, 36871, 572, 264, 151643],
[ 2016, 36871, 572, 9223, 304]])
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-Base", clean_up_tokenization_spaces=False, padding_side="left")
bad_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-Base", clean_up_tokenization_spaces=False, padding_side="right")
tokenizer(["Shakespeare was a", "Shakespeare was born in"], return_tensors="pt", padding=True).input_ids
tensor([[151643, 2016, 36871, 572, 264],
[ 2016, 36871, 572, 9223, 304]])
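One caveat before the batched calls below: once padding is involved, the model should also receive the attention mask so it ignores pad positions; passing input_ids alone happens to work in this session but is fragile. A safer call, as a minimal sketch:
# Hand the model the attention mask along with the padded ids.
batch = tokenizer(["Shakespeare was a", "Shakespeare was born in"],
                  return_tensors="pt", padding=True)
logits = model(input_ids=batch.input_ids,
               attention_mask=batch.attention_mask).logits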
logits = model(tokenizer(["Shakespeare was a", "Shakespeare was born in", "Northeastern"], return_tensors="pt", padding=True).input_ids).logits
dist = F.softmax(logits, dim=2)
torch.argmax(dist, dim=2)[:, -1]
tensor([2244, 220, 3822])
tokenizer.decode(2244), tokenizer.decode(220), tokenizer.decode(3822)
(' great', ' ', ' University')
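Feeding the argmax token back in and repeating gives greedy decoding. A minimal sketch of that loop (greedy_generate is a hypothetical helper, not part of the original session):
def greedy_generate(prompt, n_tokens=10):
    # Repeatedly take the most likely next token and append it.
    ids = tokenizer(prompt, return_tensors="pt").input_ids
    for _ in range(n_tokens):
        next_id = torch.argmax(model(ids).logits[:, -1, :], dim=-1, keepdim=True)
        ids = torch.cat([ids, next_id], dim=1)
    return tokenizer.decode(ids[0])

greedy_generate("Shakespeare was a")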
logits = model(bad_tokenizer(["Shakespeare was a", "Shakespeare was born in", "Northeastern"], return_tensors="pt", padding=True).input_ids).logits
dist = F.softmax(logits, dim=2)
torch.argmax(dist, dim=2)[:, -1]
tensor([33975,   220,     2])
tokenizer.decode(33975), tokenizer.decode(220), tokenizer.decode(2)
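With right padding, the last position of the shorter sequences is a pad token, so the model is asked what follows <|endoftext|> rather than the prompt, and the predictions above go wrong. If right padding is unavoidable, the last real position can be recovered from the attention mask; a sketch:
# Index the last non-pad position per sequence instead of position -1.
batch = bad_tokenizer(["Shakespeare was a", "Shakespeare was born in", "Northeastern"],
                      return_tensors="pt", padding=True)
logits = model(input_ids=batch.input_ids,
               attention_mask=batch.attention_mask).logits
last_real = batch.attention_mask.sum(dim=1) - 1
logits[torch.arange(logits.shape[0]), last_real].argmax(dim=-1)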
A = torch.tensor([[1, -1, 2]])
B = torch.tensor([[0, 3, 1]])
C = torch.tensor([[1, 2, 3],
[4, 5, 6]])
C
tensor([[1, 2, 3],
[4, 5, 6]])
C + A
tensor([[2, 1, 5],
[5, 4, 8]])
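That result is broadcasting at work: C has shape (2, 3) and A has shape (1, 3), so A's single row is stretched across both rows of C before the elementwise add. Two more cases under the same rule, as a sketch:
A + B                           # same shape (1, 3): plain elementwise add -> tensor([[1, 2, 3]])
C + torch.tensor([[10], [20]])  # a (2, 1) column broadcasts across C's columns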
X = torch.tensor([[1, 2], [3, 4]])
X
tensor([[1, 2],
        [3, 4]])
X = torch.randn(2, 2, 2, 4)  # reassigned to a random 4-D tensor
X
tensor([[[[-0.0181, -0.9873, -0.9765, 1.1980],
[ 0.4459, 0.8025, 0.1952, 0.7316]],
[[ 0.0646, -0.3129, -0.2246, -0.4263],
[ 1.7395, 0.3989, -0.1349, 0.9651]]],
[[[ 0.8100, 0.2038, 0.7367, 0.0521],
[-0.6221, -0.0254, -0.4380, 0.5722]],
[[-0.5178, -1.2201, 0.4104, 0.2367],
[-0.4490, -0.3786, -1.2423, -0.0383]]]])
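Indexing a higher-rank tensor like this (2, 2, 2, 4) one peels off leading dimensions one at a time:
X.shape        # torch.Size([2, 2, 2, 4])
X[0].shape     # torch.Size([2, 2, 4])
X[0, 1].shape  # torch.Size([2, 4])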