# Non-Neural Information Retrieval

In [18]:
import datasets
import math
from functools import lru_cache
import numpy as np


In [2]:
nu_wikipedia = datasets.load_dataset("nuprl/engineering-llm-systems", "wikipedia-northeastern-university", split="test")
nu_wikipedia

test-00000-of-00001.parquet:   0%|          | 0.00/12.9M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2434 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 2434
})

In [12]:
nu_wikipedia[0]["title"], nu_wikipedia[200]["title"]

('British Columbia', 'Reggie Lewis')

In [65]:
def term_frequency(document: str, term: str):
    return document.count(term)

@lru_cache(maxsize=None)
def inverse_document_frequency(term: str):
    num_docs_with_term = sum(1 for item in nu_wikipedia if term in item["text"])
    return math.log(len(nu_wikipedia) / (1 + num_docs_with_term))

In [66]:
print("Science", inverse_document_frequency("science"))
print("Basketball", inverse_document_frequency("basketball"))
print("Northeastern", inverse_document_frequency("Northeastern"))
print("co-op", inverse_document_frequency("co-op"))
print("Beanpot", inverse_document_frequency("Beanpot"))
print("Khoury", inverse_document_frequency("Khoury"))

Science 1.655253867960116
Basketball 2.25211382906791
Northeastern -0.0004107619692176559
co-op 3.9686498770583776
Beanpot 3.9686498770583776
Khoury 4.801558999993482


In [101]:
def term_frequency(document: str, term: str):
    c = document.count(term)
    return 0 if c == 0 else 1 + math.log(c)
    # return document.count(term)

@lru_cache(maxsize=None)
def inverse_document_frequency(term: str):
    num_docs_with_term = sum(1 for item in nu_wikipedia if term in item["text"])
    return math.log(len(nu_wikipedia) / (1 + num_docs_with_term))

def compute_tf_idf_vector_unnormalized(terms, document: str):
    return [ term_frequency(document, term) * inverse_document_frequency(term) for term in terms ]

def compute_tf_idf_vector(terms, document: str):
    vec = compute_tf_idf_vector_unnormalized(terms, document)
    return vec

def compute_cosine_similarity(vec1, vec2):
    vec1_norm = np.linalg.norm(vec1)
    vec2_norm = np.linalg.norm(vec2)

    if vec1_norm == 0 or vec2_norm == 0:
        return 0
    
    return np.dot(vec1, vec2) / (vec1_norm * vec2_norm)


### Examples

In [84]:
computer_docs = [ item["text"] for item in nu_wikipedia if "computer" in item["text"] ]

In [85]:
compute_tf_idf_vector_unnormalized(["Northeastern", "computer", "science"], computer_docs[0])

[-0.0004107619692176559, 3.9725513470329585, 5.768405211541517]

In [86]:
compute_tf_idf_vector_unnormalized(["Northeastern", "computer", "science"], computer_docs[1])

[-0.0004107619692176559, 3.9725513470329585, 0.0]

In [87]:
compute_tf_idf_vector_unnormalized(["Northeastern", "computer", "science"], "computer computer computer computer computer computer"), compute_tf_idf_vector_unnormalized(["Northeastern", "computer", "science"], "computer computer computer computer computer computer computer computer computer computer computer computer computer computer computer computer computer computer")

([-0.0, 6.550173527387139, 0.0], [-0.0, 9.12779570774132, 0.0])

In [88]:
compute_tf_idf_vector(["Northeastern", "computer", "science"], "computer computer computer computer computer computer"), compute_tf_idf_vector(["Northeastern", "computer", "science"], "computer computer computer computer computer computer computer computer computer computer computer computer computer computer computer computer computer")

(array([-0.,  1.,  0.]), array([-0.,  1.,  0.]))

## Continued

In [102]:
def rank_by_tf_idf(query: str):
    query_vec = compute_tf_idf_vector(query.split(), query)
    return sorted(nu_wikipedia, key=lambda x: compute_cosine_similarity(query_vec, compute_tf_idf_vector(query.split(), x["text"])), reverse=True)

In [105]:
cs_docs = rank_by_tf_idf("computer science")

In [106]:
for item in cs_docs[:10]:
    print(item["title"], item["url"])


Northeastern University (China) https://en.wikipedia.org/wiki/Northeastern%20University%20%28China%29
Eduardo D. Sontag https://en.wikipedia.org/wiki/Eduardo%20D.%20Sontag
Georg Hajdu https://en.wikipedia.org/wiki/Georg%20Hajdu
University of Alabama in Huntsville shooting https://en.wikipedia.org/wiki/University%20of%20Alabama%20in%20Huntsville%20shooting
Richard W. Ziolkowski https://en.wikipedia.org/wiki/Richard%20W.%20Ziolkowski
Erich Neuwirth https://en.wikipedia.org/wiki/Erich%20Neuwirth
Marianne Schmid Mast https://en.wikipedia.org/wiki/Marianne%20Schmid%20Mast
Scheme (programming language) https://en.wikipedia.org/wiki/Scheme%20%28programming%20language%29
Weston, Massachusetts https://en.wikipedia.org/wiki/Weston%2C%20Massachusetts
Law school in the United States https://en.wikipedia.org/wiki/Law%20school%20in%20the%20United%20States


In [109]:
cs_profs = rank_by_tf_idf("Northeastern University computer science professor")
for item in cs_profs[:20]:
    print(item["title"], item["url"])

Weston, Massachusetts https://en.wikipedia.org/wiki/Weston%2C%20Massachusetts
Ronald Williams https://en.wikipedia.org/wiki/Ronald%20Williams
Georg Hajdu https://en.wikipedia.org/wiki/Georg%20Hajdu
Arthur Bronwell https://en.wikipedia.org/wiki/Arthur%20Bronwell
Eindhoven University of Technology https://en.wikipedia.org/wiki/Eindhoven%20University%20of%20Technology
Massachusetts Institute of Technology https://en.wikipedia.org/wiki/Massachusetts%20Institute%20of%20Technology
Rupal Patel (scientist) https://en.wikipedia.org/wiki/Rupal%20Patel%20%28scientist%29
Erich Neuwirth https://en.wikipedia.org/wiki/Erich%20Neuwirth
Matthias Felleisen https://en.wikipedia.org/wiki/Matthias%20Felleisen
Deaths in January 2020 https://en.wikipedia.org/wiki/Deaths%20in%20January%202020
Northeastern University https://en.wikipedia.org/wiki/Northeastern%20University
List of Arab Americans https://en.wikipedia.org/wiki/List%20of%20Arab%20Americans
University of Massachusetts Boston https://en.wikipedia.or

In [110]:
cs_profs = rank_by_tf_idf("Tell me who teaches at Khoury College?")
for item in cs_profs[:20]:
    print(item["title"], item["url"])

Tina Eliassi-Rad https://en.wikipedia.org/wiki/Tina%20Eliassi-Rad
Middle Eastern Americans https://en.wikipedia.org/wiki/Middle%20Eastern%20Americans
List of Arab Americans https://en.wikipedia.org/wiki/List%20of%20Arab%20Americans
List of Lebanese Americans https://en.wikipedia.org/wiki/List%20of%20Lebanese%20Americans
List of Northeastern University people https://en.wikipedia.org/wiki/List%20of%20Northeastern%20University%20people
List of Lebanese people https://en.wikipedia.org/wiki/List%20of%20Lebanese%20people
Northeastern University https://en.wikipedia.org/wiki/Northeastern%20University
Nader Tehrani https://en.wikipedia.org/wiki/Nader%20Tehrani
Amin Khoury https://en.wikipedia.org/wiki/Amin%20Khoury
Khoury College of Computer Sciences https://en.wikipedia.org/wiki/Khoury%20College%20of%20Computer%20Sciences
Matthias Felleisen https://en.wikipedia.org/wiki/Matthias%20Felleisen
Andrea Grimes Parker https://en.wikipedia.org/wiki/Andrea%20Grimes%20Parker
Carla Brodley https://en.w

## Neural Information Retrieval

In [111]:
import torch
from transformers import AutoTokenizer, AutoModel

In [112]:
model = AutoModel.from_pretrained("answerdotai/ModernBERT-base")
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

  Referenced from: <EB3FF92A-5EB1-3EE8-AF8B-5923C1265422> /Users/arjun/miniconda3/lib/python3.11/site-packages/torchvision/image.so
  warn(


model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [113]:
model

ModernBertModel(
  (embeddings): ModernBertEmbeddings(
    (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (drop): Dropout(p=0.0, inplace=False)
  )
  (layers): ModuleList(
    (0): ModernBertEncoderLayer(
      (attn_norm): Identity()
      (attn): ModernBertAttention(
        (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
        (rotary_emb): ModernBertRotaryEmbedding()
        (Wo): Linear(in_features=768, out_features=768, bias=False)
        (out_drop): Identity()
      )
      (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): ModernBertMLP(
        (Wi): Linear(in_features=768, out_features=2304, bias=False)
        (act): GELUActivation()
        (drop): Dropout(p=0.0, inplace=False)
        (Wo): Linear(in_features=1152, out_features=768, bias=False)
      )
    )
    (1-21): 21 x ModernBertEncoderLayer(
      (attn_norm): LayerNorm((768,), eps=1e-05, e

In [149]:
with torch.no_grad():
    query_vec = model(**tokenizer("Who teaches at Khoury college?", return_tensors="pt")).last_hidden_state[0, 0]

In [150]:
with torch.no_grad():
    tina_vec = model(**tokenizer(cs_profs[0]["text"], return_tensors="pt")).last_hidden_state[0, 0]

In [151]:
with torch.no_grad():
    mideast_american_vec = model(**tokenizer(cs_profs[1]["text"], return_tensors="pt", truncation=True)).last_hidden_state[0, 0]

In [152]:
with torch.no_grad():
    nader_vec = model(**tokenizer(cs_profs[7]["text"], return_tensors="pt", truncation=True)).last_hidden_state[0, 0]

In [153]:
compute_cosine_similarity(query_vec.numpy(), tina_vec.numpy()), compute_cosine_similarity(query_vec.numpy(), mideast_american_vec.numpy())

(0.38015804, 0.2078781)

In [154]:
compute_cosine_similarity(query_vec.numpy(), nader_vec.numpy())

0.32515612

In [155]:
with torch.no_grad():
    query_vec = model(**tokenizer("Where is the nearest restaurant from Khoury College?", return_tensors="pt")).last_hidden_state[0, -1]

In [156]:
compute_cosine_similarity(query_vec.numpy(), tina_vec.numpy()), compute_cosine_similarity(query_vec.numpy(), mideast_american_vec.numpy()), compute_cosine_similarity(query_vec.numpy(), nader_vec.numpy())

(0.05845735, -0.11330267, 0.0133042075)

In [147]:
tokenizer("Hello"), tokenizer("I am going.")

({'input_ids': [50281, 12092, 50282], 'attention_mask': [1, 1, 1]},
 {'input_ids': [50281, 42, 717, 1469, 15, 50282], 'attention_mask': [1, 1, 1, 1, 1, 1]})

In [148]:
tokenizer

PreTrainedTokenizerFast(name_or_path='answerdotai/ModernBERT-base', vocab_size=50280, model_max_length=8192, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("|||IP_ADDRESS|||", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	1: AddedToken("<|padding|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50254: AddedToken("                        ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50255: AddedToken("                       ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50256: AddedToken("                      ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50257: AddedToken("                    