# Non-Neural Information Retrieval

In [27]:
import datasets
import math
from functools import lru_cache


In [11]:
# Load the "test" split of the "wikipedia-northeastern-university" configuration
# of the nuprl/engineering-llm-systems dataset. Each row exposes the fields
# 'id', 'url', 'title' and 'text' (see the Dataset repr displayed below).
nu_wikipedia = datasets.load_dataset("nuprl/engineering-llm-systems", "wikipedia-northeastern-university", split="test")
# Bare last expression so the notebook displays the dataset summary.
nu_wikipedia

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 2434
})

In [12]:
# Spot-check the corpus by displaying the titles of two rows (index 0 and 200).
nu_wikipedia[0]["title"], nu_wikipedia[200]["title"]

('British Columbia', 'Reggie Lewis')

In [16]:
def term_frequency(document: str, term: str):
    """Count how many times ``term`` appears in ``document``.

    Matching is plain ``str.count`` substring matching: it is case-sensitive,
    counts non-overlapping occurrences, and also matches inside longer words
    (for example "science" is found within "sciences").
    """
    occurrences = document.count(term)
    return occurrences

def term_frequency_query(document: str, query: str):
    """Total term frequency of a whitespace-separated ``query`` in ``document``.

    The query is split on whitespace and the per-term counts returned by
    ``term_frequency`` are added together. A term that is repeated in the
    query contributes once per repetition.
    """
    total = 0
    for term in query.split():
        total += term_frequency(document, term)
    return total

def rank_by_term_frequency(query: str):
    """Return every row of ``nu_wikipedia`` ordered by raw query term frequency.

    Rows whose ``"text"`` contains the most occurrences of the query terms
    come first. The result is a new list; the dataset itself is not modified.
    """
    def score(article):
        # Sort key: summed occurrence count of all query terms in the article.
        return term_frequency_query(article["text"], query)

    return sorted(nu_wikipedia, key=score, reverse=True)

In [18]:
# Top ten articles for a single-word query, ranked by raw term frequency.
basketball_results = rank_by_term_frequency("basketball")[:10]
for article in basketball_results:
    print(article["title"], article["url"])


List of college rivalries in the United States https://en.wikipedia.org/wiki/List%20of%20college%20rivalries%20in%20the%20United%20States
Sydney Johnson https://en.wikipedia.org/wiki/Sydney%20Johnson
DeMatha Catholic High School https://en.wikipedia.org/wiki/DeMatha%20Catholic%20High%20School
Mike Jarvis https://en.wikipedia.org/wiki/Mike%20Jarvis
Jim Calhoun https://en.wikipedia.org/wiki/Jim%20Calhoun
Michael E. Long https://en.wikipedia.org/wiki/Michael%20E.%20Long
Manny Harris https://en.wikipedia.org/wiki/Manny%20Harris
Big East Conference (1979–2013) https://en.wikipedia.org/wiki/Big%20East%20Conference%20%281979%E2%80%932013%29
Jim Larrañaga https://en.wikipedia.org/wiki/Jim%20Larra%C3%B1aga
Shawn James https://en.wikipedia.org/wiki/Shawn%20James


In [19]:
# Top ten articles for a two-word query, still ranked by raw term frequency.
# Stored under its own name so the "basketball" ranking computed earlier is
# not overwritten by results for an unrelated query.
cs_tf_results = rank_by_term_frequency("computer science")[:10]
for item in cs_tf_results:
    print(item["title"], item["url"])


List of political scientists https://en.wikipedia.org/wiki/List%20of%20political%20scientists
Humanities https://en.wikipedia.org/wiki/Humanities
Massachusetts Institute of Technology https://en.wikipedia.org/wiki/Massachusetts%20Institute%20of%20Technology
Khoury College of Computer Sciences https://en.wikipedia.org/wiki/Khoury%20College%20of%20Computer%20Sciences
Middle Eastern Americans https://en.wikipedia.org/wiki/Middle%20Eastern%20Americans
Nu Rho Psi https://en.wikipedia.org/wiki/Nu%20Rho%20Psi
List of Indian Americans https://en.wikipedia.org/wiki/List%20of%20Indian%20Americans
Computer and network surveillance https://en.wikipedia.org/wiki/Computer%20and%20network%20surveillance
List of Massachusetts Institute of Technology alumni https://en.wikipedia.org/wiki/List%20of%20Massachusetts%20Institute%20of%20Technology%20alumni
List of Columbia College people https://en.wikipedia.org/wiki/List%20of%20Columbia%20College%20people


## Inverse Document Frequency

In [28]:
@lru_cache(maxsize=None)
def inverse_document_frequency(term: str):
    """Inverse document frequency of ``term`` over ``nu_wikipedia``.

    Computes ``log(N / n_t)`` (natural logarithm), where ``N`` is the number
    of rows in ``nu_wikipedia`` and ``n_t`` is the number of rows whose
    ``"text"`` contains ``term`` as a case-sensitive substring.

    Returns ``0.0`` when no row contains the term. Such a term also has a
    term frequency of zero in every document, so it cannot affect a ranking;
    returning zero avoids the ``ZeroDivisionError`` the bare formula would
    raise for a query word that is absent from the corpus.

    Results are memoized with ``lru_cache`` because every call scans the whole
    corpus. The cache is keyed on ``term`` only, so cached values are not
    refreshed if ``nu_wikipedia`` is later rebound to different data.
    """
    num_docs_with_term = sum(1 for item in nu_wikipedia if term in item["text"])
    if num_docs_with_term == 0:
        # Term never occurs: give it zero weight instead of dividing by zero.
        return 0.0
    return math.log(len(nu_wikipedia) / num_docs_with_term)

In [26]:
# Print the IDF of a handful of terms as (label, value) pairs. The first two
# labels are capitalized while the lookup itself uses the lowercase term.
for label, term in [
    ("Science", "science"),
    ("Basketball", "basketball"),
    ("Northeastern", "Northeastern"),
    ("co-op", "co-op"),
    ("Beanpot", "Beanpot"),
    ("Khoury", "Khoury"),
]:
    print(label, inverse_document_frequency(term))

Science 1.6574067213212171
Basketball 2.2560277283890464
Northeastern 0.0
co-op 3.9906287837771526
Beanpot 3.9906287837771526
Khoury 4.852852294381032


In [25]:
# Count the articles containing each term; equal counts give equal IDF values.
tuple(
    sum(1 for item in nu_wikipedia if term in item["text"])
    for term in ("Beanpot", "co-op")
)

(45, 45)

In [32]:
def compute_tf_idf_bad(document: str, query: str):
    """Score ``document`` against ``query`` as a single summed TF-IDF number.

    For each whitespace-separated query term, the term's frequency in the
    document is multiplied by its inverse document frequency, and the
    products are added up.

    NOTE(review): the ``_bad`` suffix presumably marks this as a naive
    baseline; the notebook does not state what is considered bad about it.
    """
    terms = query.split()
    weighted_counts = (
        term_frequency(document, term) * inverse_document_frequency(term)
        for term in terms
    )
    return sum(weighted_counts)

def rank_by_tf_idf_bad(query: str):
    """Return every row of ``nu_wikipedia`` ordered by ``compute_tf_idf_bad``.

    Rows with the highest summed TF-IDF score for ``query`` come first. The
    result is a new list; the dataset itself is not modified.
    """
    def score(article):
        # Sort key: summed TF-IDF of the query terms in the article text.
        return compute_tf_idf_bad(article["text"], query)

    return sorted(nu_wikipedia, key=score, reverse=True)


In [None]:
# Top ten articles for the two-word query, now ranked by summed TF-IDF.
cs_results = rank_by_tf_idf_bad("computer science")[:10]
for article in cs_results:
    print(article["title"], article["url"])


List of political scientists https://en.wikipedia.org/wiki/List%20of%20political%20scientists
Massachusetts Institute of Technology https://en.wikipedia.org/wiki/Massachusetts%20Institute%20of%20Technology
Khoury College of Computer Sciences https://en.wikipedia.org/wiki/Khoury%20College%20of%20Computer%20Sciences
Humanities https://en.wikipedia.org/wiki/Humanities
Middle Eastern Americans https://en.wikipedia.org/wiki/Middle%20Eastern%20Americans
List of Indian Americans https://en.wikipedia.org/wiki/List%20of%20Indian%20Americans
Computer and network surveillance https://en.wikipedia.org/wiki/Computer%20and%20network%20surveillance
List of Massachusetts Institute of Technology alumni https://en.wikipedia.org/wiki/List%20of%20Massachusetts%20Institute%20of%20Technology%20alumni
Nu Rho Psi https://en.wikipedia.org/wiki/Nu%20Rho%20Psi
List of University of Michigan faculty and staff https://en.wikipedia.org/wiki/List%20of%20University%20of%20Michigan%20faculty%20and%20staff


In [None]:
def compute_tf_idf_vector(terms, document: str):
    """Return the TF-IDF weights of ``document`` as a list, one per term.

    The i-th entry is the frequency of ``terms[i]`` in ``document`` multiplied
    by that term's inverse document frequency, so the output follows the
    order of ``terms``.
    """
    vector = []
    for term in terms:
        weight = term_frequency(document, term) * inverse_document_frequency(term)
        vector.append(weight)
    return vector




**Will complete next class.**