September 1, 2025
The engine builds an inverted index with a defaultdict, mapping each normalized token to the set of document names that contain it. Tokenization extracts lowercase word tokens with the regex \b\w+\b, so matches respect word boundaries and uppercase and lowercase are treated the same during queries. Each document's raw text is also stored in self.docs so it can be retrieved later. At query time, the engine tokenizes the query with the same regex and intersects the posting sets for each word, which gives AND semantics: a document matches only if it contains every query word. Finally, get_snippet returns a small piece of text around the first match.
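To make the shape of that mapping concrete, here is a minimal illustration (the file names and contents are invented for this example, not taken from the program itself):

import re
from collections import defaultdict

index = defaultdict(set)
docs = {
    "a.txt": "Cats chase mice.",
    "b.txt": "Dogs chase cats.",
}
for name, content in docs.items():
    for word in re.findall(r"\b\w+\b", content.lower()):
        index[word].add(name)

# index now holds:
# {"cats": {"a.txt", "b.txt"}, "chase": {"a.txt", "b.txt"},
#  "mice": {"a.txt"}, "dogs": {"b.txt"}}

The full class wraps this mapping together with the stored document text: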
import re
from collections import defaultdict


class Index:
    def __init__(self):
        self.index = defaultdict(set)   # token -> set of document names
        self.docs = {}                  # document name -> raw text

    # helper: return a list of words from text (normalized)
    def tokenize(self, text: str):
        """Return lowercase word tokens from text."""
        if not text:
            return []
        return re.findall(r"\b\w+\b", text.lower())

    def add_document(self, name: str, content: str):
        """Add a document to the index."""
        self.docs[name] = content
        for word in self.tokenize(content):
            self.index[word].add(name)

    def search(self, query: str):
        """Search for documents that contain all words in the query (AND)."""
        words = self.tokenize(query)
        if not words:
            return set()
        # one posting set per query word; a miss on any word means no results
        sets = [self.index.get(w, set()) for w in words]
        if not sets or any(len(s) == 0 for s in sets):
            return set()
        return set.intersection(*sets)

    def get_snippet(self, name: str, query: str, context: int = 40) -> str:
        """Return a short snippet around the first whole-word match for any query word/phrase."""
        content = self.docs.get(name, "")
        if not content:
            return ""
        q = query.strip()
        if not q:
            return ""
        # try a phrase match first (word boundaries), then any single token
        phrase_pat = re.compile(r"\b" + re.escape(q) + r"\b", flags=re.IGNORECASE)
        m = phrase_pat.search(content)
        if not m:
            parts = self.tokenize(q)
            if parts:
                pat = re.compile(r"\b(" + "|".join(re.escape(p) for p in parts) + r")\b", flags=re.IGNORECASE)
                m = pat.search(content)
        if not m:
            # fallback: the start of the document
            s = content.strip().replace("\n", " ")
            return s[:context] + ("..." if len(s) > context else "")
        start = max(0, m.start() - context)
        end = min(len(content), m.end() + context)
        snippet = content[start:end].replace("\n", " ")
        return ("..." if start > 0 else "") + snippet + ("..." if end < len(content) else "")
You can run the simple version of this program through search_documents(), which scans every document with a substring check like if query.lower() in doc.lower():, while the improved version does the same work through the Index class.
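A minimal sketch of that linear-scan helper (the exact signature of search_documents is not shown above, so the parameters here are assumptions):

def search_documents(query: str, docs: dict[str, str]) -> list[str]:
    """Return names of documents whose text contains the query, case-insensitively."""
    results = []
    for name, doc in docs.items():
        # substring check against the full text of every document
        if query.lower() in doc.lower():
            results.append(name)
    return results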
The simple version shows tractability in its most basic form (linear scan).
The improved version shows how indexing makes searching scale much better in practice.
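To see that gap in practice, here is a rough benchmark sketch using a synthetic corpus and the hypothetical search_documents above (an assumption for illustration, not measurements from the original program):

import time

# build a small synthetic corpus
docs = {f"doc{i}.txt": f"note {i} about topic {i % 50} and searching" for i in range(5000)}

engine = Index()
for name, content in docs.items():
    engine.add_document(name, content)

start = time.perf_counter()
for _ in range(100):
    search_documents("topic 7", docs)       # rescans every document each time
linear = time.perf_counter() - start

start = time.perf_counter()
for _ in range(100):
    engine.search("topic 7")                # two set lookups plus an intersection
indexed = time.perf_counter() - start

print(f"linear scan: {linear:.3f}s  indexed: {indexed:.3f}s")

The indexed search only touches the posting sets for the query words, so its cost depends on how many documents match rather than on the total amount of text stored.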