guides
Text Analytics and NLP
Tokenisation, n-grams, TF-IDF, and simple NLP pipelines.
Published May 30, 2026
Word Tokenisation
def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase alphabetic words.

    The text is lowercased first; every maximal run of characters for which
    ``str.isalpha()`` is true becomes one token. Digits, punctuation and
    whitespace all act as separators and never appear in the output.

    Args:
        text: The raw input string.

    Returns:
        The words in order of appearance; an empty list if *text* contains
        no alphabetic characters.
    """
    # Turn every separator into a space so that whitespace splitting yields
    # exactly the runs of letters (letters are never whitespace themselves).
    spaced = ''.join(ch if ch.isalpha() else ' ' for ch in text.lower())
    return spaced.split()
N-grams
def ngrams(tokens: list[str], n: int) -> list[tuple[str, ...]]:
    """Return every contiguous run of *n* tokens as a tuple.

    Args:
        tokens: The token sequence to slide the window over.
        n: Window size; must be at least 1.

    Returns:
        The n-grams in order of their starting position. The list is empty
        when *tokens* has fewer than *n* elements.

    Raises:
        ValueError: If *n* is less than 1. Without this check a zero or
            negative *n* would silently produce empty or arbitrarily sliced
            tuples instead of n-grams.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")
    # range() is empty when len(tokens) < n, so short inputs yield [].
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
TF-IDF Scoring
from math import log
def tfidf(term: str, doc: list[str], corpus: list[list[str]]) -> float:
    """Score *term* in *doc* by term frequency times inverse document frequency.

    The term frequency is the fraction of tokens in *doc* equal to *term*.
    The inverse document frequency is ``log(N / (df + 1))``, where ``N`` is
    the number of documents in *corpus* and ``df`` is how many of them
    contain *term*. Because of the ``+ 1`` in the denominator, a term that
    appears in every document gets a slightly negative score.

    Args:
        term: The token to score.
        doc: The tokenised document the term is scored against.
        corpus: All tokenised documents used for the document frequency.

    Returns:
        The TF-IDF score; 0.0 when *doc* is empty.

    Raises:
        ValueError: If *corpus* is empty, since the inverse document
            frequency is undefined (``log(0)``).
    """
    if not corpus:
        raise ValueError("corpus must contain at least one document")
    if not doc:
        # An empty document contains no occurrences of any term; returning
        # 0.0 avoids dividing by len(doc) == 0.
        return 0.0
    tf = doc.count(term) / len(doc)
    df = sum(1 for d in corpus if term in d)
    idf = log(len(corpus) / (df + 1))
    return tf * idf