guides

Text Analytics and NLP

Tokenisation, n-grams, TF-IDF, and simple NLP pipelines.

Published May 30, 2026

Word Tokenisation

def tokenize(text: str) -> list[str]:
    """Split ``text`` into lowercase alphabetic words.

    The text is lowercased first. Every character for which
    ``str.isalpha`` is false (digits, punctuation, whitespace, ...) acts
    as a separator and is discarded, so ``"don't"`` yields ``"don"`` and
    ``"t"``.

    Args:
        text: The raw text to split.

    Returns:
        The words in order of appearance; empty when ``text`` contains
        no alphabetic characters.
    """
    # Blank out every non-letter so that a plain whitespace split
    # recovers exactly the maximal runs of letters.
    letters_only = ''.join(
        ch if ch.isalpha() else ' ' for ch in text.lower()
    )
    return letters_only.split()

N-grams

def ngrams(tokens: list[str], n: int) -> list[tuple[str, ...]]:
    """Return every contiguous run of ``n`` tokens, in order.

    Args:
        tokens: The token sequence to slide the window over.
        n: Window size; must be at least 1.

    Returns:
        A list of ``n``-tuples, one per window position. The list is
        empty when ``tokens`` holds fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # A zero or negative width would make the slice below yield
        # empty or wrap-around tuples instead of failing, so reject it.
        raise ValueError(f"n must be a positive integer, got {n!r}")
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

TF-IDF Scoring

from math import log

def tfidf(term: str, doc: list[str], corpus: list[list[str]]) -> float:
    """Score ``term`` in ``doc`` by TF-IDF relative to ``corpus``.

    Term frequency is the share of tokens in ``doc`` equal to ``term``.
    Inverse document frequency is ``log(N / (df + 1))``, where ``N`` is
    the number of documents in ``corpus`` and ``df`` is how many of them
    contain ``term``. The ``+ 1`` keeps the denominator non-zero for
    terms that appear nowhere; it also makes the IDF factor negative
    when the term occurs in every document.

    Args:
        term: The token to score.
        doc: The tokenised document being scored.
        corpus: The tokenised documents used for document frequency.

    Returns:
        ``tf * idf``, or ``0.0`` when ``doc`` or ``corpus`` is empty.
    """
    # An empty document has no defined term frequency (division by
    # zero) and an empty corpus would need log(0); score both as 0.0.
    if not doc or not corpus:
        return 0.0
    tf = doc.count(term) / len(doc)
    df = sum(1 for d in corpus if term in d)
    idf = log(len(corpus) / (df + 1))
    return tf * idf