Loading Module…

🧠 NLP & Text Processing

24 topics • Click any card to expand

1. Text Cleaning & Preprocessing

Raw text is noisy. Learn to normalize case, remove punctuation, strip HTML, handle Unicode, and build reusable cleaning pipelines.

Basic text normalization
import re, string

text = '  Hello, World! This is NLP 101... Check <b>this</b> out!  '

# Normalization steps applied in sequence: lowercase, drop HTML tags,
# strip punctuation, then squeeze whitespace runs into single spaces.
_no_punct = str.maketrans('', '', string.punctuation)
clean = re.sub(r'<[^>]+>', '', text.lower())
clean = ' '.join(clean.translate(_no_punct).split())
print(repr(clean))
# Output: 'hello world this is nlp 101 check this out'
Regex-based cleaning patterns
import re

def clean_text(text):
    """Remove URLs, @mentions, hashtags, punctuation and digits from *text*,
    collapse whitespace, and return the result lowercased."""
    text = re.sub(r'http\S+|www\.\S+', '', text)      # URLs
    text = re.sub(r'@\w+', '', text)                  # @mentions
    text = re.sub(r'#\w+', '', text)                  # hashtags
    text = re.sub(r'[^\w\s]', '', text)               # punctuation
    text = re.sub(r'\d+', '', text)                   # digits
    text = re.sub(r'\s+', ' ', text).strip()          # whitespace
    return text.lower()

tweet = 'Check out https://example.com #NLP @user! 123 Great stuff!!!'
print(clean_text(tweet))
# Output: 'check out great stuff'
# (the \s+ pass collapses the gaps left by removed tokens to single spaces)
Unicode normalization and encoding fixes
import unicodedata

text = 'Café naïve résumé — quotes \u2018smart\u2019'

# Normalize to ASCII (strip accents)
def to_ascii(text):
    """Decompose accented characters (NFKD) and drop the combining marks."""
    nfkd = unicodedata.normalize('NFKD', text)
    return ''.join(c for c in nfkd if not unicodedata.combining(c))

# Normalize smart quotes/dashes to plain ASCII equivalents.
# The mapping keys must be quoted string literals — a bare \u2018 escape
# outside quotes is a SyntaxError.
def fix_quotes(text):
    replacements = {'\u2018': "'", '\u2019': "'", '\u201c': '"', '\u201d': '"',
                    '\u2013': '-', '\u2014': '-'}
    return ''.join(replacements.get(c, c) for c in text)

print(to_ascii(text))
print(fix_quotes(text))
Building a reusable cleaning pipeline
import re, string
from typing import List, Callable

def make_pipeline(*fns: Callable) -> Callable:
    """Compose *fns* left-to-right into one text-cleaning callable."""
    def pipeline(text: str) -> str:
        result = text
        for step in fns:
            result = step(result)
        return result
    return pipeline

# Translation table built once, reused by remove_punct on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

lowercase       = str.lower
remove_urls     = lambda t: re.sub(r'http\S+', '', t)
remove_punct    = lambda t: t.translate(_PUNCT_TABLE)
collapse_spaces = lambda t: ' '.join(t.split())

clean = make_pipeline(lowercase, remove_urls, remove_punct, collapse_spaces)

texts = ['Visit https://ai.com for more!', 'Hello, World!!', '  PYTHON  NLP  ']
print([clean(t) for t in texts])
💼 Real-World Scenario
A customer support team wants to preprocess thousands of support tickets before feeding them to a classifier. Tickets contain HTML, URLs, emojis, and inconsistent casing.
Real-World Code
import re, string

def preprocess_ticket(text: str) -> str:
    """Normalize one support ticket: drop HTML tags and URLs, strip
    non-ASCII (emoji etc.), lowercase, turn punctuation into spaces,
    and collapse whitespace."""
    without_html = re.sub(r'<[^>]+>', ' ', text)
    without_urls = re.sub(r'http\S+', '', without_html)
    ascii_only = without_urls.encode('ascii', 'ignore').decode()
    spaced = re.sub(r'[^\w\s]', ' ', ascii_only.lower())
    return ' '.join(spaced.split())

tickets = [
    '<p>My <b>order</b> #12345 is LATE! See https://track.com/12345</p>',
    'App crashed 😤 after update v2.1 — please fix ASAP!!!',
]
for t in tickets:
    print(preprocess_ticket(t))
🏋️ Practice: Email Cleaner
Write a function that strips email headers (From:, To:, Subject:), removes quoted lines starting with '>', and cleans leftover whitespace.
Starter Code
import re

email = '''
From: alice@example.com
To: bob@example.com
Subject: Project Update

Hi Bob,

> Thanks for the report
> it was helpful

Looks great! Let's sync tomorrow.
'''

def clean_email(text: str) -> str:
    """Exercise: strip header lines (From:/To:/Subject:), drop quoted
    lines beginning with '>', and collapse blank lines.

    Starter stub — returns None until the TODOs below are implemented.
    """
    # TODO: Remove header lines (From:, To:, Subject:)
    # TODO: Remove quoted lines (starting with >)
    # TODO: Collapse blank lines
    pass

print(clean_email(email))
✅ Practice Checklist
2. Tokenization & Stopword Removal

Tokenization splits text into meaningful units. Stopword removal filters common words that carry little semantic weight.

Word and sentence tokenization with NLTK
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import word_tokenize, sent_tokenize

# The 'Dr.' abbreviation and the contraction "Don't" exercise the
# tokenizers' non-trivial splitting rules.
text = 'Dr. Smith said NLP is fun. It really is! Don\'t you think?'

# Sentence-level split first, then word-level tokens.
sentences = sent_tokenize(text)
print('Sentences:', sentences)

words = word_tokenize(text)
print('Words:', words)
Stopword removal with NLTK
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Build a set for O(1) membership tests during filtering.
stop_words = set(stopwords.words('english'))

text = 'The quick brown fox jumps over the lazy dog'
tokens = word_tokenize(text.lower())
# isalpha() also drops punctuation-only tokens.
filtered = [w for w in tokens if w.isalpha() and w not in stop_words]

print('Original tokens:', tokens)
print('Filtered:', filtered)
Tokenization with spaCy
try:
    import spacy
    nlp = spacy.load('en_core_web_sm')
    text = 'Apple is looking at buying U.K. startup for $1 billion.'
    doc = nlp(text)
    # Each spaCy token carries a part-of-speech tag and a built-in
    # stopword flag alongside its text.
    tokens = [(t.text, t.pos_, t.is_stop) for t in doc]
    print('(token, POS, is_stop):')
    for tok in tokens:
        print(tok)
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    print('Run: python -m spacy download en_core_web_sm')
Subword tokenization with HuggingFace
try:
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    text = 'Tokenization handles out-of-vocabulary words cleverly.'
    # tokenize() shows the subword pieces; encode() maps to vocabulary ids
    # and decode() reconstructs the text from those ids.
    tokens = tokenizer.tokenize(text)
    ids = tokenizer.encode(text)

    print('Subword tokens:', tokens)
    print('Token IDs:', ids)
    print('Decoded:', tokenizer.decode(ids))
except ImportError:
    print('pip install transformers')
💼 Real-World Scenario
A search engine needs to index product reviews. Tokenize and filter stopwords to extract the keywords that matter for search relevance.
Real-World Code
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

STOP = set(stopwords.words('english'))

def extract_keywords(review: str, top_n: int = 5):
    """Return the top_n most frequent alphabetic, non-stopword tokens
    longer than 2 characters, as (word, count) pairs."""
    counts = Counter(
        word
        for word in word_tokenize(review.lower())
        if word.isalpha() and len(word) > 2 and word not in STOP
    )
    return counts.most_common(top_n)

review = 'The battery life is amazing. This phone has the best battery I have ever used. Great camera too.'
print(extract_keywords(review))
🏋️ Practice: Custom Stopword Filter
Extend the standard NLTK stopword list with domain-specific words (e.g., 'customer', 'product', 'order') and filter a list of reviews.
Starter Code
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Domain-specific words to filter in addition to NLTK's stopword list.
DOMAIN_STOPS = {'customer', 'product', 'order', 'item', 'purchase'}

def filter_tokens(text: str) -> list:
    """Exercise: tokenize *text* and drop both standard and domain stopwords.

    Starter stub — returns None until the TODOs below are implemented.
    """
    # TODO: combine NLTK stopwords + DOMAIN_STOPS
    # TODO: tokenize, lowercase, filter
    pass

reviews = [
    'Customer service was excellent, product quality amazing',
    'Order arrived late but item was in perfect condition',
]
for r in reviews:
    print(filter_tokens(r))
✅ Practice Checklist
3. Stemming & Lemmatization

Reduce words to their base forms. Stemming is fast but crude (running→run). Lemmatization is linguistically accurate (better→good).

Porter stemmer with NLTK
import nltk
from nltk.stem import PorterStemmer, SnowballStemmer

porter = PorterStemmer()
snowball = SnowballStemmer('english')

words = ['running', 'flies', 'happily', 'studies', 'beautiful', 'caring']

print(f'{'Word':<15} {'Porter':<15} {'Snowball':<15}')
for w in words:
    print(f'{w:<15} {porter.stem(w):<15} {snowball.stem(w):<15}')
WordNet lemmatizer with POS tags
import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

# POS matters: 'better' as ADJ -> 'good', as VERB -> 'better'
examples = [
    ('better',  'a'),   # adjective
    ('running', 'v'),   # verb
    ('geese',   'n'),   # noun
    ('happily', 'r'),   # adverb
]
for word, pos in examples:
    lem = lemmatizer.lemmatize(word, pos=pos)
    print(f'{word} ({pos}) -> {lem}')
Lemmatization with spaCy
try:
    import spacy
    nlp = spacy.load('en_core_web_sm')

    text = 'The children were running and the geese were flying'
    doc = nlp(text)

    # Header uses a double-quoted f-string: reusing the same quote
    # character inside a replacement field is a SyntaxError before 3.12.
    print(f"{'Token':<15} {'Lemma':<15} {'POS':<10}")
    for token in doc:
        print(f'{token.text:<15} {token.lemma_:<15} {token.pos_:<10}')
except OSError:
    print('Run: python -m spacy download en_core_web_sm')
Stemming vs Lemmatization comparison
import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Irregular plurals highlight where stemming fails but lemmatization works.
words = ['studies', 'studying', 'better', 'wolves', 'corpora', 'matrices']

# Header uses a double-quoted f-string: reusing the same quote character
# inside a replacement field is a SyntaxError before Python 3.12.
print(f"{'Word':<12} {'Stem':<12} {'Lemma (n)':<12}")
for w in words:
    stem = stemmer.stem(w)
    lemma = lemmatizer.lemmatize(w, pos='n')
    print(f'{w:<12} {stem:<12} {lemma:<12}')
💼 Real-World Scenario
A job board wants to match resumes to job postings regardless of tense or form — 'managed', 'manages', 'management' should all map to the same root concept.
Real-World Code
import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

lem = WordNetLemmatizer()
STOP = set(stopwords.words('english'))

def normalize_skills(text: str) -> set:
    """Lowercase and tokenize *text*, drop stopwords and non-alphabetic
    tokens, and lemmatize each survivor as a verb ('v') so tense variants
    ('managed'/'manages') collapse to one root."""
    tokens = word_tokenize(text.lower())
    return {lem.lemmatize(w, 'v') for w in tokens if w.isalpha() and w not in STOP}

job_req  = 'Managed budgets, leading teams, developed strategies'
resume   = 'manages budgets, leads cross-functional teams, develop product strategy'

# Match score = fraction of job keywords also present in the resume.
job_kw  = normalize_skills(job_req)
res_kw  = normalize_skills(resume)
overlap = job_kw & res_kw
print('Match score:', len(overlap) / len(job_kw))
print('Matched keywords:', overlap)
🏋️ Practice: Search Term Normalizer
Write a function that takes a search query and returns all lemma forms so a search for 'running shoes' also matches 'run' and 'shoe'.
Starter Code
import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lem = WordNetLemmatizer()

def normalize_query(query: str) -> list:
    """Exercise: expand a search query into its lemma forms (noun + verb).

    Starter stub — returns None until the TODO below is implemented.
    """
    # TODO: tokenize, lemmatize as both noun and verb, deduplicate
    pass

queries = ['running shoes', 'buying products', 'managed accounts']
for q in queries:
    print(q, '->', normalize_query(q))
✅ Practice Checklist
4. Named Entity Recognition (NER)

NER identifies and classifies named entities (persons, organizations, locations, dates) in text — essential for information extraction.

NER with spaCy
try:
    import spacy
    nlp = spacy.load('en_core_web_sm')

    text = 'Apple Inc. was founded by Steve Jobs in Cupertino on April 1, 1976.'
    doc = nlp(text)

    # doc.ents holds the detected entity spans; spacy.explain converts a
    # label code (e.g. 'ORG') into a human-readable description.
    print('Entities found:')
    for ent in doc.ents:
        print(f'  {ent.text:<25} {ent.label_:<12} {spacy.explain(ent.label_)}')
except OSError:
    print('Run: python -m spacy download en_core_web_sm')
NER with NLTK chunking
import nltk
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tree import Tree

# Classic NLTK pipeline: tokenize -> POS-tag -> named-entity chunk.
text = 'Barack Obama served as the 44th President of the United States.'
tokens = word_tokenize(text)
tagged = pos_tag(tokens)
chunks = ne_chunk(tagged)

# Named entities come back as Tree subtrees; plain (word, tag) tuples are
# non-entity tokens and are skipped by the isinstance check.
for subtree in chunks:
    if isinstance(subtree, Tree):
        entity = ' '.join(word for word, tag in subtree.leaves())
        print(f'{entity}: {subtree.label()}')
Visualizing entities with displacy
try:
    import spacy
    from spacy import displacy
    nlp = spacy.load('en_core_web_sm')

    text = 'Elon Musk founded SpaceX in Hawthorne, California in 2002.'
    doc = nlp(text)

    # In a Jupyter notebook this renders inline:
    # displacy.render(doc, style='ent')

    # Save to HTML: page=True wraps the markup in a full HTML document.
    html = displacy.render(doc, style='ent', page=True)
    print(html[:200], '...')  # Show snippet
    print('\nEntities:', [(e.text, e.label_) for e in doc.ents])
except OSError:
    print('Run: python -m spacy download en_core_web_sm')
Custom NER with spaCy EntityRuler
try:
    import spacy
    nlp = spacy.load('en_core_web_sm')

    # Add custom entity patterns. Inserting the ruler before='ner' means
    # its rule-based matches are assigned ahead of the statistical NER.
    ruler = nlp.add_pipe('entity_ruler', before='ner')
    patterns = [
        {'label': 'TECH_STACK', 'pattern': 'Python'},
        {'label': 'TECH_STACK', 'pattern': 'TensorFlow'},
        # Token-level pattern: matches the three tokens 'scikit', '-', 'learn'.
        {'label': 'TECH_STACK', 'pattern': [{'LOWER': 'scikit'}, {'LOWER': '-'}, {'LOWER': 'learn'}]},
    ]
    ruler.add_patterns(patterns)

    doc = nlp('We use Python and TensorFlow with scikit-learn for ML.')
    for ent in doc.ents:
        print(f'{ent.text}: {ent.label_}')
except OSError:
    print('Run: python -m spacy download en_core_web_sm')
💼 Real-World Scenario
A financial news aggregator wants to automatically tag articles with mentioned companies, CEOs, and market figures to power a search index.
Real-World Code
try:
    import spacy
    from collections import defaultdict
    nlp = spacy.load('en_core_web_sm')

    articles = [
        'Tesla CEO Elon Musk announced record deliveries in Q4 2024.',
        'Microsoft acquired Activision Blizzard for $68.7 billion.',
        'Warren Buffett increased Berkshire Hathaway stake in Apple.',
    ]

    # Inverted index: entity text -> list of article indices mentioning it.
    # Only the entity types relevant to financial search are kept.
    entity_index = defaultdict(list)
    for i, art in enumerate(articles):
        doc = nlp(art)
        for ent in doc.ents:
            if ent.label_ in ('ORG', 'PERSON', 'MONEY', 'DATE'):
                entity_index[ent.text].append(i)

    for entity, article_ids in entity_index.items():
        print(f'{entity}: articles {article_ids}')
except OSError:
    print('Run: python -m spacy download en_core_web_sm')
🏋️ Practice: Entity Frequency Counter
Process a list of news headlines, extract all PERSON and ORG entities, and return the top 5 most mentioned entities.
Starter Code
headlines = [
    'Google CEO Sundar Pichai unveils new AI products at Google I/O',
    'Apple and Google partner on health data standards',
    'Jeff Bezos steps down as Amazon CEO',
    'Amazon reports record profits under Andy Jassy',
    'Sundar Pichai defends Google search monopoly in court',
]

def top_entities(texts, n=5):
    """Exercise: return the n most frequently mentioned PERSON/ORG entities.

    Starter stub — returns None until the TODOs below are implemented.
    """
    # TODO: load spacy, extract PERSON + ORG entities
    # TODO: count frequencies, return top n
    pass

print(top_entities(headlines))
✅ Practice Checklist
5. Sentiment Analysis

Determine whether text expresses positive, negative, or neutral sentiment. Learn rule-based (VADER), ML-based, and transformer approaches.

VADER sentiment (rule-based)
import nltk
nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

texts = [
    'The food was absolutely amazing and the service was great!',
    'Terrible experience. Never going back.',
    'The product arrived on time.',
    'Not bad, but could be better.',
]

for text in texts:
    scores = sia.polarity_scores(text)
    # 'compound' is the overall score; this uses +/-0.05 as the neutral band.
    label = 'POSITIVE' if scores['compound'] > 0.05 else 'NEGATIVE' if scores['compound'] < -0.05 else 'NEUTRAL'
    print(f'{label}: {text[:40]:<40} | compound={scores["compound"]:.3f}')
TextBlob sentiment
try:
    from textblob import TextBlob

    reviews = [
        'Absolutely love this product! Best purchase ever.',
        'Disappointed. Quality is poor and shipping was slow.',
        'It is okay. Nothing special.',
    ]

    # TextBlob reports two axes: polarity (negative..positive) and
    # subjectivity (objective fact vs. subjective opinion).
    for review in reviews:
        blob = TextBlob(review)
        pol = blob.sentiment.polarity       # -1 to 1
        sub = blob.sentiment.subjectivity   # 0 (objective) to 1 (subjective)
        print(f'Polarity: {pol:+.2f}  Subjectivity: {sub:.2f}  | {review[:40]}')
except ImportError:
    print('pip install textblob')
Transformer-based sentiment with pipeline
try:
    from transformers import pipeline

    # NOTE: the first run downloads the model weights.
    sentiment = pipeline('sentiment-analysis',
                         model='distilbert-base-uncased-finetuned-sst-2-english')

    texts = [
        'I love this movie so much!',
        'This is the worst product I have ever bought.',
        'The package arrived in reasonable time.',
    ]
    # Passing a list scores all texts in one call; each result has a
    # 'label' and a confidence 'score'.
    results = sentiment(texts)
    for text, result in zip(texts, results):
        print(f'{result["label"]:<10} ({result["score"]:.3f}): {text}')
except ImportError:
    print('pip install transformers torch')
Aspect-based sentiment (simple rule approach)
import nltk
nltk.download('vader_lexicon', quiet=True)
nltk.download('punkt', quiet=True)      # required by nltk.sent_tokenize below
nltk.download('punkt_tab', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# Aspect name -> keywords whose presence marks a sentence as relevant.
ASPECTS = {
    'battery': ['battery', 'charge', 'charging', 'power'],
    'camera':  ['camera', 'photo', 'picture', 'image'],
    'screen':  ['screen', 'display', 'resolution'],
}

def aspect_sentiment(review):
    """Return {aspect: mean VADER compound score} over the sentences of
    *review* that mention that aspect's keywords. Aspects never mentioned
    are omitted from the result."""
    results = {}
    sentences = nltk.sent_tokenize(review)
    for aspect, keywords in ASPECTS.items():
        relevant = [s for s in sentences if any(k in s.lower() for k in keywords)]
        if relevant:
            scores = [sia.polarity_scores(s)['compound'] for s in relevant]
            results[aspect] = sum(scores) / len(scores)
    return results

review = 'Battery life is excellent! But the camera quality is disappointing. The screen is stunning.'
print(aspect_sentiment(review))
💼 Real-World Scenario
An e-commerce platform wants to automatically classify product reviews as positive/negative and flag negative ones for customer service follow-up.
Real-World Code
import nltk
nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from dataclasses import dataclass
from typing import List

@dataclass
class Review:
    # Review identifier, free-form text, and the product it refers to.
    id: int
    text: str
    product_id: str

sia = SentimentIntensityAnalyzer()

def triage_reviews(reviews: List[Review]):
    """Return (review, compound) pairs whose VADER compound score is
    below -0.3, sorted most-negative first for follow-up."""
    negative = []
    for r in reviews:
        score = sia.polarity_scores(r.text)['compound']
        if score < -0.3:
            negative.append((r, score))
    return sorted(negative, key=lambda x: x[1])  # worst first

reviews = [
    Review(1, 'Love it! Works perfectly.', 'P001'),
    Review(2, 'Completely broken. Total waste of money.', 'P002'),
    Review(3, 'Item never arrived. Terrible service!', 'P003'),
]
flagged = triage_reviews(reviews)
for review, score in flagged:
    print(f'Review {review.id} (score={score:.3f}): {review.text}')
🏋️ Practice: Sentiment Dashboard
Given a list of tweets, compute the daily average sentiment score and print a summary showing whether the day was overall positive or negative.
Starter Code
import nltk
nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# (date, tweet text) pairs — multiple tweets can share a date.
tweets = [
    ('2024-01-01', 'Great start to the new year! So excited!'),
    ('2024-01-01', 'Traffic was awful this morning.'),
    ('2024-01-02', 'Amazing concert last night!'),
    ('2024-01-02', 'Concert tickets were overpriced but show was okay'),
    ('2024-01-02', 'Best night ever, loved every minute!'),
]

def daily_sentiment(tweets):
    """Exercise: print each day's average compound score with a label.

    Starter stub — does nothing until the TODOs below are implemented.
    """
    sia = SentimentIntensityAnalyzer()
    # TODO: group by date, average compound scores
    # TODO: label each day as POSITIVE / NEGATIVE / NEUTRAL
    pass

daily_sentiment(tweets)
✅ Practice Checklist
6. Text Similarity & Vectorization

Convert text to numerical representations and measure similarity. Covers Bag-of-Words, TF-IDF, cosine similarity, and word embeddings.

TF-IDF vectorization with sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

corpus = [
    'the cat sat on the mat',
    'the dog lay on the rug',
    'cats and dogs are both great pets',
]

vec = TfidfVectorizer(stop_words='english')
X = vec.fit_transform(corpus)

# Rows = documents, columns = vocabulary terms, values = TF-IDF weights.
df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
print(df.round(3))
Cosine similarity between documents
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# NOTE(review): numpy is imported but unused in this snippet.
import numpy as np

docs = [
    'Python is great for data science',
    'Data science uses Python and R',
    'I love cooking Italian food',
    'Machine learning with Python and sklearn',
]

vec = TfidfVectorizer(stop_words='english')
X = vec.fit_transform(docs)
# Symmetric matrix; each diagonal entry is a document's similarity to itself.
sim = cosine_similarity(X)

print('Similarity matrix:')
for i, row in enumerate(sim):
    print(f'Doc {i}: {[f"{v:.2f}" for v in row]}')
Word2Vec embeddings with gensim
try:
    from gensim.models import Word2Vec
    import nltk
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    from nltk.tokenize import word_tokenize

    # Tiny toy corpus — real embeddings need far more data; results here
    # are illustrative only.
    sentences = [
        'king is a powerful man',
        'queen is a powerful woman',
        'boy is a young man',
        'girl is a young woman',
    ]
    tokenized = [word_tokenize(s) for s in sentences]
    model = Word2Vec(tokenized, vector_size=50, window=3, min_count=1, epochs=100)

    # Classic word vector arithmetic
    result = model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=3)
    print('king + woman - man =', result)
    print('Similarity(king, queen):', model.wv.similarity('king', 'queen'))
except ImportError:
    print('pip install gensim')
Sentence embeddings with sentence-transformers
try:
    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity

    model = SentenceTransformer('all-MiniLM-L6-v2')

    sentences = [
        'A man is playing guitar.',
        'Someone is strumming a musical instrument.',
        'A cat is sitting on the couch.',
    ]

    # One embedding vector per sentence; cosine similarity compares them.
    embeddings = model.encode(sentences)
    sim = cosine_similarity(embeddings)

    # Print only the upper triangle — the matrix is symmetric.
    print('Semantic similarity:')
    for i in range(len(sentences)):
        for j in range(i+1, len(sentences)):
            print(f'  [{i}] vs [{j}]: {sim[i,j]:.3f}')
except ImportError:
    print('pip install sentence-transformers')
💼 Real-World Scenario
A legal tech company wants to find duplicate or near-duplicate contract clauses across thousands of documents to detect plagiarism.
Real-World Code
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# NOTE(review): numpy is imported but unused in this snippet.
import numpy as np

clauses = [
    'The contractor shall deliver all work by the agreed deadline.',
    'All deliverables must be submitted by the agreed-upon deadline.',
    'Payment shall be made within 30 days of invoice receipt.',
    'The client agrees to pay within thirty days of receiving the invoice.',
    'Confidential information must not be disclosed to third parties.',
]

# Unigrams + bigrams give phrase-level overlap a higher weight.
vec = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
X = vec.fit_transform(clauses)
sim = cosine_similarity(X)

THRESHOLD = 0.5
print('Near-duplicate pairs (similarity > 0.5):')
# Upper-triangle scan avoids reporting each pair twice.
for i in range(len(clauses)):
    for j in range(i+1, len(clauses)):
        if sim[i, j] > THRESHOLD:
            print(f'  [{i}] & [{j}]: {sim[i,j]:.3f}')
            print(f'    {clauses[i][:60]}')
            print(f'    {clauses[j][:60]}')
🏋️ Practice: FAQ Matcher
Build a simple FAQ bot: given a user question, find the most similar FAQ entry using TF-IDF cosine similarity.
Starter Code
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# (question, answer) pairs — the question text is what gets matched.
faqs = [
    ('How do I reset my password?', 'Go to login page and click Forgot Password.'),
    ('What payment methods are accepted?', 'We accept Visa, Mastercard and PayPal.'),
    ('How long does shipping take?', 'Standard shipping takes 5-7 business days.'),
    ('Can I return an item?', 'Yes, returns are accepted within 30 days.'),
]

def find_answer(question: str) -> str:
    """Exercise: return the answer of the FAQ most similar to *question*.

    Starter stub — returns None until the TODOs below are implemented.
    """
    # TODO: vectorize FAQ questions + user question
    # TODO: compute cosine similarity
    # TODO: return answer for most similar FAQ
    pass

print(find_answer('How can I change my password?'))
print(find_answer('Do you accept credit cards?'))
✅ Practice Checklist
7. Topic Modeling

Discover hidden thematic structure in document collections. Learn Latent Dirichlet Allocation (LDA) and Non-negative Matrix Factorization (NMF).

LDA topic modeling with sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# NOTE(review): numpy is imported but unused in this snippet.
import numpy as np

docs = [
    'baseball team pitcher bat home run stadium',
    'football touchdown quarterback field goal referee',
    'stock market shares dividends portfolio investor',
    'bitcoin ethereum blockchain cryptocurrency wallet',
    'machine learning neural network deep learning AI',
    'python data science pandas numpy statistics',
]

# LDA works on raw term counts, hence CountVectorizer (not TF-IDF).
vec = CountVectorizer(stop_words='english')
X = vec.fit_transform(docs)

lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(X)

feature_names = vec.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    # argsort()[-6:][::-1] = indices of the 6 highest-weight words, descending.
    top_words = [feature_names[j] for j in topic.argsort()[-6:][::-1]]
    print(f'Topic {i}: {top_words}')
NMF topic modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

docs = [
    'health fitness exercise gym workout training',
    'diet nutrition calories protein weight loss',
    'travel vacation flight hotel beach tourism',
    'passport visa travel destination adventure explore',
    'cooking recipe chef kitchen ingredients bake',
]

vec = TfidfVectorizer(stop_words='english', max_features=50)
X = vec.fit_transform(docs)

# Factorization X ~ W @ H: W = document-topic weights, H = topic-term weights.
nmf = NMF(n_components=3, random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

feature_names = vec.get_feature_names_out()
for i, topic in enumerate(H):
    top_words = [feature_names[j] for j in topic.argsort()[-5:][::-1]]
    print(f'Topic {i}: {top_words}')

print('\nDoc-topic assignments (W):')
for i, row in enumerate(W):
    # Each document is assigned its highest-weight topic.
    print(f'Doc {i}: topic {row.argmax()}')
Choosing number of topics with perplexity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Synthetic corpus
docs = (['python data science machine learning'] * 5 +
        ['football soccer game stadium team'] * 5 +
        ['stock market finance investment portfolio'] * 5)

vec = CountVectorizer(stop_words='english')
X = vec.fit_transform(docs)

# Lower perplexity = better fit; sweep k and keep the minimum.
perplexities = []
k_range = range(2, 7)
for k in k_range:
    lda = LatentDirichletAllocation(n_components=k, random_state=42, max_iter=20)
    lda.fit(X)
    # perplexity() is not cached, so compute it once per k instead of twice.
    perplexity = lda.perplexity(X)
    perplexities.append(perplexity)
    print(f'k={k}: perplexity={perplexity:.2f}')

best_k = k_range[np.argmin(perplexities)]
print(f'Best k: {best_k}')
Assigning topic labels to new documents
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# NOTE(review): numpy is imported but unused in this snippet.
import numpy as np

train_docs = [
    'python programming code software developer',
    'machine learning model training dataset',
    'basketball players championship game team',
    'tennis grand slam tournament court player',
]
# NOTE(review): LDA topic indices are arbitrary — these labels assume a
# particular topic order for this corpus/seed; re-check after retraining.
TOPIC_LABELS = {0: 'Technology', 1: 'Sports'}  # manual labels

vec = CountVectorizer(stop_words='english')
X_train = vec.fit_transform(train_docs)

lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(X_train)

new_docs = [
    'neural network deep learning GPU training',
    'football touchdown quarterback Super Bowl',
]
# transform() reuses the training vocabulary; unseen words are ignored.
X_new = vec.transform(new_docs)
topic_dist = lda.transform(X_new)
for doc, dist in zip(new_docs, topic_dist):
    label = TOPIC_LABELS.get(dist.argmax(), f'Topic {dist.argmax()}')
    print(f'{doc[:40]} -> {label} ({dist.max():.2f})')
💼 Real-World Scenario
A news publisher wants to automatically tag and categorize thousands of articles by topic to power content recommendations.
Real-World Code
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# NOTE(review): numpy is imported but unused in this snippet.
import numpy as np

articles = [
    'The government announced new climate change policy and carbon tax.',
    'Scientists discover breakthrough in quantum computing research.',
    'Stock markets rally as tech earnings exceed expectations.',
    'Premier League clubs prepare for summer transfer window.',
    'AI startup raises $500M for large language model development.',
    'Central bank raises interest rates to fight inflation.',
]

# NOTE(review): assumes LDA's discovered topics line up with these names
# in this order — not guaranteed; verify against the fitted topics.
TOPIC_NAMES = ['Politics/Environment', 'Technology/Science', 'Finance', 'Sports']

vec = CountVectorizer(stop_words='english', min_df=1)
X = vec.fit_transform(articles)

lda = LatentDirichletAllocation(n_components=4, random_state=42)
lda.fit(X)

# Tag each article with its highest-probability topic.
topic_dist = lda.transform(X)
for article, dist in zip(articles, topic_dist):
    dominant = dist.argmax()
    print(f'{TOPIC_NAMES[dominant]}: {article[:55]}')
🏋️ Practice: Customer Feedback Topics
Apply LDA to a dataset of customer feedback comments and print the top 5 words for each discovered topic.
Starter Code
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

feedback = [
    'Delivery was fast and packaging was excellent',
    'Shipping took too long and package was damaged',
    'Customer support was very helpful and responsive',
    'Support team was rude and unhelpful',
    'Product quality is amazing, well made and durable',
    'The product broke after one week, poor quality',
    'Price is reasonable for the quality you get',
    'Very expensive for what it is, not worth the money',
]

def model_topics(docs, n_topics=3):
    """Exercise: fit LDA on *docs* and print top words per topic.

    Starter stub — does nothing until the TODO below is implemented.
    """
    # TODO: CountVectorizer -> LDA -> print top words per topic
    pass

model_topics(feedback)
✅ Practice Checklist
8. Text Classification

Train models to classify text into categories. Covers Naive Bayes, Logistic Regression, and transformer-based fine-tuning pipelines.

Spam detection with Naive Bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Minimal spam dataset
texts = [
    'Win a FREE iPhone now! Click here!!!', 'URGENT: You have won $1000',
    'Claim your prize today, limited offer!', 'Hot singles in your area',
    'Meeting at 3pm in the conference room', 'Can you review my pull request?',
    'Lunch tomorrow? Let me know.', 'Project deadline is Friday.',
    'Budget report attached for your review', 'Hi, are you free this afternoon?',
]
labels = [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]  # 1=spam, 0=ham

# NOTE(review): with only 10 samples and no stratify=, the 3-row test
# split can under-represent a class — illustrative only.
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)

# Pipeline = vectorizer + classifier trained/applied as one unit.
clf = Pipeline([('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())])
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test), target_names=['ham', 'spam']))
Multi-class classification with Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# NOTE(review): numpy is imported but unused in this snippet.
import numpy as np

texts = [
    'The stock market crashed today', 'Interest rates affect mortgages',
    'Champions League final tonight', 'NBA playoffs heating up',
    'New deep learning model released', 'Python 4.0 features announced',
    'Election results pending', 'Senate votes on new bill',
]
labels = ['finance','finance','sports','sports','tech','tech','politics','politics']

pipe = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('lr',   LogisticRegression(max_iter=200))
])
# cv=2 keeps one example of each class per fold on this tiny dataset.
scores = cross_val_score(pipe, texts, labels, cv=2, scoring='accuracy')
print(f'CV accuracy: {scores.mean():.2f} ± {scores.std():.2f}')

# Refit on all data, then classify an unseen headline.
pipe.fit(texts, labels)
print(pipe.predict(['Bitcoin surges to all-time high']))
Zero-shot classification with transformers
# Zero-shot classification: an NLI model scores each candidate label against
# the text, so no task-specific training data is required.
try:
    from transformers import pipeline

    classifier = pipeline('zero-shot-classification',
                          model='facebook/bart-large-mnli')

    texts = [
        'The Federal Reserve raises interest rates by 25 basis points.',
        'Team wins championship after dramatic overtime goal.',
        'New AI model achieves human-level performance on benchmark.',
    ]
    candidate_labels = ['finance', 'sports', 'technology', 'politics']

    for text in texts:
        result = classifier(text, candidate_labels)
        # result['labels'] is sorted by descending score; index 0 is the best label.
        print(f'{result["labels"][0]:12} ({result["scores"][0]:.2f}): {text[:50]}')
except ImportError:
    print('pip install transformers torch')
Feature importance for text classifiers
# Inspect which vocabulary terms most strongly drive each class of a linear
# text classifier by reading the logistic-regression coefficients.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import numpy as np

texts = [
    'stock market bull bear portfolio dividends',
    'goal touchdown home run stadium championship',
    'algorithm neural network training dataset model',
    'vote senator election campaign policy',
] * 3
labels = ['finance','sports','tech','politics'] * 3

pipe = Pipeline([('tfidf', TfidfVectorizer()), ('lr', LogisticRegression(max_iter=200))])
pipe.fit(texts, labels)

feature_names = pipe['tfidf'].get_feature_names_out()
classes = pipe['lr'].classes_

# Each row of coef_ aligns with the corresponding entry of classes_.
for cls, coef in zip(classes, pipe['lr'].coef_):
    # argsort is ascending: take the last 5 indices, reversed for top-down order.
    top = [feature_names[i] for i in coef.argsort()[-5:][::-1]]
    print(f'{cls}: {top}')
💼 Real-World Scenario
A news website wants to automatically route incoming press releases to the correct editorial desk (Finance, Sports, Tech, Politics) without human triage.
Real-World Code
# Route incoming press releases to an editorial desk using a TF-IDF +
# Logistic Regression classifier trained on a handful of labeled headlines.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# (headline, desk) training pairs — two examples per desk.
training_data = [
    ('Company reports record quarterly earnings', 'finance'),
    ('Inflation rises as central bank meets', 'finance'),
    ('National team advances to World Cup final', 'sports'),
    ('Olympic gold medal for marathon runner', 'sports'),
    ('New open-source large language model released', 'tech'),
    ('Semiconductor company launches AI chip', 'tech'),
    ('Prime minister announces cabinet reshuffle', 'politics'),
    ('Senate approves infrastructure spending bill', 'politics'),
]

texts, labels = zip(*training_data)

router = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('lr',   LogisticRegression(max_iter=500))
])
router.fit(texts, labels)

press_releases = [
    'Startup raises $200M Series C for AI research',
    'Tennis star wins fourth Grand Slam title',
]
for pr in press_releases:
    desk = router.predict([pr])[0]
    # Highest class probability doubles as a routing confidence score.
    proba = router.predict_proba([pr]).max()
    print(f'{desk.upper()} ({proba:.0%}): {pr}')
🏋️ Practice: Review Sentiment Classifier
Train a TF-IDF + Logistic Regression classifier to predict sentiment labels (negative, neutral, positive) from review text — the buckets that star ratings 1-2, 3, and 4-5 would map to.
Starter Code
# Practice starter: sentiment classification of product reviews.
# The TODOs below are intentionally left for the learner to complete.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# (review text, sentiment label) pairs used as the toy dataset.
reviews = [
    ('Absolutely fantastic product, highly recommend!', 'positive'),
    ('Life changing purchase, best I have ever made', 'positive'),
    ('Pretty good but not perfect', 'neutral'),
    ('Does the job, nothing special', 'neutral'),
    ('Broken on arrival, terrible quality', 'negative'),
    ('Complete waste of money, do not buy', 'negative'),
    ('Great value for money, very happy', 'positive'),
    ('Disappointed, expected much better', 'negative'),
]

texts, labels = zip(*reviews)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)

# TODO: build Pipeline with TfidfVectorizer + LogisticRegression
# TODO: fit, predict, print classification_report
✅ Practice Checklist
9. Language Models & Transformers

Understand transformer architecture, use pre-trained BERT/GPT models for embeddings, question answering, and text generation.

BERT embeddings for semantic search
# Sentence similarity from raw BERT: embed each text with the final-layer
# [CLS] hidden state and compare embeddings by cosine similarity.
try:
    from transformers import AutoTokenizer, AutoModel
    import torch
    import torch.nn.functional as F

    model_name = 'bert-base-uncased'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    def get_embedding(text):
        """Return the final-layer [CLS] vector for *text* (no gradients kept)."""
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        # NOTE(review): raw-BERT [CLS] is a coarse sentence embedding; mean
        # pooling or a sentence-transformers model usually ranks better.
        return outputs.last_hidden_state[:, 0, :].squeeze()  # [CLS] token

    s1 = get_embedding('How do I reset my password?')
    s2 = get_embedding('Steps to change account password')
    s3 = get_embedding('Best pizza recipe')

    print('s1 vs s2:', F.cosine_similarity(s1.unsqueeze(0), s2.unsqueeze(0)).item())
    print('s1 vs s3:', F.cosine_similarity(s1.unsqueeze(0), s3.unsqueeze(0)).item())
except ImportError:
    print('pip install transformers torch')
Question answering with a pipeline
# Extractive question answering: the model selects an answer span from the
# provided context and reports a confidence score for that span.
try:
    from transformers import pipeline

    qa = pipeline('question-answering', model='distilbert-base-cased-distilled-squad')

    context = '''
    Python was created by Guido van Rossum and first released in 1991.
    It was designed with an emphasis on code readability and simplicity.
    Python supports multiple programming paradigms including procedural,
    object-oriented, and functional programming.
    '''

    questions = [
        'Who created Python?',
        'When was Python first released?',
        'What paradigms does Python support?',
    ]
    for q in questions:
        # The pipeline accepts a {'question', 'context'} dict and returns
        # the best answer span with its score.
        answer = qa({'question': q, 'context': context})
        print(f'Q: {q}')
        print(f'A: {answer["answer"]} (score: {answer["score"]:.3f})\n')
except ImportError:
    print('pip install transformers torch')
Text generation with GPT-2
# Open-ended text continuation with GPT-2.
try:
    from transformers import pipeline

    generator = pipeline('text-generation', model='gpt2', max_new_tokens=80)

    prompts = [
        'The future of artificial intelligence is',
        'Data science has transformed the way we',
    ]
    for prompt in prompts:
        # do_sample=True with temperature makes output nondeterministic run-to-run.
        result = generator(prompt, num_return_sequences=1, do_sample=True, temperature=0.7)
        print(f'Prompt: {prompt}')
        print(f'Generated: {result[0]["generated_text"]}\n')
except ImportError:
    print('pip install transformers torch')
Fine-tuning BERT for classification (skeleton)
# Skeleton for fine-tuning a transformer classifier; it tokenizes a batch but
# leaves the actual Trainer run commented out.
try:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from transformers import TrainingArguments, Trainer
    import torch

    model_name = 'distilbert-base-uncased'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # The new 2-label classification head starts untrained and only becomes
    # useful after fine-tuning.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Example: tokenize a batch
    texts = ['I love this product!', 'Terrible experience.']
    labels = [1, 0]
    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')

    # In real fine-tuning, wrap in a Dataset and use Trainer:
    # training_args = TrainingArguments(output_dir='./results', num_train_epochs=3)
    # trainer = Trainer(model=model, args=training_args, ...)
    # trainer.train()

    print('Model ready for fine-tuning')
    print('Tokenized input_ids shape:', encodings['input_ids'].shape)
except ImportError:
    print('pip install transformers torch datasets')
💼 Real-World Scenario
A SaaS company wants to build an internal knowledge base Q&A bot that finds answers from company documentation using semantic search.
Real-World Code
# FAQ-style semantic search: embed the knowledge-base questions once, then
# answer a user query by returning the answer of the most similar KB question.
try:
    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np

    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Knowledge base documents
    kb = [
        {'q': 'How do I reset my password?',
         'a': 'Go to Settings > Security > Reset Password and enter your email.'},
        {'q': 'What is the refund policy?',
         'a': 'Refunds are processed within 5-7 business days of approval.'},
        {'q': 'How do I cancel my subscription?',
         'a': 'Navigate to Billing > Subscription > Cancel Subscription.'},
    ]

    # Encode all KB questions up front so each query costs one encode call.
    kb_embeddings = model.encode([item['q'] for item in kb])

    def answer_question(user_q: str):
        """Return (best answer, similarity score) for *user_q*."""
        q_emb = model.encode([user_q])
        sims = cosine_similarity(q_emb, kb_embeddings)[0]
        best = sims.argmax()
        return kb[best]['a'], sims[best]

    for question in ['Change my account password', 'Get money back for purchase']:
        answer, conf = answer_question(question)
        print(f'Q: {question}')
        print(f'A: {answer} (confidence: {conf:.2f})\n')
except ImportError:
    print('pip install sentence-transformers')
🏋️ Practice: Summarizer Pipeline
Use a HuggingFace summarization pipeline to condense a long article into a 2-sentence summary and compare the word count reduction.
Starter Code
# Practice starter: summarize the article below and compare word counts.
# The TODOs are intentionally left for the learner to complete.
try:
    from transformers import pipeline

    summarizer = pipeline('summarization', model='facebook/bart-large-cnn')

    article = '''
    Artificial intelligence has made remarkable progress over the past decade.
    Large language models like GPT-4 and Claude can now generate coherent text,
    answer complex questions, write code, and even reason about abstract problems.
    These models are trained on vast amounts of internet text using self-supervised
    learning, allowing them to develop broad world knowledge. However, challenges
    remain around hallucination, bias, and alignment with human values. Researchers
    continue to work on making these systems safer and more reliable.
    '''

    # TODO: use summarizer to generate a short summary (max_length=60)
    # TODO: print original word count vs summary word count

except ImportError:
    print('pip install transformers torch')
✅ Practice Checklist
10. NLP Pipeline & Production

Combine NLP components into production-ready pipelines. Learn batching, caching, serving NLP models via API, and evaluation metrics.

End-to-end spaCy text analysis pipeline
# One-shot linguistic profile of a text using spaCy's small English model.
try:
    import spacy
    from collections import Counter
    nlp = spacy.load('en_core_web_sm')

    def analyze_text(text: str) -> dict:
        """Return word/sentence counts, entities, and top noun/verb lemmas."""
        doc = nlp(text)
        return {
            'word_count': len([t for t in doc if not t.is_punct]),
            'sentences':  len(list(doc.sents)),
            'entities':   [(e.text, e.label_) for e in doc.ents],
            'top_nouns':  Counter(t.lemma_ for t in doc if t.pos_ == 'NOUN').most_common(3),
            'top_verbs':  Counter(t.lemma_ for t in doc if t.pos_ == 'VERB').most_common(3),
        }

    text = 'Elon Musk launched SpaceX rockets in 2020. Tesla reported record profits in Q4.'
    result = analyze_text(text)
    for k, v in result.items():
        print(f'{k}: {v}')
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    print('Run: python -m spacy download en_core_web_sm')
Batch processing with nlp.pipe for efficiency
# Compare per-document processing against spaCy's batched nlp.pipe.
try:
    import spacy
    import time
    nlp = spacy.load('en_core_web_sm')

    texts = [f'Document {i}: Apple and Google are tech giants in Silicon Valley.' for i in range(50)]

    # Sequential processing
    t0 = time.time()
    results_seq = [nlp(t) for t in texts]
    t_seq = time.time() - t0

    # Batch processing with nlp.pipe
    t0 = time.time()
    results_batch = list(nlp.pipe(texts, batch_size=16))
    t_batch = time.time() - t0

    # NOTE(review): wall-clock speedup varies by machine and corpus size;
    # the ratio printed here is illustrative, not a benchmark.
    print(f'Sequential: {t_seq:.3f}s')
    print(f'Batch pipe: {t_batch:.3f}s')
    print(f'Speedup: {t_seq/t_batch:.1f}x')
except OSError:
    print('Run: python -m spacy download en_core_web_sm')
Serving NLP via FastAPI
# Run with: uvicorn app:app --reload
# pip install fastapi uvicorn

# The app source is kept as a string so this teaching snippet can simply
# print it; save the printed code as app.py to actually serve it.
FASTAPI_APP = '''
from fastapi import FastAPI
from pydantic import BaseModel
import nltk
nltk.download("vader_lexicon", quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer

app = FastAPI()
sia = SentimentIntensityAnalyzer()

class TextRequest(BaseModel):
    text: str

@app.post("/sentiment")
def analyze_sentiment(req: TextRequest):
    scores = sia.polarity_scores(req.text)
    label = "positive" if scores["compound"] > 0.05 else "negative" if scores["compound"] < -0.05 else "neutral"
    return {"label": label, "scores": scores}

@app.get("/health")
def health():
    return {"status": "ok"}
'''

print(FASTAPI_APP)
Evaluation: precision, recall, F1 for NER
from sklearn.metrics import precision_recall_fscore_support, classification_report

# BIO tagging evaluation example
# True entity spans vs predicted entity spans
# Gold vs. predicted entity spans, one set of (text, label) tuples per document.
true_entities = [
    {('Apple', 'ORG'), ('Tim Cook', 'PERSON'), ('Cupertino', 'GPE')},
    {('Google', 'ORG'), ('Sundar Pichai', 'PERSON')},
]
pred_entities = [
    {('Apple', 'ORG'), ('Tim Cook', 'PERSON')},     # missed Cupertino
    {('Google', 'ORG'), ('Sundar Pichai', 'PERSON'), ('Mountain View', 'GPE')},  # extra FP
]

def ner_metrics(true_list, pred_list):
    """Micro-averaged precision, recall and F1 over aligned gold/predicted span sets."""
    tp = fp = fn = 0
    for gold, pred in zip(true_list, pred_list):
        tp += len(gold & pred)   # spans predicted and correct
        fp += len(pred - gold)   # spurious predictions
        fn += len(gold - pred)   # missed gold spans
    prec = tp / (tp + fp) if tp + fp else 0
    rec  = tp / (tp + fn) if tp + fn else 0
    f1   = 2 * prec * rec / (prec + rec) if prec + rec else 0
    return {'precision': prec, 'recall': rec, 'f1': f1}

print(ner_metrics(true_entities, pred_entities))
💼 Real-World Scenario
A media monitoring company processes 10,000+ news articles daily. They need a batch NLP pipeline that extracts entities, classifies topics, and stores results in a structured format.
Real-World Code
# Batch article pipeline: topic classification (TF-IDF + LR) plus spaCy NER,
# emitting one JSON record per article.
try:
    import spacy
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    import json

    nlp = spacy.load('en_core_web_sm')

    # Train topic classifier
    train_texts = [
        'stock market profits earnings', 'championship game score',
        'AI model research launch',    'election vote policy senator',
    ]
    train_labels = ['finance', 'sports', 'tech', 'politics']
    topic_clf = Pipeline([('tfidf', TfidfVectorizer()), ('lr', LogisticRegression())])
    topic_clf.fit(train_texts, train_labels)

    # NOTE(review): process_article is defined but the demo loop below inlines
    # the same logic instead of calling it — consider using one or the other.
    def process_article(text: str) -> dict:
        """Return topic, entities and word count for a single article."""
        doc = nlp(text)
        topic = topic_clf.predict([text])[0]
        return {
            'topic': topic,
            'entities': [(e.text, e.label_) for e in doc.ents],
            'word_count': len([t for t in doc if not t.is_punct]),
        }

    articles = [
        'Tesla stock surges after record Q4 earnings report.',
        'Manchester City wins Premier League with last-minute goal.',
    ]
    results = list(nlp.pipe(articles))  # batch NER
    for text, doc in zip(articles, results):
        topic = topic_clf.predict([text])[0]
        print(json.dumps({'text': text[:40], 'topic': topic,
                          'entities': [(e.text, e.label_) for e in doc.ents]}, indent=2))
except OSError:
    print('Run: python -m spacy download en_core_web_sm')
🏋️ Practice: Text Analytics Report
Build a function that takes a list of documents and returns a JSON report with: total word count, unique entities, top 5 keywords (TF-IDF), and dominant sentiment.
Starter Code
# Practice starter: build a JSON analytics report over a document list.
# The TODOs are intentionally left for the learner to complete.
import nltk
nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np, json

documents = [
    'Apple launches revolutionary new iPhone with AI features.',
    'Google DeepMind achieves breakthrough in protein folding.',
    'Microsoft Azure reports 40% growth in cloud services revenue.',
]

def analytics_report(docs: list) -> dict:
    """Practice stub: return word count, top TF-IDF keywords and sentiment for *docs*."""
    sia = SentimentIntensityAnalyzer()
    # TODO: compute total word count
    # TODO: extract top 5 TF-IDF keywords
    # TODO: compute average sentiment label
    # TODO: return as dict
    pass

print(json.dumps(analytics_report(documents), indent=2))
✅ Practice Checklist
11. Information Extraction

Extract structured facts from unstructured text: named entities, relations, events, and key-value pairs using rule-based and model-based approaches.

Regex-based relation extraction
import re

text = '''
Elon Musk founded SpaceX in 2002. Jeff Bezos founded Amazon in 1994.
Tim Cook joined Apple in 1998 and became CEO in 2011.
Sam Altman was appointed CEO of OpenAI in 2019.
'''

# Pattern: Person + founded/joined/became + Org + in + Year
founded_pat = re.compile(
    r'([A-Z][a-z]+ [A-Z][a-z]+) (founded|joined|became \w+) ([A-Z][a-zA-Z]+) in (\d{4})'
)

# One dict per regex hit, keyed by the four capture groups.
relations = [
    {'person': m.group(1), 'relation': m.group(2), 'org': m.group(3), 'year': m.group(4)}
    for m in founded_pat.finditer(text)
]

for r in relations:
    print(f"{r['person']} --[{r['relation']}]--> {r['org']} ({r['year']})")
spaCy NER + dependency parsing for relations
# NER plus a minimal dependency-based relation extractor for acquisition verbs.
try:
    import spacy
    nlp = spacy.load('en_core_web_sm')
    text = 'Apple acquired Beats Electronics for 3 billion dollars in 2014. Google bought YouTube in 2006.'
    doc = nlp(text)
    print('Named Entities:')
    for ent in doc.ents:
        print(f'  {ent.text!r:25s} -> {ent.label_}')
    print('\nAcquisition relations (nsubj + dobj pattern):')
    for token in doc:
        if token.lemma_ in ('acquire', 'buy', 'purchase'):
            # Subject children sit left of the verb, object children right of it.
            subj = [t.text for t in token.lefts  if t.dep_ in ('nsubj', 'nsubjpass')]
            obj  = [t.text for t in token.rights if t.dep_ in ('dobj', 'attr')]
            if subj and obj:
                print(f'  {subj[0]} --[{token.text}]--> {obj[0]}')
except OSError:
    print('Run: python -m spacy download en_core_web_sm')
Template-based information extraction with regex groups
import re
from dataclasses import dataclass, field
from typing import List

@dataclass
class JobPosting:
    """Structured fields pulled from a free-text job posting."""
    title: str = ''
    company: str = ''
    location: str = ''
    salary: str = ''
    skills: List[str] = field(default_factory=list)

# Attribute name -> (regex, whether to strip the captured text).
_FIELD_PATTERNS = [
    ('title',    r'(?:Title|Position|Role):\s*(.+)', True),
    ('company',  r'Company:\s*(.+)', True),
    ('location', r'Location:\s*(.+)', True),
    ('salary',   r'Salary:\s*(\$[\d,]+ ?- ?\$[\d,]+|\$[\d,]+)', False),
]

def extract_job_info(text: str) -> JobPosting:
    """Extract title, company, location, salary and known skills from *text*."""
    job = JobPosting()
    for attr, pattern, strip in _FIELD_PATTERNS:
        m = re.search(pattern, text, re.I)
        if m:
            value = m.group(1)
            setattr(job, attr, value.strip() if strip else value)
    # Skill match is deliberately case-sensitive against the canonical names.
    skills_m = re.findall(r'\b(Python|SQL|Java|TensorFlow|PyTorch|Docker|Kubernetes|AWS|GCP)\b', text)
    job.skills = list(set(skills_m))
    return job

posting = '''
Title: Senior Data Scientist
Company: TechCorp Inc.
Location: San Francisco, CA
Salary: $150,000 - $200,000
Requirements: Python, SQL, TensorFlow, Docker, AWS experience preferred.
'''
job = extract_job_info(posting)
print(f'Title:    {job.title}')
print(f'Company:  {job.company}')
print(f'Location: {job.location}')
print(f'Salary:   {job.salary}')
print(f'Skills:   {sorted(job.skills)}')
Event extraction with keyword triggers
import re
from collections import defaultdict

# Simple event extraction using trigger words
EVENT_TRIGGERS = {
    'acquisition': ['acquired', 'bought', 'purchased', 'merged with', 'took over'],
    'funding':     ['raised', 'secured', 'received funding', 'closed round'],
    'launch':      ['launched', 'released', 'unveiled', 'announced', 'introduced'],
    'partnership': ['partnered', 'collaborated', 'teamed up', 'joined forces'],
}

# FIX: the whitespace now lives inside the optional unit group, so a bare
# amount like '$100' no longer captures a trailing space.
MONEY_RE = re.compile(r'\$[\d.,]+[BMK]?(?:\s*(?:billion|million|thousand))?', re.I)

def extract_events(sentences):
    """Scan *sentences* for event trigger phrases.

    Returns a list of dicts carrying the event type, the trigger phrase
    that fired, any money expressions in the sentence, and an 80-char
    snippet. At most one event per (sentence, event type) pair is emitted.
    """
    events = []
    for sent in sentences:
        sent_lower = sent.lower()
        for etype, triggers in EVENT_TRIGGERS.items():
            for trig in triggers:
                if trig in sent_lower:
                    money = MONEY_RE.findall(sent)
                    events.append({'type': etype, 'trigger': trig, 'money': money, 'text': sent[:80]})
                    break  # first matching trigger per event type is enough
    return events

news = [
    'Google acquired DeepMind for $500 million in 2014.',
    'OpenAI raised $6.6 billion in its latest funding round.',
    'Apple launched its Vision Pro headset at WWDC 2023.',
    'Meta and Microsoft partnered on enterprise AI solutions.',
]
for ev in extract_events(news):
    print(f"[{ev['type'].upper()}] trigger='{ev['trigger']}' money={ev['money']}")
    print(f"  {ev['text']}")
💼 Real-World Scenario
A legal tech company needs to extract contract metadata (parties, dates, amounts, obligations) from thousands of PDF contracts to populate a contract management system automatically.
Real-World Code
import re
from typing import Optional

def extract_contract_metadata(text: str) -> dict:
    """Pull parties, dates, dollar amounts and an obligation count from contract text."""
    result = {}
    # Parties follow the phrase 'between X and Y'
    party_m = re.search(r'between\s+([^,]+?)\s+and\s+([^,\.]+)', text, re.I)
    if party_m:
        result['party_1'], result['party_2'] = (g.strip() for g in party_m.groups())
    # Numeric (dd/mm/yyyy) or month-name dates; keep at most the first three
    result['dates'] = re.findall(
        r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4})\b',
        text, re.I)[:3]
    # Dollar amounts, optionally followed by a million/billion/thousand scale word
    result['amounts'] = re.findall(r'\$[\d,]+(?:\.\d{2})?(?:\s*(?:million|billion|thousand))?', text, re.I)
    # Count modal phrases that typically mark contractual obligations
    result['obligation_count'] = len(re.findall(r'\b(shall|must|will|agrees to|is required to)\b', text, re.I))
    return result

contract = '''
This agreement is entered into between Acme Corporation and Beta Ltd.
Effective January 15, 2024. The total value is $500,000.00.
Acme shall deliver the software by March 31, 2024.
Beta must pay within 30 days of invoice.
'''
print(extract_contract_metadata(contract))
🏋️ Practice: Resume Information Extractor
Write a function that extracts key resume fields from raw text: name (first line), email, phone, years of experience (parse 'X years' patterns), and programming languages mentioned from a predefined list. Test it on a sample resume string.
Starter Code
# Practice starter: extract structured fields from raw resume text.
# The TODOs are intentionally left for the learner to complete.
import re

# Closed list of languages to look for in the skills section.
LANGUAGES = ['Python', 'Java', 'C++', 'JavaScript', 'SQL', 'R', 'Go', 'Rust', 'Scala']

def extract_resume(text: str) -> dict:
    """Practice stub: return name, email, phone, experience and languages from *text*."""
    # TODO: extract name (first non-empty line)
    # TODO: extract email
    # TODO: extract phone
    # TODO: extract years of experience
    # TODO: extract mentioned programming languages
    pass

resume = '''
Jane Doe
jane.doe@email.com | +1-555-123-4567
5 years of experience in data engineering.
Skills: Python, SQL, Scala, Apache Spark.
'''
print(extract_resume(resume))
✅ Practice Checklist
12. Machine Translation & Seq2Seq

Understand encoder-decoder architectures, attention mechanisms, and BLEU scoring. Implement simple character-level and word-level translation concepts.

BLEU score calculation from scratch
from collections import Counter
import math

def ngram_counts(tokens, n):
    """Counter of all n-grams (as tuples) contained in *tokens*."""
    return Counter(tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1))

def bleu_score(reference: str, hypothesis: str, max_n: int = 4) -> float:
    """Sentence-level BLEU with clipped n-gram precisions up to *max_n*.

    Prints a small report and returns the score. As in standard
    (unsmoothed) BLEU, the result is exactly 0.0 whenever any n-gram
    precision is zero.
    """
    ref_tokens  = reference.lower().split()
    hyp_tokens  = hypothesis.lower().split()
    if not hyp_tokens:
        return 0.0
    # Brevity penalty: punish hypotheses shorter than the reference
    bp = min(1.0, math.exp(1 - len(ref_tokens)/len(hyp_tokens)))
    scores = []
    for n in range(1, max_n + 1):
        ref_ng  = ngram_counts(ref_tokens, n)
        hyp_ng  = ngram_counts(hyp_tokens, n)
        clipped = sum(min(c, ref_ng[ng]) for ng, c in hyp_ng.items())
        total   = sum(hyp_ng.values())
        scores.append(clipped / total if total else 0.0)
    # FIX: any zero precision makes unsmoothed BLEU exactly 0. The old
    # -999 log sentinel produced a tiny nonzero garbage value instead.
    if min(scores) > 0:
        bleu = bp * math.exp(sum(math.log(s) for s in scores) / max_n)
    else:
        bleu = 0.0
    print(f'Reference:  {reference}')
    print(f'Hypothesis: {hypothesis}')
    print(f'N-gram precisions: {[round(s,3) for s in scores]}')
    print(f'BLEU-{max_n}: {bleu:.4f}')
    return bleu

bleu_score(
    'The cat sat on the mat',
    'The cat is on the mat'
)
bleu_score(
    'The cat sat on the mat',
    'A dog lay on a rug'
)
Simple encoder-decoder concept with embeddings
import numpy as np

# Demonstrate encoder-decoder idea without deep learning framework
np.random.seed(42)

# Toy vocabulary
vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, 'hello': 3, 'world': 4,
         'hola': 5, 'mundo': 6, 'bonjour': 7, 'monde': 8}
idx2word = {v: k for k, v in vocab.items()}

# Random embeddings (in practice: learned)
EMB_DIM = 8
embeddings = np.random.randn(len(vocab), EMB_DIM) * 0.1

def encode(sentence: str) -> np.ndarray:
    """Toy 'encoder': average the token embeddings of *sentence*."""
    tokens = [vocab.get(w, 0) for w in sentence.lower().split()]  # OOV maps to <pad> (id 0)
    vecs = [embeddings[t] for t in tokens]
    return np.mean(vecs, axis=0)  # mean pooling = context vector

def cosine_sim(a, b):
    """Cosine similarity with a small epsilon to avoid division by zero."""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8)

src = encode('hello world')
print('Encoder output (context vector):', src.round(3))

# In real seq2seq: decoder generates target tokens one by one
# conditioned on context vector + previous generated token
print('\nEncoder-Decoder flow:')
print('  Source: hello world')
print('  Encoder -> context vector (shape:', src.shape, ')')
print('  Decoder: <sos> -> hola -> mundo -> <eos>')
print('  At each step: P(word | context, prev_token)')
Attention mechanism visualization
import numpy as np
import matplotlib
matplotlib.use('Agg')  # headless backend: render straight to file, no display
import matplotlib.pyplot as plt

# Simulate attention weights for 'The cat sat on the mat' -> 'Le chat etait sur le tapis'
np.random.seed(42)
src_words = ['The', 'cat', 'sat', 'on', 'the', 'mat']
tgt_words = ['Le', 'chat', 'etait', 'sur', 'le', 'tapis']

# Simulate attention weights (in practice: softmax(Q @ K.T / sqrt(d_k)))
# Diagonal-dominant = good alignment
raw = np.random.rand(6, 6)
# Make it more diagonal (word alignments)
for i in range(6):
    raw[i, i] += 2.0
raw[0, 4] += 1.0  # 'the' aligns with 'le'
# Row-wise softmax: each target word's attention over the source sums to 1.
attention = np.exp(raw) / np.exp(raw).sum(axis=1, keepdims=True)

fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(attention, cmap='Blues', vmin=0, vmax=1)
ax.set_xticks(range(6)); ax.set_xticklabels(src_words, rotation=45, ha='right')
ax.set_yticks(range(6)); ax.set_yticklabels(tgt_words)
ax.set_xlabel('Source'); ax.set_ylabel('Target')
ax.set_title('Attention Weights')
plt.colorbar(im, ax=ax); plt.tight_layout()
plt.savefig('attention_weights.png', dpi=80); plt.close()
print('Saved attention_weights.png')
print('Attention row sums:', attention.sum(axis=1).round(3))
Hugging Face translation pipeline
# English-to-French translation with a pretrained Marian (Helsinki-NLP) model.
try:
    from transformers import pipeline
    # Zero-shot: use a pretrained translation model
    translator = pipeline('translation_en_to_fr', model='Helsinki-NLP/opus-mt-en-fr')
    sentences = [
        'Machine learning is transforming healthcare.',
        'The quick brown fox jumps over the lazy dog.',
        'Data science requires statistics, programming, and domain knowledge.',
    ]
    for sent in sentences:
        result = translator(sent, max_length=128)[0]['translation_text']
        print(f'EN: {sent}')
        print(f'FR: {result}\n')
except ImportError:
    # Fall back to showing what the output would look like.
    print('pip install transformers sentencepiece')
    print('\nExample output:')
    print('EN: Machine learning is transforming healthcare.')
    print('FR: L apprentissage automatique transforme les soins de sante.')
💼 Real-World Scenario
A global e-commerce platform needs to auto-translate product descriptions from English to 5 languages, validate translation quality with BLEU against professional translations, and flag low-quality translations for human review.
Real-World Code
from collections import Counter
import math

def simple_bleu(ref: str, hyp: str) -> float:
    """BLEU-2 proxy: brevity penalty times the geometric mean of 1- and 2-gram precision."""
    ref_words = ref.lower().split()
    hyp_words = hyp.lower().split()
    if not hyp_words:
        return 0.0
    penalty = min(1.0, math.exp(1 - len(ref_words) / len(hyp_words)))
    precisions = []
    for n in (1, 2):
        ref_grams = Counter(tuple(ref_words[i:i+n]) for i in range(len(ref_words) - n + 1))
        hyp_grams = Counter(tuple(hyp_words[i:i+n]) for i in range(len(hyp_words) - n + 1))
        overlap = sum(min(count, ref_grams[g]) for g, count in hyp_grams.items())
        precisions.append(overlap / (sum(hyp_grams.values()) or 1))
    mean_log = sum(math.log(p) if p > 0 else -9 for p in precisions) / 2
    return penalty * math.exp(mean_log)

# Simulate auto-translations and quality check
pairs = [
    ('Machine learning improves efficiency', 'L apprentissage automatique ameliore l efficacite'),
    ('High quality product at low price', 'Produit cher mauvais qualite'),  # bad
    ('Fast delivery guaranteed', 'Livraison rapide garantie'),
]
THRESHOLD = 0.35
for en, fr in pairs:
    score = simple_bleu(en, fr)
    flag = 'REVIEW' if score < THRESHOLD else 'OK'
    print(f'[{flag}] BLEU={score:.3f} | {fr[:50]}')
🏋️ Practice: BLEU Score Evaluator
Implement a BLEU-1 and BLEU-2 evaluator. Given a list of (reference, hypothesis) pairs, compute per-pair BLEU scores and the corpus-level BLEU (average). Flag translations below 0.4 for human review. Test with at least 3 sentence pairs.
Starter Code
# Practice starter: BLEU-1/BLEU-2 evaluation with review flagging.
# The TODOs are intentionally left for the learner to complete.
from collections import Counter
import math

def bleu_n(ref: str, hyp: str, n: int) -> float:
    """Practice stub: clipped n-gram precision of *hyp* against *ref*."""
    # TODO: compute BLEU-n precision with clipping
    pass

def evaluate_translations(pairs):
    """Practice stub: score (reference, hypothesis) pairs and flag low scores."""
    # pairs: list of (reference, hypothesis) tuples
    # TODO: compute BLEU-1 and BLEU-2 for each pair
    # TODO: flag pairs below threshold 0.4
    # TODO: print results and corpus average
    pass

test_pairs = [
    ('The cat sat on the mat', 'The cat is on the mat'),
    ('Hello world how are you', 'Hi earth what is up'),
    ('Data science is exciting', 'Data science is fascinating and rewarding'),
]
evaluate_translations(test_pairs)
✅ Practice Checklist
13. Document Search & RAG

Build retrieval systems using TF-IDF, BM25, and dense vector search. Implement a basic Retrieval-Augmented Generation pipeline combining a retriever with a language model.

TF-IDF retrieval system
# Keyword search over a small corpus: TF-IDF vectors ranked by cosine similarity.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

corpus = [
    'Python is a versatile programming language for data science and web development.',
    'Machine learning models require large amounts of training data.',
    'Neural networks are inspired by the structure of the human brain.',
    'Natural language processing enables computers to understand human text.',
    'Deep learning achieves state-of-the-art results on image classification tasks.',
    'Reinforcement learning trains agents through rewards and penalties.',
    'Transfer learning fine-tunes pre-trained models on new tasks.',
    'Transformers use self-attention to process sequences in parallel.',
]

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(corpus)

def search(query: str, top_k: int = 3) -> list:
    """Return the *top_k* (document snippet, similarity score) pairs for *query*."""
    q_vec = vectorizer.transform([query])
    sims  = cosine_similarity(q_vec, tfidf_matrix).flatten()
    top   = np.argsort(sims)[::-1][:top_k]  # best-matching doc indices, descending
    return [(corpus[i][:70], round(sims[i], 4)) for i in top]

for q in ['how do neural networks learn', 'NLP text processing', 'image recognition']:
    print(f'Query: {q}')
    for doc, score in search(q):
        print(f'  [{score:.4f}] {doc}')
    print()
BM25 retrieval from scratch
import numpy as np
from collections import Counter
import math

class BM25:
    """Okapi BM25 ranking over a whitespace-tokenized, lowercased corpus."""

    def __init__(self, corpus, k1=1.5, b=0.75):
        # Tokenize each document once; build a term -> document-frequency map.
        self.corpus = [doc.lower().split() for doc in corpus]
        self.k1, self.b = k1, b
        self.n = len(self.corpus)
        self.avgdl = np.mean([len(d) for d in self.corpus])
        self.df = {}
        for tokens in self.corpus:
            for term in set(tokens):
                self.df[term] = self.df.get(term, 0) + 1

    def score(self, query: str, doc_id: int) -> float:
        """BM25 score of document doc_id against the query string."""
        doc = self.corpus[doc_id]
        counts = Counter(doc)
        # Length normalization factor is constant per document.
        length_norm = 1 - self.b + self.b * len(doc) / self.avgdl
        total = 0.0
        for term in query.lower().split():
            df = self.df.get(term)
            if df is None:
                continue  # term never seen in the corpus
            idf = math.log((self.n - df + 0.5) / (df + 0.5) + 1)
            freq = counts.get(term, 0)
            total += idf * freq * (self.k1 + 1) / (freq + self.k1 * length_norm)
        return total

    def retrieve(self, query: str, top_k: int = 3):
        """Return [(doc_id, score)] for the top_k highest-scoring documents."""
        ranked = sorted(((i, self.score(query, i)) for i in range(self.n)),
                        key=lambda pair: -pair[1])
        return ranked[:top_k]

# Small demo corpus for the BM25 class defined above.
docs = ['Python machine learning tutorial', 'Deep neural network architectures', 'Python web scraping guide', 'Transformer models for NLP', 'Data science with Python pandas']
bm25 = BM25(docs)
print('BM25 results for "Python NLP":')
for doc_id, doc_score in bm25.retrieve('Python NLP'):
    print(f'  [{doc_score:.3f}] {docs[doc_id]}')
Dense vector search with sentence embeddings
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Simulate sentence embeddings (in practice: use SentenceTransformer)
np.random.seed(42)  # fixed seed so the demo numbers are reproducible

docs = [
    'How to train a neural network',
    'Python list comprehension tutorial',
    'Best practices for REST API design',
    'Introduction to gradient descent optimization',
    'SQL window functions explained',
    'Backpropagation algorithm explained',
]

# Simulate embeddings (normally: model.encode(docs))
# Make 'neural network' and 'gradient descent' semantically similar
EMB_DIM = 16
base_embs = np.random.randn(len(docs), EMB_DIM)
# Make neural network docs cluster together
for i in [0, 3, 5]:
    base_embs[i] += np.array([2]*4 + [0]*12)  # shared direction
base_embs /= np.linalg.norm(base_embs, axis=1, keepdims=True)  # unit-normalize rows

def dense_search(query_emb, doc_embs, top_k=3):
    """Return top_k (doc, cosine similarity) pairs for an embedded query vector."""
    sims = cosine_similarity(query_emb.reshape(1,-1), doc_embs).flatten()
    top  = np.argsort(sims)[::-1][:top_k]
    return [(docs[i], sims[i]) for i in top]

# Query embedding (similar to NN docs)
query_emb = base_embs[0] + np.random.randn(EMB_DIM) * 0.1  # doc 0 plus small noise
query_emb /= np.linalg.norm(query_emb)
print('Dense search results for [neural network query]:')
for doc, sim in dense_search(query_emb, base_embs):
    print(f'  [{sim:.3f}] {doc}')
Minimal RAG pipeline (retrieve + generate)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Knowledge base
knowledge_base = [
    'Python was created by Guido van Rossum in 1991.',
    'NumPy provides N-dimensional array support and math functions.',
    'Pandas is built on NumPy and provides DataFrame structures for data analysis.',
    'Scikit-learn offers machine learning algorithms for classification, regression, and clustering.',
    'Matplotlib is the most popular Python plotting library.',
    'TensorFlow and PyTorch are the two leading deep learning frameworks.',
]

vect = TfidfVectorizer(stop_words='english')
kb_matrix = vect.fit_transform(knowledge_base)

def retrieve(query: str, top_k: int = 2) -> list:
    """Return the top_k knowledge-base sentences most similar to query."""
    scores = cosine_similarity(vect.transform([query]), kb_matrix).flatten()
    best = np.argsort(scores)[::-1][:top_k]
    return [knowledge_base[idx] for idx in best]

def rag_answer(query: str) -> str:
    """Minimal RAG: retrieve context, then format answer."""
    context_str = ' '.join(retrieve(query))
    # In real RAG: pass context + query to LLM (e.g. Claude/GPT)
    # Here: template-based answer simulation
    return f'Based on retrieved context: {context_str[:120]}...'

queries = ['What is pandas?', 'Who created Python?', 'deep learning frameworks']
for q in queries:
    print(f'Q: {q}')
    print(f'Retrieved: {retrieve(q)[0][:60]}')
    print()
💼 Real-World Scenario
Build an internal company knowledge base search that answers employee questions by retrieving relevant policy documents, FAQs, and procedure guides, then generating a concise answer using the retrieved context.
Real-World Code
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Company policy documents
documents = {
    'vacation':   'Employees are entitled to 20 days of paid vacation per year. Unused days can be carried over to the next year up to 5 days.',
    'remote':     'Remote work is allowed up to 3 days per week. Core hours are 10am-3pm in the employee home timezone.',
    'expenses':   'Business expenses must be submitted within 30 days with receipts. Meals are reimbursed up to $50 per day.',
    'equipment':  'New employees receive a MacBook Pro and $500 equipment budget. Replacements require manager approval.',
    'onboarding': 'New employees complete a 2-week onboarding program including security training and team introductions.',
}

doc_texts = list(documents.values())
doc_keys  = list(documents.keys())
vect = TfidfVectorizer(stop_words='english')
matrix = vect.fit_transform(doc_texts)

def answer_question(query: str, top_k: int = 2) -> str:
    """Retrieve the top_k most relevant policy snippets and format them as context."""
    scores = cosine_similarity(vect.transform([query]), matrix).flatten()
    best = np.argsort(scores)[::-1][:top_k]
    combined = ' '.join(doc_texts[i] for i in best)
    return f'[Context: {combined[:200]}...]'

questions = ['How many vacation days do I get?', 'Can I work from home?', 'expense reimbursement policy']
for q in questions:
    print(f'Q: {q}')
    print(f'A: {answer_question(q)[:120]}')
    print()
🏋️ Practice: FAQ Chatbot with TF-IDF Retrieval
Build a simple FAQ chatbot. Given a list of (question, answer) pairs as your knowledge base, retrieve the most similar FAQ question to user input using TF-IDF + cosine similarity, and return its answer. Test with at least 5 FAQ entries and 3 user queries. Report similarity scores.
Starter Code
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# (question, answer) pairs forming the chatbot's knowledge base
faqs = [
    ('What are your business hours?', 'We are open Monday to Friday, 9am to 6pm EST.'),
    ('How do I reset my password?', 'Click Forgot Password on the login page and follow the email instructions.'),
    ('What payment methods do you accept?', 'We accept Visa, Mastercard, PayPal, and bank transfers.'),
    ('How long does shipping take?', 'Standard shipping takes 5-7 business days. Express ships in 2 days.'),
    ('Can I return a product?', 'Yes, returns are accepted within 30 days with original packaging.'),
]

# TODO: Fit TfidfVectorizer on FAQ questions
# TODO: For each user query, find most similar FAQ and return answer
# TODO: Print query, matched question, similarity score, and answer

# Deliberately phrased differently from the FAQ questions to test semantic matching
user_queries = ['office hours', 'forgot my login', 'how to send back item']
# TODO: process each query
✅ Practice Checklist
14. Transformer Models & Pre-trained Pipelines

Tokenization & Subword Encoding with HuggingFace
from transformers import AutoTokenizer
# Subword (WordPiece-style) tokenizer paired with the distilbert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
texts = [
    "The stock market crashed on Monday.",
    "Transformers revolutionized NLP in 2017!"
]
# Pad/truncate to a shared length and return PyTorch tensors
encoded = tokenizer(texts, padding=True, truncation=True, max_length=32, return_tensors="pt")
print("Input IDs shape:", encoded["input_ids"].shape)
for i, text in enumerate(texts):
    tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][i])
    tokens = [t for t in tokens if t != "[PAD]"]  # hide padding tokens for display
    print(f"\nText {i+1}: {tokens}")
print("\nVocab size:", tokenizer.vocab_size)
Sentiment Classification with Pre-trained BERT
from transformers import pipeline
# SST-2 fine-tuned DistilBERT sentiment classifier
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
reviews = [
    "This product exceeded all my expectations! Absolutely fantastic.",
    "Terrible quality. Broke after one day. Very disappointed.",
    "It is okay, nothing special but gets the job done.",
]
for review, prediction in zip(reviews, classifier(reviews)):
    print(f"[{prediction['label']} {prediction['score']:.3f}] {review[:50]}...")
Zero-Shot Classification
from transformers import pipeline
# NLI-based zero-shot classifier: no task-specific training required
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")
text = "The Federal Reserve raised interest rates by 25 basis points today."
candidate_labels = ["finance", "sports", "technology", "politics", "health"]
result = classifier(text, candidate_labels)
print("Text:", text[:70])
print("\nClassification scores:")
# Labels come back already sorted by descending score.
for label, score in zip(result["labels"], result["scores"]):
    print(f"  {label:<12} {score:.4f}  {'#' * int(score * 30)}")
💼 Real-World Scenario
Customer support triage: classify incoming support tickets into departments (billing, technical, returns, general) using zero-shot classification without any labeled training data.
Real-World Code
from transformers import pipeline
# Zero-shot ticket router
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")
tickets = [
    "My credit card was charged twice for the same order.",
    "The app keeps crashing whenever I try to open settings.",
    "I want to return the shoes I bought last week. They don't fit.",
    "When will my order arrive? It's been two weeks.",
    "I forgot my password and the reset link doesn't work.",
]
departments = ["billing", "technical support", "returns & refunds", "shipping", "account access"]
print("Support Ticket Routing")
print("=" * 60)
for ticket in tickets:
    # multi_label=False: scores form a distribution over departments (single best route)
    result = classifier(ticket, departments, multi_label=False)
    top_dept = result["labels"][0]    # labels are returned sorted by score
    top_score = result["scores"][0]
    print(f"Ticket: {ticket[:55]}...")
    print(f"  -> {top_dept} ({top_score:.3f})")
    print()
🏋️ Practice: News Article Classifier
Use zero-shot classification with 6 news categories (politics, sports, technology, science, entertainment, business). Classify 5 different news headlines. Then use a pre-trained sentiment pipeline on the same headlines and combine both outputs into a structured report showing category + sentiment for each article.
Starter Code
from transformers import pipeline
# News headlines to classify
headlines = [
    "SpaceX successfully lands reusable rocket for 20th time.",
    "Champions League final set as Real Madrid beats Bayern Munich.",
    "Senate votes to pass new climate legislation bill.",
    "Apple unveils new M4 chip with enhanced neural processing.",
    "GDP growth slows to 1.2% amid rising inflation concerns.",
]
# Candidate labels for zero-shot classification (no labeled training data needed)
categories = ["politics", "sports", "technology", "science", "entertainment", "business"]
# TODO: Zero-shot classify each headline into categories
# TODO: Run sentiment analysis on each headline
# TODO: Print formatted table: headline | category | sentiment | scores
✅ Practice Checklist
15. Named Entity Recognition & Information Extraction

spaCy NER Pipeline
import spacy
# Small English pipeline with a statistical NER component
nlp = spacy.load("en_core_web_sm")
texts = [
    "Apple Inc. CEO Tim Cook announced new products at WWDC in San Francisco.",
    "On March 14, 2023, the Fed raised rates by 25bps, affecting $4.5T in bonds.",
]
for sample in texts:
    doc = nlp(sample)
    print(f"Text: {sample[:65]}...")
    print("Entities:")
    for ent in doc.ents:
        print(f"  [{ent.label_:<10}] '{ent.text}'")
    print()
Custom NER with spaCy Patterns
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Match product codes like "SKU-12345" or "PROD-ABC99"
pattern = [{"TEXT": {"REGEX": r"(SKU|PROD|ITEM)-[A-Z0-9]{3,8}"}}]
matcher.add("PRODUCT_CODE", [pattern])
texts = [
    "Customer ordered SKU-48291 and PROD-XR99 but ITEM-ZZ001 was out of stock.",
    "Return request for SKU-11100 received from warehouse.",
]
for text in texts:
    doc = nlp(text)
    matches = matcher(doc)  # list of (match_id, start_token, end_token) tuples
    codes = [doc[start:end].text for _, start, end in matches]
    print(f"Text: {text}")
    print(f"Product codes found: {codes}\n")
Relation Extraction with Dependency Parsing
import spacy
nlp = spacy.load("en_core_web_sm")
text = "Elon Musk founded SpaceX in 2002. Jeff Bezos started Amazon in 1994."
doc = nlp(text)
print("Subject-Verb-Object triples:")
for sent in doc.sents:
    for token in sent:
        # The ROOT of the dependency tree is typically the main verb of the sentence
        if token.dep_ == "ROOT":
            subj = [c.text for c in token.children if c.dep_ in ("nsubj","nsubjpass")]
            obj  = [c.text for c in token.children if c.dep_ in ("dobj","attr","pobj")]
            # Only emit a triple when both a subject and an object are attached
            if subj and obj:
                print(f"  ({subj[0]}) --[{token.text}]--> ({obj[0]})")
print("\nNamed Entities:")
for ent in doc.ents:
    print(f"  {ent.text:<15} [{ent.label_}]")
💼 Real-World Scenario
Legal contract analysis: extract parties, dates, monetary amounts, and obligations from contract text to populate a structured database automatically.
Real-World Code
# Rule-assisted contract analysis: NER collects parties/dates/amounts,
# a keyword scan flags obligation sentences.
import spacy
import re
nlp = spacy.load("en_core_web_sm")
contract_text = """
This Service Agreement is entered into on January 15, 2024, between
Acme Corporation, a Delaware company ("Client"), and TechSolutions LLC,
a California limited liability company ("Provider"). Client agrees to pay
Provider $12,500 per month for software development services. The agreement
terminates on December 31, 2024. Acme Corporation is headquartered in
New York, NY. Either party may terminate with 30 days written notice.
"""
doc = nlp(contract_text)
# Extract entities by type
parties = []
dates = []
money = []
for ent in doc.ents:
    if ent.label_ == "ORG":
        parties.append(ent.text)
    elif ent.label_ == "DATE":
        dates.append(ent.text)
    elif ent.label_ == "MONEY":
        money.append(ent.text)
print("Contract Extraction Report")
print(f"Parties:  {list(set(parties))}")  # set() dedupes; NOTE: ordering is arbitrary
print(f"Dates:    {dates}")
print(f"Amounts:  {money}")
# Extract obligations with regex on sentence level
for sent in doc.sents:
    if any(w in sent.text.lower() for w in ["agrees", "shall", "must", "terminates"]):
        print(f"Obligation: {sent.text.strip()[:80]}")
🏋️ Practice: Resume Information Extractor
Given a sample resume text (3-4 sentences), use spaCy to extract: person name (PERSON), organizations (ORG), job titles (using custom Matcher patterns for 'Senior Engineer', 'Data Scientist', etc.), years of experience (CARDINAL + 'years'), and skills (custom pattern for capitalized tech terms). Output a structured JSON-like summary.
Starter Code
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
# Sample resume text for the extraction exercise
resume = """
John Smith is a Senior Data Scientist with 8 years of experience at Google and Microsoft.
He specializes in Python, TensorFlow, and SQL. Previously, he was a Machine Learning Engineer
at Amazon, where he led a team of 5 researchers. He holds a PhD from MIT in Computer Science.
"""
matcher = Matcher(nlp.vocab)
# TODO: Pattern for job titles (e.g., "Senior Data Scientist", "Machine Learning Engineer")
# TODO: Pattern for tech skills (capitalized 1-3 word terms)
# TODO: Extract PERSON, ORG, DATE, CARDINAL entities
# TODO: Output structured dict: name, companies, titles, skills, experience_years
✅ Practice Checklist
16. Text Generation, Summarization & Prompt Engineering

Text Generation with GPT-2
from transformers import pipeline
# max_new_tokens caps how much text is generated beyond the prompt
generator = pipeline("text-generation", model="gpt2", max_new_tokens=60)
prompts = [
    "The future of artificial intelligence is",
    "In 2035, data scientists will",
]
for prompt in prompts:
    # do_sample=True with temperature=0.8 yields varied (non-greedy) continuations
    outputs = generator(prompt, num_return_sequences=2, temperature=0.8, do_sample=True)
    print(f"Prompt: {prompt}")
    for i, out in enumerate(outputs, 1):
        generated = out["generated_text"][len(prompt):]  # strip the echoed prompt
        print(f"  [{i}] ...{generated[:80]}")
    print()
Structured Prompting & Prompt Engineering
# Demonstrates prompt engineering patterns (no API key needed — shows templates)
import json

def build_extraction_prompt(text, fields):
    """Build a JSON-extraction prompt that lists the requested field names."""
    quoted = ", ".join('"' + name + '"' for name in fields)
    return f"""Extract the following fields from the text below.
Return ONLY valid JSON with keys: {quoted}.
If a field is not found, use null.

Text: {text}

JSON output:"""

# Each example pairs a source text with the fields to pull out of it.
examples = [
    ("Order #4521 placed by Sarah Johnson on 2024-03-15 for $289.99. Ships to Chicago, IL.",
     ["order_id", "customer_name", "date", "amount", "city"]),
    ("Meeting scheduled with Dr. Patel at Boston General Hospital on Tuesday at 2pm.",
     ["person", "organization", "day", "time"]),
]
for sample_text, wanted_fields in examples:
    prompt = build_extraction_prompt(sample_text, wanted_fields)
    print("=== Prompt Template ===")
    print(prompt[:200])
    print("...")
    print()
Summarization with BART
from transformers import pipeline
# CNN/DailyMail fine-tuned BART; deterministic (no sampling) summaries
summarizer = pipeline("summarization", model="facebook/bart-large-cnn",
                      max_length=60, min_length=20, do_sample=False)
article = """
Scientists at MIT have developed a new AI system capable of predicting protein
folding structures with greater accuracy than any previous model. The breakthrough,
published in Nature, combines deep learning with molecular dynamics simulations.
Researchers tested the system on over 10,000 known protein structures and achieved
98.5% accuracy. This development could accelerate drug discovery by enabling
researchers to design proteins that target specific disease pathways. The team plans
to make the model open-source within the next six months.
"""
summary = summarizer(article.strip())[0]["summary_text"]
print(f"Original: {len(article.split())} words")
print(f"Summary ({len(summary.split())} words):")
print(summary)
💼 Real-World Scenario
Content moderation pipeline: automatically detect and summarize problematic content, classify severity, and route to appropriate human reviewer with context.
Real-World Code
from transformers import pipeline
# Multi-stage NLP pipeline: classify -> summarize -> route
classifier  = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
zero_shot   = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
posts = [
    "This is an amazing product! I love the design and performance.",
    "I hate this company. They stole my money and won't respond.",
    "How do I reset my password? I can't log in to my account.",
    "WARNING: This is a scam. Do NOT buy from this seller!!",
]
severity_labels = ["urgent - requires immediate review", "moderate - review within 24h", "low - can be auto-resolved"]
print("Content Moderation Pipeline")
print("=" * 60)
for post in posts:
    sentiment = classifier(post)[0]  # dict with 'label' and 'score'
    severity  = zero_shot(post, severity_labels)["labels"][0]  # highest-scoring label first
    print(f"Post: {post[:55]}...")
    print(f"  Sentiment: {sentiment['label']} ({sentiment['score']:.2f})")
    print(f"  Severity:  {severity}")
    print()
🏋️ Practice: Multi-Document Summarization
Summarize 3 different news articles (each 100+ words, on the same topic) using BART. Then concatenate the summaries and summarize again to create a 'meta-summary'. Compare word counts at each stage and compute the compression ratio. Also extract key noun phrases from the meta-summary using spaCy.
Starter Code
from transformers import pipeline
import spacy
# spaCy pipeline used later for noun-chunk extraction
nlp_sp = spacy.load("en_core_web_sm")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn",
                      max_length=80, min_length=30, do_sample=False)
# FIX: the original literals ended with four quotes ("""") — the stray
# fourth quote opened an unterminated string and made this a SyntaxError.
article1 = """[Article 1: 100+ words on climate change - fill in]"""
article2 = """[Article 2: 100+ words on climate policy - fill in]"""
article3 = """[Article 3: 100+ words on renewable energy - fill in]"""
articles = [article1, article2, article3]
# TODO: Summarize each article individually
# TODO: Concatenate summaries and create meta-summary
# TODO: Compute compression ratios at each stage
# TODO: Extract noun chunks from meta-summary with spaCy
✅ Practice Checklist
17. Named Entity Recognition (NER)

NER identifies and classifies named entities (persons, organizations, locations, dates) in text using spaCy or Transformers.

spaCy NER
import spacy
from collections import Counter

# Load spaCy model (run: python -m spacy download en_core_web_sm)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Model weights not downloaded; fall back to the simulated output below
    print("Run: python -m spacy download en_core_web_sm")
    nlp = None

text = (
    "Apple Inc. CEO Tim Cook announced in San Francisco on January 15, 2024 "
    "that the company would invest $1 billion in AI research. "
    "The partnership with OpenAI and Microsoft was confirmed by Google."
)

if nlp:
    doc = nlp(text)
    print("Named Entities:")
    for ent in doc.ents:
        # spacy.explain maps a label code (e.g. "GPE") to a readable description
        print(f"  {ent.text:<30} [{ent.label_}] - {spacy.explain(ent.label_)}")

    # Count entity types
    type_counts = Counter(ent.label_ for ent in doc.ents)
    print("\nEntity type counts:", dict(type_counts))
else:
    # Simulate output structure
    entities = [
        ("Apple Inc.", "ORG", "Companies"), ("Tim Cook", "PERSON", "People"),
        ("San Francisco", "GPE", "Geo-political"), ("January 15, 2024", "DATE", "Dates"),
        ("$1 billion", "MONEY", "Monetary"), ("OpenAI", "ORG", "Companies"),
        ("Microsoft", "ORG", "Companies"), ("Google", "ORG", "Companies"),
    ]
    for text_ent, label, explain in entities:
        print(f"  {text_ent:<30} [{label}] - {explain}")
Custom NER with spaCy Ruler
import spacy
from spacy.language import Language

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    nlp = spacy.blank("en")

# Add EntityRuler for custom entities.
# FIX: the original passed before="ner" if present else before="last";
# "last" is not a component name, so add_pipe raised a ValueError on a
# blank pipeline. Insert before "ner" when it exists, otherwise append
# (add_pipe's default position is the end of the pipeline).
if "ner" in nlp.pipe_names:
    ruler = nlp.add_pipe("entity_ruler", before="ner")
else:
    ruler = nlp.add_pipe("entity_ruler")

# Define custom patterns
patterns = [
    {"label": "ML_MODEL", "pattern": "BERT"},
    {"label": "ML_MODEL", "pattern": "GPT-4"},
    {"label": "ML_MODEL", "pattern": "ResNet"},
    {"label": "ML_TASK", "pattern": "named entity recognition"},
    {"label": "ML_TASK", "pattern": "sentiment analysis"},
    {"label": "DATASET", "pattern": [{"LOWER": "imagenet"}]},
    {"label": "DATASET", "pattern": "CIFAR-10"},
]
ruler.add_patterns(patterns)

test_texts = [
    "BERT and GPT-4 are popular ML models for named entity recognition.",
    "ResNet was trained on ImageNet and CIFAR-10 datasets.",
    "sentiment analysis using BERT achieves state-of-the-art results.",
]

for text in test_texts:
    doc = nlp(text)
    ents = [(e.text, e.label_) for e in doc.ents]
    print(f"Text: {text[:50]}...")
    print(f"  Entities: {ents}\n")
💼 Real-World Scenario
Your company processes thousands of customer support tickets daily. You need to extract product names, error codes, and customer IDs to route tickets automatically.
Real-World Code
import re
from collections import defaultdict

# Rule-based NER for support tickets (when spaCy not available).
# FIX: groups are now non-capturing (?:...). With capturing groups,
# re.findall returns only the group contents, so TICKET_ID yielded
# "TKT"/"TICKET" instead of the full id and VERSION yielded ".1"/"".
PATTERNS = {
    "PRODUCT": [r"\b(?:Model-[A-Z]\d+|Product-\w+|SKU-\d+)\b"],
    "ERROR_CODE": [r"\bERR-?\d{3,5}\b", r"\bError\s+\d{3,5}\b"],
    "TICKET_ID": [r"\b(?:TKT|TICKET)-?\d{5,8}\b"],
    "VERSION": [r"\bv\d+\.\d+(?:\.\d+)?\b"],
}

def extract_entities(text):
    """Return {label: [matched substrings]} for every pattern that fires in text.

    Matching is case-insensitive; labels with no matches are omitted.
    """
    entities = defaultdict(list)
    for label, pats in PATTERNS.items():
        for pat in pats:
            # All groups are non-capturing, so findall returns full matches.
            matches = re.findall(pat, text, re.IGNORECASE)
            if matches:
                entities[label].extend(matches)
    return dict(entities)

tickets = [
    "TKT-123456: Customer reports ERR-4042 on Model-X9 running v2.3.1",
    "TICKET-99887: SKU-A2B3C4 throws Error 500 after upgrade to v3.0.0",
    "TKT00112233: Product-Premium shows ERR4001 and ERR4002 intermittently",
]

for ticket in tickets:
    ents = extract_entities(ticket)
    print(f"Ticket: {ticket}")
    print(f"  Entities: {ents}\n")
🏋️ Practice: Extract entities from text
Use re.findall with named patterns to extract PERSON, ORG, and DATE-like patterns from a text string.
Starter Code
import re
text = "John Smith joined Acme Corp on 2024-01-15 and Microsoft on 2024-03-20."
# ISO dates (YYYY-MM-DD) and two-word capitalized name-like spans
date_pattern = re.compile(r"\d{4}-\d{2}-\d{2}")
name_pattern = re.compile(r"[A-Z][a-z]+ [A-Z][a-z]+")
dates = date_pattern.findall(text)
names = name_pattern.findall(text)
print("Dates:", dates)
print("Names:", names)
18. Sentence Embeddings & Semantic Search

Sentence embeddings convert text to dense vectors capturing semantic meaning, enabling similarity search beyond keyword matching.

TF-IDF & Cosine Similarity
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Document corpus
corpus = [
    "machine learning algorithms for classification",
    "deep learning neural networks for image recognition",
    "natural language processing text classification",
    "computer vision object detection algorithms",
    "transformer models for text generation",
    "reinforcement learning reward optimization",
]

# Build TF-IDF matrix over unigrams and bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(corpus)
print(f"TF-IDF matrix: {tfidf_matrix.shape}")

# Semantic search function
def search(query, top_k=3):
    """Rank corpus documents against query by TF-IDF cosine similarity."""
    sims = cosine_similarity(vectorizer.transform([query]), tfidf_matrix)[0]
    ranked = np.argsort(sims)[::-1][:top_k]
    return [(corpus[i], round(float(sims[i]), 4)) for i in ranked]

for q in ["text classification with deep learning", "visual recognition algorithms"]:
    print(f"\nQuery: '{q}'")
    for doc, score in search(q):
        print(f"  [{score:.4f}] {doc}")
Word2Vec-Style Embeddings
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Create co-occurrence matrix (simplified Word2Vec concept)
sentences = [
    "the cat sat on the mat",
    "the dog lay on the rug",
    "cats and dogs are pets",
    "machine learning models learn patterns",
    "deep learning uses neural networks",
    "neural networks learn representations",
]

# Build co-occurrence proxy via SVD on count matrix
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(sentences)

# SVD to get dense embeddings (like word2vec); fixed random_state for reproducibility
svd = TruncatedSVD(n_components=8, random_state=42)
embeddings = svd.fit_transform(X)

print("Sentence embeddings shape:", embeddings.shape)

# Find similar sentences
def find_similar(idx, top_k=3):
    """Return top_k (sentence, similarity) pairs most similar to sentences[idx]."""
    sims = cosine_similarity([embeddings[idx]], embeddings)[0]
    sims[idx] = -1  # exclude self
    top = np.argsort(sims)[::-1][:top_k]
    return [(sentences[i], round(float(sims[i]), 4)) for i in top]

for i in [0, 3]:
    print(f"\nSimilar to: '{sentences[i]}'")
    for sent, sim in find_similar(i):
        print(f"  [{sim:.4f}] {sent}")
💼 Real-World Scenario
Your FAQ system returns irrelevant results because it uses keyword matching. You need semantic search that understands 'password reset' and 'forgot credentials' are similar.
Real-World Code
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# FAQ knowledge base
faqs = [
    ("How do I reset my password?", "Click Forgot Password on login page, enter email, check inbox."),
    ("How to change account email?", "Go to Settings > Account > Email and enter new address."),
    ("Why is my payment failing?", "Check card details, billing address, or try a different card."),
    ("How to cancel my subscription?", "Go to Settings > Billing > Cancel Subscription."),
    ("How do I download my invoice?", "Settings > Billing > Invoice History > Download PDF."),
    ("Account locked after failed logins?", "Wait 30 minutes or contact support@example.com."),
]

questions = [q for q, _ in faqs]
answers = [a for _, a in faqs]

# sublinear_tf dampens repeated terms; bigrams capture short phrases
vec = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)
faq_matrix = vec.fit_transform(questions)

def answer_query(user_query, threshold=0.1):
    """Answer with the best-matching FAQ, or a fallback when nothing is close enough."""
    sims = cosine_similarity(vec.transform([user_query]), faq_matrix)[0]
    winner = int(np.argmax(sims))
    if sims[winner] < threshold:
        return "I couldn't find a relevant answer. Please contact support."
    return f"[score={sims[winner]:.3f}] {answers[winner]}"

test_queries = [
    "forgot my credentials",
    "payment not working",
    "stop my plan",
    "get billing document",
]
for q in test_queries:
    print(f"Q: {q}")
    print(f"A: {answer_query(q)}\n")
🏋️ Practice: Build a semantic search function
Use TfidfVectorizer and cosine_similarity to find the most similar document to a query.
Starter Code
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

docs = ["python programming", "machine learning python", "deep learning neural nets"]
vec = TfidfVectorizer()
X = vec.fit_transform(docs)  # (3 docs, vocab) TF-IDF matrix
query = vec.transform(["python learning"])  # reuse the fitted vocabulary
scores = cosine_similarity(query, X)[0]
print("Best match:", docs[np.argmax(scores)])
19. Hugging Face Transformers

Hugging Face Transformers provides thousands of pretrained models for text classification, generation, Q&A, and more via a unified API.

Sentiment Analysis Pipeline
# pip install transformers torch
# Using Hugging Face pipeline (simulated output shown)
try:
    from transformers import pipeline

    # Zero-shot classification (no fine-tuning needed)
    classifier = pipeline("zero-shot-classification",
                          model="facebook/bart-large-mnli")

    texts = [
        "The product quality is excellent and delivery was fast!",
        "Terrible service, waited 3 weeks and got wrong item.",
        "The item is okay, nothing special but works as expected.",
    ]

    candidate_labels = ["positive", "negative", "neutral"]

    for text in texts:
        result = classifier(text, candidate_labels)
        top_label = result["labels"][0]   # labels come back sorted by score
        top_score = result["scores"][0]
        print(f"Text: {text[:50]}...")
        print(f"  -> {top_label} (score={top_score:.4f})")

except ImportError:
    # Simulate output structure (transformers not installed)
    results = [
        ("positive", 0.9823), ("negative", 0.9541), ("neutral", 0.7234)
    ]
    texts = ["Excellent product!", "Terrible service.", "Works okay."]
    for (text, (label, score)) in zip(texts, results):
        print(f"Text: {text}")
        print(f"  -> {label} (score={score:.4f})")
Token Classification & Feature Extraction
# Token classification and feature extraction patterns
try:
    from transformers import pipeline, AutoTokenizer, AutoModel
    import torch

    # NER pipeline; aggregation_strategy="simple" merges word-pieces into whole entities
    ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english",
                   aggregation_strategy="simple")
    text = "Apple CEO Tim Cook announced a deal with Microsoft in New York."
    entities = ner(text)
    for ent in entities:
        print(f"  {ent['word']:<20} {ent['entity_group']:<10} score={ent['score']:.4f}")

except ImportError:
    # Demonstrate the pipeline usage pattern
    import numpy as np
    print("Simulated NER output (install transformers for real results):")
    entities = [
        {"word": "Apple", "entity_group": "ORG", "score": 0.9987},
        {"word": "Tim Cook", "entity_group": "PER", "score": 0.9945},
        {"word": "Microsoft", "entity_group": "ORG", "score": 0.9978},
        {"word": "New York", "entity_group": "LOC", "score": 0.9923},
    ]
    for ent in entities:
        print(f"  {ent['word']:<20} {ent['entity_group']:<10} score={ent['score']:.4f}")

    # Simulate sentence embeddings
    print("\nSimulating sentence embeddings (mean pooling over tokens):")
    batch_size, seq_len, hidden = 2, 128, 768
    token_embeddings = np.random.randn(batch_size, seq_len, hidden)
    sentence_embeddings = token_embeddings.mean(axis=1)  # mean-pool across the seq axis
    print(f"  Input shape: {token_embeddings.shape}")
    print(f"  Sentence embedding shape: {sentence_embeddings.shape}")
💼 Real-World Scenario
Your customer feedback system processes 10,000 reviews daily. You need to classify sentiment, extract product aspects, and identify key topics without building models from scratch.
Real-World Code
# Using sklearn to simulate transformer-like text classification
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Simulated reviews dataset
reviews = [
    ("Great product, fast shipping, very satisfied!", "positive"),
    ("Amazing quality, exceeded expectations.", "positive"),
    ("Good value for money, would recommend.", "positive"),
    ("Works as described, happy with purchase.", "positive"),
    ("Terrible quality, broke after one day.", "negative"),
    ("Do not buy this, complete waste of money.", "negative"),
    ("Very disappointed, nothing like the description.", "negative"),
    ("Poor build quality, returned immediately.", "negative"),
    ("Item is okay, nothing special.", "neutral"),
    ("Average product, does the job.", "neutral"),
    ("Received the item, it works.", "neutral"),
    ("Product is fine, shipping was slow.", "neutral"),
]

texts, labels = zip(*reviews)
# Fixed random_state keeps the train/test split reproducible
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.33, random_state=42)

# Pipeline (simulates HF pipeline interface)
clf = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
    ("model", LogisticRegression(max_iter=1000, random_state=42)),
])
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))

# Batch prediction (like HF pipeline)
new_reviews = [
    "Absolutely love this product!",
    "Received damaged, very unhappy.",
    "It is what it is, does the job.",
]
for review in new_reviews:
    pred = clf.predict([review])[0]
    prob = max(clf.predict_proba([review])[0])  # confidence of the winning class
    print(f"  [{pred:>8}] ({prob:.3f}) {review}")
🏋️ Practice: Build a text classifier pipeline
Use sklearn Pipeline with TfidfVectorizer and LogisticRegression to classify text into categories.
Starter Code
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Four toy documents with binary sentiment labels.
texts = ["I love this", "I hate this", "This is great", "This is terrible"]
labels = ["pos", "neg", "pos", "neg"]

# TF-IDF features piped into a multinomial Naive Bayes model.
steps = [("tfidf", TfidfVectorizer()), ("nb", MultinomialNB())]
clf = Pipeline(steps)
clf.fit(texts, labels)
print(clf.predict(["This is amazing"]))
20. Topic Modeling with LDA

Latent Dirichlet Allocation (LDA) discovers hidden topics in a text corpus by modeling documents as mixtures of topics.

LDA with scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Sample corpus: ten short documents spanning ML, Python, finance, climate.
documents = [
    "machine learning neural networks deep learning artificial intelligence",
    "python programming data science numpy pandas matplotlib",
    "stock market trading investment portfolio risk management",
    "climate change global warming carbon emissions renewable energy",
    "machine learning algorithms random forest gradient boosting",
    "python web development flask django rest api",
    "investment strategy hedge fund returns portfolio optimization",
    "solar wind energy renewable green sustainability climate",
    "deep learning computer vision image classification convolutional",
    "data analysis pandas visualization matplotlib seaborn statistics",
]

# Bag-of-words counts: LDA models topics over raw term frequencies.
vectorizer = CountVectorizer(max_df=0.95, min_df=1, stop_words="english")
X = vectorizer.fit_transform(documents)
vocab = vectorizer.get_feature_names_out()

lda = LatentDirichletAllocation(n_components=3, random_state=42, max_iter=20)
lda.fit(X)

# Show the seven highest-weight words for each learned topic.
print("Discovered Topics:")
for topic_id, topic in enumerate(lda.components_):
    ranked = topic.argsort()[::-1][:7]
    top_words = [vocab[i] for i in ranked]
    print(f"  Topic {topic_id+1}: {', '.join(top_words)}")

# Per-document topic mixture; report the dominant topic for the first three docs.
doc_topics = lda.transform(X)
for i, doc in enumerate(documents[:3]):
    dominant = doc_topics[i].argmax() + 1
    print(f"\nDoc {i+1}: Topic {dominant} dominant ({doc_topics[i].max():.3f})")
    print(f"  '{doc[:50]}'")
Topic Coherence & NMF
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Non-negative Matrix Factorization (often better coherence than LDA)
news_snippets = [
    "president signed new economic policy tax reform bill congress",
    "federal reserve interest rates inflation monetary policy",
    "championship game football team playoffs season victory",
    "basketball nba draft player trade contract signed",
    "covid vaccine efficacy clinical trial approval fda",
    "hospital treatment patient therapy drug clinical",
    "election campaign voter poll candidate debate",
    "tech company IPO stock shares market valuation",
    "championship trophy league season playoffs basketball",
    "interest rate hike federal bank economic growth",
]

vectorizer = TfidfVectorizer(max_df=0.95, min_df=1, stop_words="english")
tfidf = vectorizer.fit_transform(news_snippets)
vocab = vectorizer.get_feature_names_out()

# Factorize TF-IDF into W (document-topic) and H (topic-word) matrices.
nmf = NMF(n_components=4, random_state=42)
W = nmf.fit_transform(tfidf)  # document-topic
H = nmf.components_           # topic-word

# Show the five highest-weight words per topic.
print("NMF Topics (typically more coherent):")
topic_names = ["Politics", "Economy", "Sports", "Health"]
for i, (weights, name) in enumerate(zip(H, topic_names)):
    top_words = [vocab[j] for j in weights.argsort()[::-1][:5]]
    print(f"  Topic {i+1} ({name}): {', '.join(top_words)}")

# Dominant topic per document
for i, doc in enumerate(news_snippets[:4]):
    dominant = W[i].argmax()
    print(f"\nDoc: '{doc[:45]}...'")
    print(f"  Dominant topic: {topic_names[dominant]} ({W[i].max():.3f})")
💼 Real-World Scenario
Your news aggregation platform has 100,000 articles with no labels. You need to automatically discover and tag content themes to power category browsing.
Real-World Code
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

# Unlabelled article corpus covering space, programming, sports, and AI.
articles = [
    "SpaceX launched new rocket to international space station moon mission",
    "NASA astronauts complete spacewalk satellite deployment orbital",
    "Python programming language machine learning framework scikit-learn",
    "JavaScript web development React frontend component library",
    "Olympic gold medal swimming athletics world record champion",
    "NBA basketball playoffs championship team finals victory",
    "AI language model chatbot GPT transformer neural network",
    "deep learning computer vision object detection classification",
    "marathon runner athletics world record broken championship",
    "rocket launch satellite orbit space exploration mission",
    "JavaScript TypeScript web app development framework React",
    "NBA finals championship basketball playoffs season",
]

# Compare LDA vs NMF: LDA works on raw counts, NMF on TF-IDF weights.
count_vec = CountVectorizer(max_df=0.95, min_df=1, stop_words="english")
tfidf_vec = TfidfVectorizer(max_df=0.95, min_df=1, stop_words="english")

X_count = count_vec.fit_transform(articles)
X_tfidf = tfidf_vec.fit_transform(articles)

n_topics = 3
# Each entry pairs a model with its fitted vectorizer AND the precomputed
# document-term matrix, so the loop below does not re-vectorize the corpus.
models = {
    "LDA": (LatentDirichletAllocation(n_components=n_topics, random_state=42), count_vec, X_count),
    "NMF": (NMF(n_components=n_topics, random_state=42), tfidf_vec, X_tfidf),
}

for model_name, (model, vec, X) in models.items():
    W = model.fit_transform(X)
    vocab = vec.get_feature_names_out()
    H = model.components_

    print(f"\n{model_name} Topics:")
    for i, row in enumerate(H):
        top_words = [vocab[j] for j in row.argsort()[:-5:-1]]
        print(f"  Topic {i+1}: {', '.join(top_words)}")

    # Hard-assign each article to its highest-weight topic.
    assignments = W.argmax(axis=1)
    for topic in range(n_topics):
        docs = [articles[j][:40] for j in range(len(articles)) if assignments[j] == topic]
        print(f"  Topic {topic+1} docs: {docs}")
🏋️ Practice: Discover topics in a corpus
Apply LDA with CountVectorizer to find 3 topics and print the top 5 words per topic.
Starter Code
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Six tiny documents covering three themes: pets, programming, markets.
docs = ["cats dogs pets animals", "python code programming", "market stocks trading",
        "dog cat pet animal friend", "code software developer", "stock market price"]
vec = CountVectorizer(stop_words="english")
X = vec.fit_transform(docs)
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(X)
vocab = vec.get_feature_names_out()
# Print the three highest-weight words per discovered topic.
for i, topic in enumerate(lda.components_):
    print(f"Topic {i+1}:", [vocab[j] for j in topic.argsort()[::-1][:3]])
21. Text Summarization

Text summarization condenses long documents into shorter summaries, using extractive (key sentence selection) or abstractive (generation) approaches.

Extractive Summarization (TF-IDF)
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer

def extractive_summarize(text, n_sentences=3):
    """Return the n_sentences highest TF-IDF-scoring sentences, in original order."""
    # Sentence boundaries: whitespace preceded by terminal punctuation.
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    if len(sentences) <= n_sentences:
        return text

    # Each sentence's importance is the sum of its TF-IDF term weights.
    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        tfidf = vectorizer.fit_transform(sentences)
    except ValueError:
        # Empty vocabulary (e.g. all stop words): fall back to leading sentences.
        return " ".join(sentences[:n_sentences])
    scores = np.asarray(tfidf.sum(axis=1)).ravel()

    # Keep the top-scoring sentences but restore document order.
    chosen = sorted(np.argsort(scores)[-n_sentences:])
    return " ".join(sentences[i] for i in chosen)

article = (
    "Machine learning is a subset of artificial intelligence that gives systems the ability "
    "to automatically learn and improve from experience. "
    "It focuses on developing computer programs that can access data and use it to learn for themselves. "
    "Deep learning is part of machine learning based on artificial neural networks. "
    "These networks have multiple layers and can learn representations of data with multiple levels of abstraction. "
    "Natural language processing enables computers to understand human language. "
    "Applications include chatbots, translation, and sentiment analysis. "
    "Computer vision allows machines to interpret and understand visual information. "
    "This includes image classification, object detection, and facial recognition. "
    "Reinforcement learning trains agents through reward and penalty signals."
)

summary = extractive_summarize(article, n_sentences=3)
original_words = len(article.split())
summary_words = len(summary.split())
print(f"Original: {original_words} words")
print(f"Summary:  {summary_words} words ({summary_words/original_words*100:.0f}% compression)")
print(f"\nSummary:\n{summary}")
TextRank Algorithm
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

def textrank_summarize(text, n_sentences=3, damping=0.85, iterations=50):
    """Summarize by running PageRank over the sentence-similarity graph."""
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    if len(sentences) <= n_sentences:
        return " ".join(sentences)

    # Edge weights: cosine similarity between sentence TF-IDF vectors.
    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        sim = cosine_similarity(vectorizer.fit_transform(sentences))
    except ValueError:
        # Empty vocabulary (e.g. all stop words): take the leading sentences.
        return " ".join(sentences[:n_sentences])

    np.fill_diagonal(sim, 0)

    # Row-normalize; the guard avoids dividing by zero for isolated sentences.
    totals = sim.sum(axis=1, keepdims=True)
    totals[totals == 0] = 1
    sim = sim / totals

    # Power iteration (PageRank-style) to estimate sentence centrality.
    n = len(sentences)
    scores = np.full(n, 1.0 / n)
    for _ in range(iterations):
        scores = (1 - damping) / n + damping * sim.T @ scores

    keep = sorted(np.argsort(scores)[-n_sentences:])
    return " ".join(sentences[i] for i in keep)

text = (
    "Python is a high-level programming language known for its simplicity. "
    "It supports multiple programming paradigms including procedural, object-oriented, and functional. "
    "Python is widely used in data science, machine learning, and web development. "
    "The language has a rich ecosystem of libraries like NumPy, Pandas, and TensorFlow. "
    "Its syntax is designed to be readable and concise, making it beginner-friendly. "
    "Python runs on all major platforms and has an active open-source community."
)

for n in [2, 3]:
    summary = textrank_summarize(text, n_sentences=n)
    ratio = len(summary.split()) / len(text.split()) * 100
    print(f"TextRank ({n} sentences, {ratio:.0f}% of original):")
    print(f"  {summary}\n")
💼 Real-World Scenario
Legal documents average 50 pages. Your system needs to auto-generate executive summaries for lawyers to review before the full read, cutting review time by 80%.
Real-World Code
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def hybrid_summarize(text, ratio=0.3):
    """Extractive summary combining TF-IDF weight, position, and diversity.

    Scores sentences by total TF-IDF mass damped by position (earlier
    sentences preferred), then greedily selects the top-ranked sentences
    while skipping any with cosine similarity >= 0.7 to one already chosen.

    Args:
        text: Document to summarize.
        ratio: Target fraction of sentences to keep (at least one).

    Returns:
        The selected sentences joined in original document order.
    """
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    n = max(1, int(len(sentences) * ratio))

    vec = TfidfVectorizer(stop_words="english")
    try:
        X = vec.fit_transform(sentences)
    except ValueError:
        # Empty vocabulary (blank or all-stopword input): best-effort fallback.
        return sentences[0] if sentences else ""

    # TF-IDF sentence scores: total term weight per sentence.
    tfidf_scores = np.array(X.sum(axis=1)).flatten()

    # Position scores: linear decay from 1.0 (first sentence) to 0.5 (last).
    pos_scores = np.linspace(1.0, 0.5, len(sentences))

    # Pairwise similarity, used to enforce diversity during selection.
    # (The old unused `diversity_scores` array was removed.)
    sim = cosine_similarity(X)

    final_scores = tfidf_scores * pos_scores
    ranked = np.argsort(final_scores)[::-1]

    selected = []
    for idx in ranked:
        if len(selected) >= n:
            break
        # Greedy diversity filter: skip near-duplicates of chosen sentences.
        if not selected or all(sim[idx][s] < 0.7 for s in selected):
            selected.append(idx)

    # Reassemble in document order for readability.
    return " ".join(sentences[i] for i in sorted(selected))

# Simulate a legal document excerpt
legal_text = (
    "This agreement is entered into between Party A and Party B on the date first written above. "
    "Party A agrees to provide software development services as described in Schedule A. "
    "Party B agrees to pay the fees outlined in Schedule B within 30 days of invoice. "
    "All intellectual property developed under this agreement shall belong to Party B. "
    "Party A warrants that services will be performed in a professional and workmanlike manner. "
    "This agreement shall be governed by the laws of the State of California. "
    "Either party may terminate this agreement with 30 days written notice. "
    "Confidentiality obligations shall survive termination for a period of 2 years. "
    "Any disputes shall be resolved through binding arbitration in San Francisco. "
    "This agreement constitutes the entire understanding between the parties."
)

summary = hybrid_summarize(legal_text, ratio=0.4)
print(f"Original: {len(legal_text.split())} words, {len(legal_text.split('. '))} sentences")
print(f"Summary:  {len(summary.split())} words")
print(f"\n{summary}")
🏋️ Practice: Implement extractive summarization
Write a function that scores sentences by word frequency and returns the top N sentences.
Starter Code
import re
from collections import Counter

def summarize(text, n=2):
    """Return the n sentences with the highest total word frequency, in order."""
    # Split on terminal punctuation and drop empty fragments.
    sents = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
    # Corpus-wide token frequencies (tokens keep any attached punctuation).
    freq = Counter(text.lower().split())
    ranked = sorted(
        range(len(sents)),
        key=lambda i: sum(freq[w.lower()] for w in sents[i].split()),
        reverse=True,
    )
    # Take the n best, then restore original sentence order.
    return ". ".join(sents[i] for i in sorted(ranked[:n]))

text = "Python is great. It is used in AI. AI is transforming industry. Python is simple."
print(summarize(text, 2))
22. Question Answering

QA systems find answers to questions within a context passage using span extraction, retrieval-augmented generation (RAG), or knowledge bases.

Extractive QA with TF-IDF
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class ExtractiveQA:
    """TF-IDF passage retriever: an answer is the indexed passage most similar
    to the question."""

    def __init__(self, passage_size=2):
        self.passage_size = passage_size
        # Unigram + bigram features give phrase queries some exact-match power.
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
        self.passages = []
        self.passage_matrix = None

    def index(self, text):
        """Split text into consecutive passages of passage_size sentences and vectorize."""
        sentences = re.split(r"(?<=[.!?])\s+", text.strip())
        chunks = []
        for start in range(0, len(sentences), self.passage_size):
            passage = " ".join(sentences[start:start + self.passage_size])
            if passage.strip():
                chunks.append(passage)
        self.passages = chunks
        self.passage_matrix = self.vectorizer.fit_transform(chunks)

    def answer(self, question, top_k=1):
        """Return the top_k (passage, score) pairs ranked by cosine similarity."""
        q_vec = self.vectorizer.transform([question])
        sims = cosine_similarity(q_vec, self.passage_matrix)[0]
        best = np.argsort(sims)[::-1][:top_k]
        return [(self.passages[i], round(float(sims[i]), 4)) for i in best]

context = (
    "Python was created by Guido van Rossum in 1991. "
    "The language emphasizes code readability and simplicity. "
    "Python 3.0 was released in 2008 with major changes from Python 2. "
    "NumPy was created by Travis Oliphant in 2005. "
    "Pandas was developed by Wes McKinney in 2008 for data manipulation. "
    "Scikit-learn was released in 2007 and provides machine learning tools. "
    "TensorFlow was developed by Google Brain team and released in 2015. "
    "PyTorch was released by Facebook AI Research in 2016."
)

# Index the context as two-sentence passages and answer a few questions.
qa = ExtractiveQA(passage_size=2)
qa.index(context)

questions = [
    "When was Python created?",
    "Who created pandas?",
    "When was TensorFlow released?",
]
for q in questions:
    best_passage, best_score = qa.answer(q)[0]
    print(f"Q: {q}")
    print(f"A: {best_passage} [score={best_score}]\n")
RAG-Style: Retrieve + Generate
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Simulated knowledge base (RAG knowledge store)
KB = {
    "python_history": "Python was created by Guido van Rossum, first released in 1991. Python 3 was released in 2008.",
    "ml_libraries": "Scikit-learn (2007), TensorFlow (2015), PyTorch (2016) are major ML libraries.",
    "numpy_info": "NumPy provides N-dimensional array objects. It was created by Travis Oliphant in 2005.",
    "pandas_info": "Pandas was created by Wes McKinney in 2008. It provides DataFrames for data manipulation.",
    "deep_learning": "Deep learning uses neural networks with many layers. CNNs are used for images, RNNs for sequences.",
    "transformers": "Transformers use attention mechanisms. BERT (2018) and GPT (2018) are key transformer models.",
}

# Build the TF-IDF retrieval index over the knowledge-base documents.
vectorizer = TfidfVectorizer(stop_words="english")
docs = list(KB.values())
keys = list(KB.keys())
index = vectorizer.fit_transform(docs)

def rag_answer(question, top_k=2):
    """Retrieve top_k KB docs, then extract the sentence closest to the question."""
    q_vec = vectorizer.transform([question])
    sims = cosine_similarity(q_vec, index)[0]
    best = np.argsort(sims)[::-1][:top_k]

    context = " ".join(docs[i] for i in best)
    retrieved_keys = [keys[i] for i in best]

    # Rank the retrieved sentences against the question (stand-in for generation).
    sents = re.split(r"(?<=[.])\s+", context)
    if not sents:
        return "No answer found."

    sent_scores = cosine_similarity(q_vec, vectorizer.transform(sents))[0]
    best_sent = sents[sent_scores.argmax()]

    return {"answer": best_sent, "sources": retrieved_keys, "context": context[:100]}

questions = ["Who created NumPy?", "What is deep learning?", "When was BERT released?"]
for q in questions:
    result = rag_answer(q)
    print(f"Q: {q}")
    print(f"  Answer: {result['answer']}")
    print(f"  Sources: {result['sources']}\n")
💼 Real-World Scenario
Your internal knowledge base has 5,000 policy documents. Employees waste hours searching for specific policy answers. Build a QA system to answer HR questions instantly.
Real-World Code
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# HR Policy knowledge base
hr_policies = {
    "vacation": "Employees receive 15 vacation days per year. Unused days roll over up to 5 days. Request via HR portal.",
    "sick_leave": "12 sick days per year. Doctor certificate required for absences over 3 consecutive days.",
    "remote_work": "Employees may work remotely up to 3 days per week with manager approval. Core hours: 10am-3pm.",
    "expense_claims": "Submit expenses within 30 days of incurrence. Receipts required for amounts over $25.",
    "performance_review": "Annual reviews in December. Mid-year check-ins in June. Ratings: Exceeds, Meets, Below expectations.",
    "parental_leave": "16 weeks paid parental leave for primary caregivers. 4 weeks for secondary caregivers.",
    "training_budget": "Each employee receives $1,500 annual training budget. Approval from manager required.",
    "overtime": "Overtime must be pre-approved. Compensated at 1.5x rate for hours over 40/week.",
}

keys = list(hr_policies.keys())
docs = list(hr_policies.values())

# Unigram + bigram TF-IDF index over the policy texts.
vec = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
index = vec.fit_transform(docs)

def hr_qa(question, threshold=0.05):
    """Return the best-matching policy text, or a fallback when confidence is low."""
    sims = cosine_similarity(vec.transform([question]), index)[0]
    top = sims.argmax()
    if sims[top] < threshold:
        # Refuse rather than answer with an unrelated policy.
        return "Policy not found. Please contact HR directly."
    policy_name = keys[top].replace("_", " ").title()
    return f"[{policy_name}] {docs[top]}"

questions = [
    "How many vacation days do I get?",
    "Can I work from home?",
    "How do I claim expenses?",
    "How much training budget do I have?",
    "What is the parental leave policy?",
]
for q in questions:
    print(f"Q: {q}")
    print(f"A: {hr_qa(q)}\n")
🏋️ Practice: Build a simple QA retriever
Index a list of passages with TF-IDF and return the most relevant passage for a question.
Starter Code
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Index three passages, then return the one most similar to the question.
passages = ["Python is easy to learn.", "NumPy is for arrays.", "Pandas is for data."]
vec = TfidfVectorizer()
X = vec.fit_transform(passages)
q = vec.transform(["What is NumPy?"])
scores = cosine_similarity(q, X)[0]
best = int(np.argmax(scores))
print("Answer:", passages[best])
23. Text Generation & Language Models

Text generation produces fluent text continuations, completions, or creative content using n-gram models, Markov chains, or pretrained LLMs.

N-gram Language Model
import random
from collections import defaultdict, Counter
import re

class NgramLM:
    """Count-based n-gram language model with weighted random sampling."""

    def __init__(self, n=2):
        self.n = n
        # context tuple of (n-1) tokens -> Counter of next-token frequencies
        self.ngrams = defaultdict(Counter)

    def train(self, texts):
        """Accumulate n-gram counts from an iterable of whitespace-tokenized texts."""
        pad = self.n - 1
        for text in texts:
            # <s> padding lets the first real token have a full-length context;
            # </s> marks where generation may stop.
            tokens = ["<s>"] * pad + text.lower().split() + ["</s>"]
            for i in range(len(tokens) - self.n + 1):
                history = tuple(tokens[i:i + pad])
                self.ngrams[history][tokens[i + pad]] += 1

    def generate(self, max_len=20, seed=None):
        """Sample up to max_len tokens; stops at </s> or an unseen context."""
        if seed:
            random.seed(seed)
        history = ("<s>",) * (self.n - 1)
        out = []
        for _ in range(max_len):
            if history not in self.ngrams:
                break
            counts = self.ngrams[history]
            # Frequency-weighted sampling over observed continuations.
            choice = random.choices(list(counts.keys()), weights=list(counts.values()))[0]
            if choice == "</s>":
                break
            out.append(choice)
            history = history[1:] + (choice,)
        return " ".join(out)

corpus = [
    "the cat sat on the mat and the dog sat on the rug",
    "machine learning models learn from data and improve over time",
    "deep learning uses neural networks with many layers",
    "natural language processing handles text and speech data",
    "data science involves statistics machine learning and programming",
]

# Train a trigram model and sample a few seeded (reproducible) continuations.
model = NgramLM(n=3)
model.train(corpus)
print("Generated text (trigram LM):")
for i in range(4):
    sample = model.generate(max_len=12, seed=i)
    print(f"  {i+1}: {sample}")
Markov Chain Text Generator
import random
from collections import defaultdict
import re

class MarkovChain:
    """Fixed-order Markov chain over word tokens (trailing punctuation kept)."""

    def __init__(self, order=2):
        self.order = order
        # state tuple of `order` words -> list of observed successors (with repeats)
        self.chain = defaultdict(list)
        self.starts = []

    def train(self, text):
        """Record transitions from one lowercased training text."""
        words = re.findall(r"\w+[.,!?]?", text.lower())
        if len(words) < self.order + 1:
            return  # too short to yield even a single transition
        self.starts.append(tuple(words[:self.order]))
        for i in range(len(words) - self.order):
            state = tuple(words[i:i + self.order])
            self.chain[state].append(words[i + self.order])

    def generate(self, n_words=30, seed=42):
        """Random-walk up to n_words; deterministic for a given seed."""
        random.seed(seed)
        if not self.starts:
            return ""
        state = random.choice(self.starts)
        output = list(state)
        for _ in range(n_words - self.order):
            if state not in self.chain:
                break
            output.append(random.choice(self.chain[state]))
            state = tuple(output[-self.order:])
        return " ".join(output).capitalize()

# Train an order-2 chain on a few sentences and sample seeded outputs.
mc = MarkovChain(order=2)
training_data = [
    "Data science combines statistics, machine learning, and domain expertise to extract insights from data.",
    "Machine learning models learn patterns from training data and generalize to new examples.",
    "Deep learning architectures with many layers can learn hierarchical representations.",
    "Natural language processing techniques enable machines to understand and generate human language.",
]
for sample in training_data:
    mc.train(sample)

print("Markov Chain Generated Text:")
for seed in [1, 2, 3]:
    print(f"  Seed {seed}: {mc.generate(n_words=20, seed=seed)}")
💼 Real-World Scenario
Your game studio needs procedurally generated quest descriptions and item names. You need a text generator that produces diverse, thematic text without a large model.
Real-World Code
import random
from collections import defaultdict
import re

# Template-based generator for game content (the Markov fields below are
# reserved for a future hybrid mode and are not used by this class yet).
class GameTextGenerator:
    """Fills quest/item templates with randomly chosen vocabulary terms."""

    def __init__(self, order=2):
        self.order = order
        self.chain = defaultdict(list)  # reserved; unused by generate_from_template
        self.starts = []                # reserved; unused by generate_from_template
        self.templates = {
            "quest": [
                "Retrieve the {item} from {location} and return to {npc}.",
                "Defeat the {enemy} that threatens {location}.",
                "Escort {npc} safely through {location} to {destination}.",
                "Discover the secrets of {location} by finding {item}.",
            ],
            "item": [
                "Ancient {adj} {noun} of {attribute}",
                "{adj} {noun} Forged in {location}",
                "The {npc}'s Sacred {noun}",
            ]
        }
        self.vocab = {
            "item": ["Sword", "Amulet", "Tome", "Crystal", "Shield", "Ring"],
            "location": ["Dark Forest", "Mountain Peak", "Sunken Temple", "Iron Citadel"],
            "npc": ["Elder Mage", "Village Chief", "Wandering Merchant", "Oracle"],
            "enemy": ["Shadow Drake", "Corrupted Knight", "Ancient Golem", "Bandit Lord"],
            "adj": ["Cursed", "Sacred", "Ancient", "Enchanted", "Forgotten"],
            "noun": ["Blade", "Tome", "Relic", "Seal", "Chalice"],
            "attribute": ["Fire", "Ice", "Lightning", "Void", "Light"],
            "destination": ["Capital City", "Hidden Sanctuary", "Mountain Fortress"],
        }

    def generate_from_template(self, template_type, seed=None):
        """Render one random template of template_type; a seed makes it reproducible."""
        if seed is not None:
            random.seed(seed)
        text = random.choice(self.templates[template_type])
        # Substitute each placeholder present with a random vocabulary entry.
        for slot, choices in self.vocab.items():
            token = "{" + slot + "}"
            if token in text:
                text = text.replace(token, random.choice(choices))
        return text

# Demo: four seeded quests and four seeded items (reproducible output).
random.seed(42)
gen = GameTextGenerator()
print("Generated Quests:")
for idx in range(4):
    print(f"  Quest {idx+1}: {gen.generate_from_template('quest', seed=idx)}")
print("\nGenerated Items:")
for idx in range(4):
    print(f"  Item {idx+1}: {gen.generate_from_template('item', seed=idx+10)}")
🏋️ Practice: Build a Markov chain text generator
Train a bigram Markov chain on sample text and generate 3 different sentences.
Starter Code
import random
from collections import defaultdict

# Build a word-level bigram chain: each word maps to its observed successors.
chain = defaultdict(list)
text = "the cat sat on the mat the cat ate the rat the rat ran away"
words = text.split()
for prev, nxt in zip(words, words[1:]):
    chain[prev].append(nxt)

# Random walk from "the"; the fixed seed makes the walk reproducible.
random.seed(42)
word = "the"
result = [word]
for _ in range(10):
    if word not in chain:
        break
    word = random.choice(chain[word])
    result.append(word)
print(" ".join(result))
24. NLP Pipeline & Production Deployment

A production NLP pipeline integrates preprocessing, vectorization, modeling, and post-processing into a reliable, scalable system.

End-to-End NLP Pipeline
import re
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer applying configurable text normalization."""

    def __init__(self, lowercase=True, remove_punct=True, remove_numbers=False):
        self.lowercase = lowercase
        self.remove_punct = remove_punct
        self.remove_numbers = remove_numbers

    def preprocess(self, text):
        """Normalize one string according to the configured flags."""
        if self.lowercase:
            text = text.lower()
        if self.remove_punct:
            # Replace punctuation with spaces so adjacent tokens don't fuse.
            text = re.sub(r"[^\w\s]", " ", text)
        if self.remove_numbers:
            text = re.sub(r"\d+", "", text)
        # Collapse whitespace runs and trim the ends.
        return re.sub(r"\s+", " ", text).strip()

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        return [self.preprocess(doc) for doc in X]

# Sample dataset
texts = [
    "Great product, fast shipping, very satisfied!",
    "Terrible quality, broke after one week.",
    "Average item, nothing special.",
    "Excellent! Exceeded all expectations!",
    "Disappointed with the purchase.",
    "Does the job, no complaints.",
    "Best purchase I have made this year!",
    "Waste of money, poor customer service.",
]
labels = ["pos", "neg", "neu", "pos", "neg", "neu", "pos", "neg"]

X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Clean -> vectorize -> classify, expressed as a single sklearn Pipeline.
pipeline = Pipeline([
    ("preprocessor", TextPreprocessor(lowercase=True, remove_punct=True)),
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
    ("classifier", LogisticRegression(C=1.0, max_iter=1000, random_state=42)),
])

pipeline.fit(X_train, y_train)
print("Test accuracy:", round(pipeline.score(X_test, y_test), 4))
print(classification_report(y_test, pipeline.predict(X_test), zero_division=0))

# New predictions
new_texts = ["Amazing value!", "Completely broken on arrival.", "It works."]
for sample, label in zip(new_texts, pipeline.predict(new_texts)):
    print(f"  [{label}] {sample}")
NLP Monitoring & Input Validation
import re
import numpy as np
from collections import deque
from datetime import datetime

class NLPProductionSystem:
    """Production wrapper around a text classifier.

    Validates and sanitizes incoming text, serves predictions, and keeps
    lightweight operational stats (recent-request log, prediction
    distribution, error rate) for health monitoring.
    """

    def __init__(self, model, vectorizer):
        self.model = model
        self.vectorizer = vectorizer
        # Bounded log: only the most recent 1000 successful requests.
        self.request_log = deque(maxlen=1000)
        self.prediction_counts = {}
        self.error_rate = 0.0
        # Explicit counters for the error rate. The request_log cannot be
        # used as the total: it only holds successes and is capped at 1000.
        self._total_requests = 0
        self._error_count = 0

    def validate_input(self, text):
        """Validate and sanitize one input string.

        Raises ValueError for non-strings, too-short, or too-long input;
        injection-like characters are replaced rather than rejected.
        """
        if not isinstance(text, str):
            raise ValueError("Input must be a string")
        if len(text.strip()) < 3:
            raise ValueError("Input too short (minimum 3 chars)")
        if len(text) > 10000:
            raise ValueError("Input too long (maximum 10000 chars)")
        # re.sub is a no-op when nothing matches, so no pre-check needed.
        text = re.sub(r"[<>{}|\\]", " ", text)
        return text.strip()

    def predict(self, text):
        """Classify `text`; returns a dict with status 'ok' or 'error'."""
        ts = datetime.now().isoformat()
        self._total_requests += 1
        try:
            clean_text = self.validate_input(text)
            features = self.vectorizer.transform([clean_text])
            pred = self.model.predict(features)[0]
            prob = self.model.predict_proba(features).max()

            # Log request
            log_entry = {"ts": ts, "text_len": len(clean_text), "pred": pred, "prob": round(float(prob), 4)}
            self.request_log.append(log_entry)

            # Track prediction distribution
            self.prediction_counts[pred] = self.prediction_counts.get(pred, 0) + 1
            self.error_rate = self._error_count / self._total_requests

            return {"prediction": pred, "confidence": round(float(prob), 4), "status": "ok"}

        except Exception as e:
            # BUG FIX: the old running-average formula used len(request_log)
            # (successful requests only, capped by the deque) as the total,
            # producing a wrong, drifting rate. Count errors explicitly.
            self._error_count += 1
            self.error_rate = self._error_count / self._total_requests
            return {"error": str(e), "status": "error"}

    def health_report(self):
        """Summarize operational stats for monitoring dashboards."""
        return {
            # All requests handled, not just the (capped) successful log.
            "total_requests": self._total_requests,
            "prediction_distribution": self.prediction_counts,
            "error_rate": round(self.error_rate, 4),
            "avg_text_length": round(np.mean([r["text_len"] for r in self.request_log]) if self.request_log else 0, 1)
        }

# Setup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Fit a tiny sentiment model to exercise the production wrapper.
texts = ["great product", "terrible quality", "okay item", "excellent service",
         "bad experience", "good value", "poor quality", "fantastic result"]
labels = ["pos","neg","neu","pos","neg","pos","neg","pos"]

vec = TfidfVectorizer(ngram_range=(1,2))
vec.fit(texts)

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(vec.transform(texts), labels)

system = NLPProductionSystem(model, vec)

# Mix of valid, empty, too-short, and ordinary inputs.
test_inputs = ["Amazing product!", "", "a", "Works great!", "Very poor quality...", "It is fine I guess"]
for text in test_inputs:
    result = system.predict(text)
    print(f"  Input: {repr(text):<40} -> {result}")

print("\nHealth Report:", system.health_report())
💼 Real-World Scenario
Your company needs a production NLP API serving 50,000 requests/day for customer intent classification. It must handle malformed inputs, log predictions, and detect when the model is underperforming.
Real-World Code
import re
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import json

# Intent classification dataset
intents = [
    ("I want to buy a new laptop", "purchase"),
    ("Can I get a refund for my order?", "refund"),
    ("How do I track my package?", "tracking"),
    ("I want to cancel my subscription", "cancel"),
    ("What is your return policy?", "policy"),
    ("Add item to my shopping cart", "purchase"),
    ("My order has not arrived yet", "tracking"),
    ("I would like my money back", "refund"),
    ("Stop my monthly plan", "cancel"),
    ("What are your shipping rules?", "policy"),
    ("Buy now and save 20%", "purchase"),
    ("Request a full refund please", "refund"),
    ("Where is my delivery?", "tracking"),
    ("I want to end my account", "cancel"),
    ("Tell me about your privacy policy", "policy"),
]

texts, labels = zip(*intents)

# Production pipeline: TF-IDF features into a linear classifier.
clf = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), sublinear_tf=True, max_features=5000)),
    ("lr", LogisticRegression(C=1.0, max_iter=1000, random_state=42)),
])

# Cross-validate (small dataset, so use all data for demo)
clf.fit(texts, labels)

# Test on new inputs
test_inputs = [
    "I need to return this product",
    "Where is my shipment?",
    "Cancel my account immediately",
    "I want to purchase this item",
    "What are the terms of service?",
]

# FIX: predict each input once and reuse the result for both the console
# report and the audit log (the original ran the model twice per input and
# computed an unused `classes` variable).
results = []
for text in test_inputs:
    pred = clf.predict([text])[0]
    conf = float(max(clf.predict_proba([text])[0]))
    results.append((text, pred, conf))

print("Intent Classification Results:")
for text, pred, conf in results:
    print(f"  [{pred:<10}] ({conf:.3f}) {text}")

# Prediction audit log: flag low-confidence predictions for human review.
audit = [
    {"text": text[:30], "intent": pred, "confidence": round(conf, 3), "review": conf < 0.7}
    for text, pred, conf in results
]

print("\nAudit Log:")
print(json.dumps(audit, indent=2))
🏋️ Practice: Build a text classification pipeline
Create an sklearn Pipeline with TfidfVectorizer + LogisticRegression, train it on intent examples, and classify new inputs.
Starter Code
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Tiny labeled set of (utterance, intent) pairs.
train = [("buy now", "purchase"), ("cancel plan", "cancel"), ("track order", "tracking")]
texts, labels = zip(*train)

# TF-IDF features feeding a logistic-regression intent classifier.
steps = [("tfidf", TfidfVectorizer()), ("lr", LogisticRegression(max_iter=100))]
clf = Pipeline(steps)
clf.fit(texts, labels)

print(clf.predict(["I want to stop my subscription"]))