🧠 NLP & Text Processing
24 topics • Click any card to expand
Raw text is noisy. Learn to normalize case, remove punctuation, strip HTML, handle Unicode, and build reusable cleaning pipelines.
import re, string
text = ' Hello, World! This is NLP 101... Check <b>this</b> out! '
# Lowercase
clean = text.lower()
# Strip HTML tags
clean = re.sub(r'<[^>]+>', '', clean)
# Remove punctuation
clean = clean.translate(str.maketrans('', '', string.punctuation))
# Collapse whitespace
clean = ' '.join(clean.split())
print(repr(clean))
# Output: 'hello world this is nlp 101 check this out'import re
def clean_text(text):
    """Normalize a social-media post: strip URLs, @mentions, hashtags,
    punctuation and digits, then collapse whitespace and lowercase."""
    patterns = [
        r'http\S+|www\.\S+',  # URLs
        r'@\w+',              # @mentions
        r'#\w+',              # hashtags (removes the tag word too)
        r'[^\w\s]',           # punctuation
        r'\d+',               # digits
    ]
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    # collapse runs of whitespace and trim the ends
    return ' '.join(text.split()).lower()
tweet = 'Check out https://example.com #NLP @user! 123 Great stuff!!!'
print(clean_text(tweet))
# Output: 'check out great stuff'import unicodedata
text = 'Café naïve résumé — quotes \u2018smart\u2019'
# Normalize to ASCII (strip accents)
def to_ascii(text):
    """Strip accents: NFKD-decompose, then drop the combining marks."""
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return ''.join(base_chars)
# Normalize smart quotes to straight
def fix_quotes(text):
    """Replace smart quotes and en/em dashes with ASCII equivalents.

    Bug fix: the original dict used bare ``\\u2018``-style escapes as keys,
    which is a SyntaxError — dict keys must be string literals.
    """
    replacements = {
        '\u2018': "'",  # left single quote
        '\u2019': "'",  # right single quote
        '\u201c': '"',  # left double quote
        '\u201d': '"',  # right double quote
        '\u2013': '-',  # en dash
        '\u2014': '-',  # em dash
    }
    return ''.join(replacements.get(c, c) for c in text)
print(to_ascii(text))
print(fix_quotes(text))import re, string
from typing import List, Callable
def make_pipeline(*fns: Callable) -> Callable:
    """Compose text-cleaning functions left-to-right into one callable.

    The returned function threads its input through each of *fns* in order.
    """
    def run(value: str) -> str:
        result = value
        for step in fns:
            result = step(result)
        return result
    return run
lowercase = str.lower
remove_urls = lambda t: re.sub(r'http\S+', '', t)
remove_punct = lambda t: t.translate(str.maketrans('', '', string.punctuation))
collapse_spaces = lambda t: ' '.join(t.split())
clean = make_pipeline(lowercase, remove_urls, remove_punct, collapse_spaces)
texts = ['Visit https://ai.com for more!', 'Hello, World!!', ' PYTHON NLP ']
print([clean(t) for t in texts])import re, string
def preprocess_ticket(text: str) -> str:
    """Flatten a raw support ticket into lowercase, ASCII-only words.

    Order matters: tags and URLs go first, then non-ASCII is dropped,
    then punctuation becomes spaces before whitespace is normalized.
    """
    without_html = re.sub(r'<[^>]+>', ' ', text)
    without_urls = re.sub(r'http\S+', '', without_html)
    # encode/decode round-trip silently discards emoji and other non-ASCII
    ascii_only = without_urls.encode('ascii', 'ignore').decode()
    lowered = ascii_only.lower()
    words_only = re.sub(r'[^\w\s]', ' ', lowered)
    return ' '.join(words_only.split())
tickets = [
'<p>My <b>order</b> #12345 is LATE! See https://track.com/12345</p>',
'App crashed 😤 after update v2.1 — please fix ASAP!!!',
]
for t in tickets:
print(preprocess_ticket(t))import re
email = '''
From: alice@example.com
To: bob@example.com
Subject: Project Update
Hi Bob,
> Thanks for the report
> it was helpful
Looks great! Let's sync tomorrow.
'''
def clean_email(text: str) -> str:
    """Clean an email body for NLP: drop header lines (From:/To:/Subject:),
    drop quoted reply lines (starting with '>'), and collapse blank lines.

    Implements the TODOs the stub left as ``pass`` (which returned None).
    """
    kept = []
    for line in text.splitlines():
        # header lines such as 'From: alice@example.com'
        if re.match(r'^(From|To|Subject)\s*:', line):
            continue
        # quoted reply lines, possibly indented
        if line.lstrip().startswith('>'):
            continue
        kept.append(line)
    body = '\n'.join(kept)
    # collapse runs of blank (or whitespace-only) lines into one newline
    body = re.sub(r'\n\s*\n+', '\n', body)
    return body.strip()
print(clean_email(email))Tokenization splits text into meaningful units. Stopword removal filters common words that carry little semantic weight.
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import word_tokenize, sent_tokenize
text = 'Dr. Smith said NLP is fun. It really is! Don\'t you think?'
sentences = sent_tokenize(text)
print('Sentences:', sentences)
words = word_tokenize(text)
print('Words:', words)import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
text = 'The quick brown fox jumps over the lazy dog'
tokens = word_tokenize(text.lower())
filtered = [w for w in tokens if w.isalpha() and w not in stop_words]
print('Original tokens:', tokens)
print('Filtered:', filtered)try:
import spacy
nlp = spacy.load('en_core_web_sm')
text = 'Apple is looking at buying U.K. startup for $1 billion.'
doc = nlp(text)
tokens = [(t.text, t.pos_, t.is_stop) for t in doc]
print('(token, POS, is_stop):')
for tok in tokens:
print(tok)
except OSError:
print('Run: python -m spacy download en_core_web_sm')try:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
text = 'Tokenization handles out-of-vocabulary words cleverly.'
tokens = tokenizer.tokenize(text)
ids = tokenizer.encode(text)
print('Subword tokens:', tokens)
print('Token IDs:', ids)
print('Decoded:', tokenizer.decode(ids))
except ImportError:
print('pip install transformers')import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
STOP = set(stopwords.words('english'))
def extract_keywords(review: str, top_n: int = 5):
    """Return the top_n most frequent content words in a review.

    Keeps alphabetic tokens longer than 2 chars that are not stopwords.
    """
    counts = Counter(
        token
        for token in word_tokenize(review.lower())
        if token.isalpha() and len(token) > 2 and token not in STOP
    )
    return counts.most_common(top_n)
review = 'The battery life is amazing. This phone has the best battery I have ever used. Great camera too.'
print(extract_keywords(review))import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
DOMAIN_STOPS = {'customer', 'product', 'order', 'item', 'purchase'}
def filter_tokens(text: str) -> list:
    """Tokenize and lowercase, then drop NLTK English stopwords plus the
    domain-specific stopwords in DOMAIN_STOPS.

    Implements the TODOs the stub left as ``pass`` (which returned None).
    """
    stops = set(stopwords.words('english')) | DOMAIN_STOPS
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stops
    ]
reviews = [
'Customer service was excellent, product quality amazing',
'Order arrived late but item was in perfect condition',
]
for r in reviews:
print(filter_tokens(r))Reduce words to their base forms. Stemming is fast but crude (running→run). Lemmatization is linguistically accurate (better→good).
import nltk
from nltk.stem import PorterStemmer, SnowballStemmer
porter = PorterStemmer()
snowball = SnowballStemmer('english')
words = ['running', 'flies', 'happily', 'studies', 'beautiful', 'caring']
print(f'{'Word':<15} {'Porter':<15} {'Snowball':<15}')
for w in words:
print(f'{w:<15} {porter.stem(w):<15} {snowball.stem(w):<15}')import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()
# POS matters: 'better' as ADJ -> 'good', as VERB -> 'better'
examples = [
('better', 'a'), # adjective
('running', 'v'), # verb
('geese', 'n'), # noun
('happily', 'r'), # adverb
]
for word, pos in examples:
lem = lemmatizer.lemmatize(word, pos=pos)
print(f'{word} ({pos}) -> {lem}')try:
import spacy
nlp = spacy.load('en_core_web_sm')
text = 'The children were running and the geese were flying'
doc = nlp(text)
# Double-quoted f-string: nesting single quotes inside an f'...' literal
# is a SyntaxError before Python 3.12.
print(f"{'Token':<15} {'Lemma':<15} {'POS':<10}")
for token in doc:
    print(f'{token.text:<15} {token.lemma_:<15} {token.pos_:<10}')
except OSError:
print('Run: python -m spacy download en_core_web_sm')import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
from nltk.stem import PorterStemmer, WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
words = ['studies', 'studying', 'better', 'wolves', 'corpora', 'matrices']
# Double-quoted f-string: nesting single quotes inside an f'...' literal
# is a SyntaxError before Python 3.12.
print(f"{'Word':<12} {'Stem':<12} {'Lemma (n)':<12}")
for w in words:
    stem = stemmer.stem(w)
    lemma = lemmatizer.lemmatize(w, pos='n')
print(f'{w:<12} {stem:<12} {lemma:<12}')import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
lem = WordNetLemmatizer()
STOP = set(stopwords.words('english'))
def normalize_skills(text: str) -> set:
    """Extract a set of verb-lemmatized skill keywords from free text.

    Lemmatizing as verbs maps 'managed'/'manages'/'managing' -> 'manage',
    so job requirements and resumes can be compared directly.
    """
    normalized = set()
    for token in word_tokenize(text.lower()):
        if token.isalpha() and token not in STOP:
            normalized.add(lem.lemmatize(token, 'v'))
    return normalized
job_req = 'Managed budgets, leading teams, developed strategies'
resume = 'manages budgets, leads cross-functional teams, develop product strategy'
job_kw = normalize_skills(job_req)
res_kw = normalize_skills(resume)
overlap = job_kw & res_kw
print('Match score:', len(overlap) / len(job_kw))
print('Matched keywords:', overlap)import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lem = WordNetLemmatizer()
def normalize_query(query: str) -> list:
    """Normalize a search query: tokenize, lemmatize each alphabetic token
    as both noun and verb, and return the lemmas deduplicated in order.

    Implements the TODOs the stub left as ``pass`` (which returned None).
    """
    lemmas = []
    for token in word_tokenize(query.lower()):
        if not token.isalpha():
            continue
        # try both POS readings; e.g. 'running' -> 'running' (n), 'run' (v)
        for lemma in (lem.lemmatize(token, 'n'), lem.lemmatize(token, 'v')):
            if lemma not in lemmas:
                lemmas.append(lemma)
    return lemmas
queries = ['running shoes', 'buying products', 'managed accounts']
for q in queries:
print(q, '->', normalize_query(q))NER identifies and classifies named entities (persons, organizations, locations, dates) in text — essential for information extraction.
try:
import spacy
nlp = spacy.load('en_core_web_sm')
text = 'Apple Inc. was founded by Steve Jobs in Cupertino on April 1, 1976.'
doc = nlp(text)
print('Entities found:')
for ent in doc.ents:
print(f' {ent.text:<25} {ent.label_:<12} {spacy.explain(ent.label_)}')
except OSError:
print('Run: python -m spacy download en_core_web_sm')import nltk
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tree import Tree
text = 'Barack Obama served as the 44th President of the United States.'
tokens = word_tokenize(text)
tagged = pos_tag(tokens)
chunks = ne_chunk(tagged)
for subtree in chunks:
if isinstance(subtree, Tree):
entity = ' '.join(word for word, tag in subtree.leaves())
print(f'{entity}: {subtree.label()}')try:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
text = 'Elon Musk founded SpaceX in Hawthorne, California in 2002.'
doc = nlp(text)
# In a Jupyter notebook this renders inline:
# displacy.render(doc, style='ent')
# Save to HTML:
html = displacy.render(doc, style='ent', page=True)
print(html[:200], '...') # Show snippet
print('\nEntities:', [(e.text, e.label_) for e in doc.ents])
except OSError:
print('Run: python -m spacy download en_core_web_sm')try:
import spacy
nlp = spacy.load('en_core_web_sm')
# Add custom entity patterns
ruler = nlp.add_pipe('entity_ruler', before='ner')
patterns = [
{'label': 'TECH_STACK', 'pattern': 'Python'},
{'label': 'TECH_STACK', 'pattern': 'TensorFlow'},
{'label': 'TECH_STACK', 'pattern': [{'LOWER': 'scikit'}, {'LOWER': '-'}, {'LOWER': 'learn'}]},
]
ruler.add_patterns(patterns)
doc = nlp('We use Python and TensorFlow with scikit-learn for ML.')
for ent in doc.ents:
print(f'{ent.text}: {ent.label_}')
except OSError:
print('Run: python -m spacy download en_core_web_sm')try:
import spacy
from collections import defaultdict
nlp = spacy.load('en_core_web_sm')
articles = [
'Tesla CEO Elon Musk announced record deliveries in Q4 2024.',
'Microsoft acquired Activision Blizzard for $68.7 billion.',
'Warren Buffett increased Berkshire Hathaway stake in Apple.',
]
entity_index = defaultdict(list)
for i, art in enumerate(articles):
doc = nlp(art)
for ent in doc.ents:
if ent.label_ in ('ORG', 'PERSON', 'MONEY', 'DATE'):
entity_index[ent.text].append(i)
for entity, article_ids in entity_index.items():
print(f'{entity}: articles {article_ids}')
except OSError:
print('Run: python -m spacy download en_core_web_sm')headlines = [
'Google CEO Sundar Pichai unveils new AI products at Google I/O',
'Apple and Google partner on health data standards',
'Jeff Bezos steps down as Amazon CEO',
'Amazon reports record profits under Andy Jassy',
'Sundar Pichai defends Google search monopoly in court',
]
def top_entities(texts, n=5):
    """Return the n most frequent PERSON/ORG entity strings across texts,
    as (entity, count) pairs.

    Implements the TODOs the stub left as ``pass`` (which returned None).
    """
    from collections import Counter
    try:
        import spacy
        nlp = spacy.load('en_core_web_sm')
    except OSError:
        print('Run: python -m spacy download en_core_web_sm')
        return []
    counts = Counter()
    # nlp.pipe streams documents more efficiently than calling nlp() per text
    for doc in nlp.pipe(texts):
        for ent in doc.ents:
            if ent.label_ in ('PERSON', 'ORG'):
                counts[ent.text] += 1
    return counts.most_common(n)
print(top_entities(headlines))Determine whether text expresses positive, negative, or neutral sentiment. Learn rule-based (VADER), ML-based, and transformer approaches.
import nltk
nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
texts = [
'The food was absolutely amazing and the service was great!',
'Terrible experience. Never going back.',
'The product arrived on time.',
'Not bad, but could be better.',
]
for text in texts:
scores = sia.polarity_scores(text)
label = 'POSITIVE' if scores['compound'] > 0.05 else 'NEGATIVE' if scores['compound'] < -0.05 else 'NEUTRAL'
print(f'{label}: {text[:40]:<40} | compound={scores["compound"]:.3f}')try:
from textblob import TextBlob
reviews = [
'Absolutely love this product! Best purchase ever.',
'Disappointed. Quality is poor and shipping was slow.',
'It is okay. Nothing special.',
]
for review in reviews:
blob = TextBlob(review)
pol = blob.sentiment.polarity # -1 to 1
sub = blob.sentiment.subjectivity # 0 (objective) to 1 (subjective)
print(f'Polarity: {pol:+.2f} Subjectivity: {sub:.2f} | {review[:40]}')
except ImportError:
print('pip install textblob')try:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis',
model='distilbert-base-uncased-finetuned-sst-2-english')
texts = [
'I love this movie so much!',
'This is the worst product I have ever bought.',
'The package arrived in reasonable time.',
]
results = sentiment(texts)
for text, result in zip(texts, results):
print(f'{result["label"]:<10} ({result["score"]:.3f}): {text}')
except ImportError:
print('pip install transformers torch')import nltk
nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
ASPECTS = {
'battery': ['battery', 'charge', 'charging', 'power'],
'camera': ['camera', 'photo', 'picture', 'image'],
'screen': ['screen', 'display', 'resolution'],
}
def aspect_sentiment(review):
    """Score each product aspect mentioned in a review.

    For every aspect in ASPECTS, average the VADER compound score of the
    sentences that mention any of its keywords; aspects never mentioned
    are omitted from the result.
    """
    sentences = nltk.sent_tokenize(review)
    results = {}
    for aspect, keywords in ASPECTS.items():
        compounds = [
            sia.polarity_scores(sentence)['compound']
            for sentence in sentences
            if any(kw in sentence.lower() for kw in keywords)
        ]
        if compounds:
            results[aspect] = sum(compounds) / len(compounds)
    return results
review = 'Battery life is excellent! But the camera quality is disappointing. The screen is stunning.'
print(aspect_sentiment(review))import nltk
nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from dataclasses import dataclass
from typing import List
@dataclass
class Review:
id: int
text: str
product_id: str
sia = SentimentIntensityAnalyzer()
def triage_reviews(reviews: List[Review]):
    """Flag strongly negative reviews (VADER compound < -0.3).

    Returns (review, score) pairs ordered most negative first, so support
    can triage the angriest customers at the top of the list.
    """
    scored = ((r, sia.polarity_scores(r.text)['compound']) for r in reviews)
    flagged = [(r, s) for r, s in scored if s < -0.3]
    flagged.sort(key=lambda pair: pair[1])
    return flagged
reviews = [
Review(1, 'Love it! Works perfectly.', 'P001'),
Review(2, 'Completely broken. Total waste of money.', 'P002'),
Review(3, 'Item never arrived. Terrible service!', 'P003'),
]
flagged = triage_reviews(reviews)
for review, score in flagged:
print(f'Review {review.id} (score={score:.3f}): {review.text}')import nltk
nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
tweets = [
('2024-01-01', 'Great start to the new year! So excited!'),
('2024-01-01', 'Traffic was awful this morning.'),
('2024-01-02', 'Amazing concert last night!'),
('2024-01-02', 'Concert tickets were overpriced but show was okay'),
('2024-01-02', 'Best night ever, loved every minute!'),
]
def daily_sentiment(tweets):
    """Aggregate tweet sentiment per day.

    Groups (date, text) pairs by date, averages VADER compound scores,
    prints a POSITIVE/NEGATIVE/NEUTRAL label per day (using the usual
    +/-0.05 compound thresholds), and returns {date: average}.

    Implements the TODOs the stub left as ``pass`` (which returned None).
    """
    sia = SentimentIntensityAnalyzer()
    by_day = {}
    for date, text in tweets:
        by_day.setdefault(date, []).append(sia.polarity_scores(text)['compound'])
    averages = {}
    for date in sorted(by_day):
        avg = sum(by_day[date]) / len(by_day[date])
        averages[date] = avg
        if avg > 0.05:
            label = 'POSITIVE'
        elif avg < -0.05:
            label = 'NEGATIVE'
        else:
            label = 'NEUTRAL'
        print(f'{date}: {label} (avg compound {avg:+.3f})')
    return averages
daily_sentiment(tweets)Convert text to numerical representations and measure similarity. Covers Bag-of-Words, TF-IDF, cosine similarity, and word embeddings.
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
corpus = [
'the cat sat on the mat',
'the dog lay on the rug',
'cats and dogs are both great pets',
]
vec = TfidfVectorizer(stop_words='english')
X = vec.fit_transform(corpus)
df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
print(df.round(3))from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
docs = [
'Python is great for data science',
'Data science uses Python and R',
'I love cooking Italian food',
'Machine learning with Python and sklearn',
]
vec = TfidfVectorizer(stop_words='english')
X = vec.fit_transform(docs)
sim = cosine_similarity(X)
print('Similarity matrix:')
for i, row in enumerate(sim):
print(f'Doc {i}: {[f"{v:.2f}" for v in row]}')try:
from gensim.models import Word2Vec
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import word_tokenize
sentences = [
'king is a powerful man',
'queen is a powerful woman',
'boy is a young man',
'girl is a young woman',
]
tokenized = [word_tokenize(s) for s in sentences]
model = Word2Vec(tokenized, vector_size=50, window=3, min_count=1, epochs=100)
# Classic word vector arithmetic
result = model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=3)
print('king + woman - man =', result)
print('Similarity(king, queen):', model.wv.similarity('king', 'queen'))
except ImportError:
print('pip install gensim')try:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
model = SentenceTransformer('all-MiniLM-L6-v2')
sentences = [
'A man is playing guitar.',
'Someone is strumming a musical instrument.',
'A cat is sitting on the couch.',
]
embeddings = model.encode(sentences)
sim = cosine_similarity(embeddings)
print('Semantic similarity:')
for i in range(len(sentences)):
for j in range(i+1, len(sentences)):
print(f' [{i}] vs [{j}]: {sim[i,j]:.3f}')
except ImportError:
print('pip install sentence-transformers')from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
clauses = [
'The contractor shall deliver all work by the agreed deadline.',
'All deliverables must be submitted by the agreed-upon deadline.',
'Payment shall be made within 30 days of invoice receipt.',
'The client agrees to pay within thirty days of receiving the invoice.',
'Confidential information must not be disclosed to third parties.',
]
vec = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
X = vec.fit_transform(clauses)
sim = cosine_similarity(X)
THRESHOLD = 0.5
print('Near-duplicate pairs (similarity > 0.5):')
for i in range(len(clauses)):
for j in range(i+1, len(clauses)):
if sim[i, j] > THRESHOLD:
print(f' [{i}] & [{j}]: {sim[i,j]:.3f}')
print(f' {clauses[i][:60]}')
print(f' {clauses[j][:60]}')from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
faqs = [
('How do I reset my password?', 'Go to login page and click Forgot Password.'),
('What payment methods are accepted?', 'We accept Visa, Mastercard and PayPal.'),
('How long does shipping take?', 'Standard shipping takes 5-7 business days.'),
('Can I return an item?', 'Yes, returns are accepted within 30 days.'),
]
def find_answer(question: str) -> str:
    """Answer a user question by TF-IDF retrieval over the FAQ list.

    Vectorizes the stored FAQ questions together with the user question,
    picks the FAQ with the highest cosine similarity, and returns its
    canned answer.

    Implements the TODOs the stub left as ``pass`` (which returned None).
    """
    faq_questions = [q for q, _ in faqs]
    vec = TfidfVectorizer(stop_words='english')
    # fit on FAQ questions + user question so both share one vocabulary
    X = vec.fit_transform(faq_questions + [question])
    sims = cosine_similarity(X[-1], X[:-1])[0]
    best = int(np.argmax(sims))
    return faqs[best][1]
print(find_answer('How can I change my password?'))
print(find_answer('Do you accept credit cards?'))Discover hidden thematic structure in document collections. Learn Latent Dirichlet Allocation (LDA) and Non-negative Matrix Factorization (NMF).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
docs = [
'baseball team pitcher bat home run stadium',
'football touchdown quarterback field goal referee',
'stock market shares dividends portfolio investor',
'bitcoin ethereum blockchain cryptocurrency wallet',
'machine learning neural network deep learning AI',
'python data science pandas numpy statistics',
]
vec = CountVectorizer(stop_words='english')
X = vec.fit_transform(docs)
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(X)
feature_names = vec.get_feature_names_out()
for i, topic in enumerate(lda.components_):
top_words = [feature_names[j] for j in topic.argsort()[-6:][::-1]]
print(f'Topic {i}: {top_words}')from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
docs = [
'health fitness exercise gym workout training',
'diet nutrition calories protein weight loss',
'travel vacation flight hotel beach tourism',
'passport visa travel destination adventure explore',
'cooking recipe chef kitchen ingredients bake',
]
vec = TfidfVectorizer(stop_words='english', max_features=50)
X = vec.fit_transform(docs)
nmf = NMF(n_components=3, random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_
feature_names = vec.get_feature_names_out()
for i, topic in enumerate(H):
top_words = [feature_names[j] for j in topic.argsort()[-5:][::-1]]
print(f'Topic {i}: {top_words}')
print('\nDoc-topic assignments (W):')
for i, row in enumerate(W):
print(f'Doc {i}: topic {row.argmax()}')from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
# Synthetic corpus
docs = (['python data science machine learning'] * 5 +
['football soccer game stadium team'] * 5 +
['stock market finance investment portfolio'] * 5)
vec = CountVectorizer(stop_words='english')
X = vec.fit_transform(docs)
perplexities = []
k_range = range(2, 7)
for k in k_range:
lda = LatentDirichletAllocation(n_components=k, random_state=42, max_iter=20)
lda.fit(X)
perplexities.append(lda.perplexity(X))
print(f'k={k}: perplexity={lda.perplexity(X):.2f}')
best_k = k_range[np.argmin(perplexities)]
print(f'Best k: {best_k}')from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
train_docs = [
'python programming code software developer',
'machine learning model training dataset',
'basketball players championship game team',
'tennis grand slam tournament court player',
]
TOPIC_LABELS = {0: 'Technology', 1: 'Sports'} # manual labels
vec = CountVectorizer(stop_words='english')
X_train = vec.fit_transform(train_docs)
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(X_train)
new_docs = [
'neural network deep learning GPU training',
'football touchdown quarterback Super Bowl',
]
X_new = vec.transform(new_docs)
topic_dist = lda.transform(X_new)
for doc, dist in zip(new_docs, topic_dist):
label = TOPIC_LABELS.get(dist.argmax(), f'Topic {dist.argmax()}')
print(f'{doc[:40]} -> {label} ({dist.max():.2f})')from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
articles = [
'The government announced new climate change policy and carbon tax.',
'Scientists discover breakthrough in quantum computing research.',
'Stock markets rally as tech earnings exceed expectations.',
'Premier League clubs prepare for summer transfer window.',
'AI startup raises $500M for large language model development.',
'Central bank raises interest rates to fight inflation.',
]
TOPIC_NAMES = ['Politics/Environment', 'Technology/Science', 'Finance', 'Sports']
vec = CountVectorizer(stop_words='english', min_df=1)
X = vec.fit_transform(articles)
lda = LatentDirichletAllocation(n_components=4, random_state=42)
lda.fit(X)
topic_dist = lda.transform(X)
for article, dist in zip(articles, topic_dist):
dominant = dist.argmax()
print(f'{TOPIC_NAMES[dominant]}: {article[:55]}')from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
feedback = [
'Delivery was fast and packaging was excellent',
'Shipping took too long and package was damaged',
'Customer support was very helpful and responsive',
'Support team was rude and unhelpful',
'Product quality is amazing, well made and durable',
'The product broke after one week, poor quality',
'Price is reasonable for the quality you get',
'Very expensive for what it is, not worth the money',
]
def model_topics(docs, n_topics=3, top_n=5):
    """Fit LDA on a small corpus and print the top words per topic.

    Pipeline: CountVectorizer (English stopwords removed) -> LDA with
    n_topics components -> print the top_n highest-weight words of each
    topic. ``top_n`` is a new optional parameter (default keeps prior
    behavior intent). Returns the fitted LDA model for further use.

    Implements the TODO the stub left as ``pass`` (which returned None;
    returning the model is backward-compatible).
    """
    vec = CountVectorizer(stop_words='english')
    X = vec.fit_transform(docs)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(X)
    feature_names = vec.get_feature_names_out()
    for i, topic in enumerate(lda.components_):
        # argsort ascending, so take the last top_n indices and reverse
        top_words = [feature_names[j] for j in topic.argsort()[-top_n:][::-1]]
        print(f'Topic {i}: {top_words}')
    return lda
model_topics(feedback)Train models to classify text into categories. Covers Naive Bayes, Logistic Regression, and transformer-based fine-tuning pipelines.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Minimal spam dataset
texts = [
'Win a FREE iPhone now! Click here!!!', 'URGENT: You have won $1000',
'Claim your prize today, limited offer!', 'Hot singles in your area',
'Meeting at 3pm in the conference room', 'Can you review my pull request?',
'Lunch tomorrow? Let me know.', 'Project deadline is Friday.',
'Budget report attached for your review', 'Hi, are you free this afternoon?',
]
labels = [1, 1, 1, 1, 0, 0, 0, 0, 0, 0] # 1=spam, 0=ham
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)
clf = Pipeline([('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())])
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test), target_names=['ham', 'spam']))from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
texts = [
'The stock market crashed today', 'Interest rates affect mortgages',
'Champions League final tonight', 'NBA playoffs heating up',
'New deep learning model released', 'Python 4.0 features announced',
'Election results pending', 'Senate votes on new bill',
]
labels = ['finance','finance','sports','sports','tech','tech','politics','politics']
pipe = Pipeline([
('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
('lr', LogisticRegression(max_iter=200))
])
scores = cross_val_score(pipe, texts, labels, cv=2, scoring='accuracy')
print(f'CV accuracy: {scores.mean():.2f} ± {scores.std():.2f}')
pipe.fit(texts, labels)
print(pipe.predict(['Bitcoin surges to all-time high']))try:
from transformers import pipeline
classifier = pipeline('zero-shot-classification',
model='facebook/bart-large-mnli')
texts = [
'The Federal Reserve raises interest rates by 25 basis points.',
'Team wins championship after dramatic overtime goal.',
'New AI model achieves human-level performance on benchmark.',
]
candidate_labels = ['finance', 'sports', 'technology', 'politics']
for text in texts:
result = classifier(text, candidate_labels)
print(f'{result["labels"][0]:12} ({result["scores"][0]:.2f}): {text[:50]}')
except ImportError:
print('pip install transformers torch')from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import numpy as np
texts = [
'stock market bull bear portfolio dividends',
'goal touchdown home run stadium championship',
'algorithm neural network training dataset model',
'vote senator election campaign policy',
] * 3
labels = ['finance','sports','tech','politics'] * 3
pipe = Pipeline([('tfidf', TfidfVectorizer()), ('lr', LogisticRegression(max_iter=200))])
pipe.fit(texts, labels)
feature_names = pipe['tfidf'].get_feature_names_out()
classes = pipe['lr'].classes_
for cls, coef in zip(classes, pipe['lr'].coef_):
top = [feature_names[i] for i in coef.argsort()[-5:][::-1]]
print(f'{cls}: {top}')from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
training_data = [
('Company reports record quarterly earnings', 'finance'),
('Inflation rises as central bank meets', 'finance'),
('National team advances to World Cup final', 'sports'),
('Olympic gold medal for marathon runner', 'sports'),
('New open-source large language model released', 'tech'),
('Semiconductor company launches AI chip', 'tech'),
('Prime minister announces cabinet reshuffle', 'politics'),
('Senate approves infrastructure spending bill', 'politics'),
]
texts, labels = zip(*training_data)
router = Pipeline([
('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
('lr', LogisticRegression(max_iter=500))
])
router.fit(texts, labels)
press_releases = [
'Startup raises $200M Series C for AI research',
'Tennis star wins fourth Grand Slam title',
]
for pr in press_releases:
desk = router.predict([pr])[0]
proba = router.predict_proba([pr]).max()
print(f'{desk.upper()} ({proba:.0%}): {pr}')from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
reviews = [
('Absolutely fantastic product, highly recommend!', 'positive'),
('Life changing purchase, best I have ever made', 'positive'),
('Pretty good but not perfect', 'neutral'),
('Does the job, nothing special', 'neutral'),
('Broken on arrival, terrible quality', 'negative'),
('Complete waste of money, do not buy', 'negative'),
('Great value for money, very happy', 'positive'),
('Disappointed, expected much better', 'negative'),
]
texts, labels = zip(*reviews)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)
# TODO: build Pipeline with TfidfVectorizer + LogisticRegression
# TODO: fit, predict, print classification_reportUnderstand transformer architecture, use pre-trained BERT/GPT models for embeddings, question answering, and text generation.
# NOTE(review): extraction artifact — indentation was stripped and some snippet
# boundaries are fused onto one line (e.g. "print(...)try:"); code kept byte-identical.
# --- Snippet: BERT [CLS]-token sentence embeddings + cosine similarity ---
try:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Embed one text as the final-layer hidden state of the [CLS] position.
def get_embedding(text):
inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
with torch.no_grad():
outputs = model(**inputs)
return outputs.last_hidden_state[:, 0, :].squeeze() # [CLS] token
s1 = get_embedding('How do I reset my password?')
s2 = get_embedding('Steps to change account password')
s3 = get_embedding('Best pizza recipe')
# Related sentences (s1/s2) should score higher than the unrelated pair (s1/s3).
print('s1 vs s2:', F.cosine_similarity(s1.unsqueeze(0), s2.unsqueeze(0)).item())
print('s1 vs s3:', F.cosine_similarity(s1.unsqueeze(0), s3.unsqueeze(0)).item())
except ImportError:
print('pip install transformers torch')try:
# --- Snippet: extractive question answering with a SQuAD-tuned DistilBERT ---
from transformers import pipeline
qa = pipeline('question-answering', model='distilbert-base-cased-distilled-squad')
context = '''
Python was created by Guido van Rossum and first released in 1991.
It was designed with an emphasis on code readability and simplicity.
Python supports multiple programming paradigms including procedural,
object-oriented, and functional programming.
'''
questions = [
'Who created Python?',
'When was Python first released?',
'What paradigms does Python support?',
]
for q in questions:
# The pipeline returns a dict with at least 'answer' and 'score' keys.
answer = qa({'question': q, 'context': context})
print(f'Q: {q}')
print(f'A: {answer["answer"]} (score: {answer["score"]:.3f})\n')
except ImportError:
print('pip install transformers torch')try:
# --- Snippet: sampled text generation with GPT-2 ---
from transformers import pipeline
generator = pipeline('text-generation', model='gpt2', max_new_tokens=80)
prompts = [
'The future of artificial intelligence is',
'Data science has transformed the way we',
]
for prompt in prompts:
# do_sample + temperature makes output stochastic between runs.
result = generator(prompt, num_return_sequences=1, do_sample=True, temperature=0.7)
print(f'Prompt: {prompt}')
print(f'Generated: {result[0]["generated_text"]}\n')
except ImportError:
print('pip install transformers torch')try:
# --- Snippet: fine-tuning preparation — tokenize a labelled batch; Trainer usage
#     is sketched in comments only (no training happens here) ---
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Example: tokenize a batch
texts = ['I love this product!', 'Terrible experience.']
labels = [1, 0]
encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
# In real fine-tuning, wrap in a Dataset and use Trainer:
# training_args = TrainingArguments(output_dir='./results', num_train_epochs=3)
# trainer = Trainer(model=model, args=training_args, ...)
# trainer.train()
print('Model ready for fine-tuning')
print('Tokenized input_ids shape:', encodings['input_ids'].shape)
except ImportError:
print('pip install transformers torch datasets')try:
# --- Snippet: semantic FAQ bot — embed KB questions once, answer by nearest
#     neighbour in embedding space (cosine similarity). ---
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
model = SentenceTransformer('all-MiniLM-L6-v2')
# Knowledge base documents
kb = [
{'q': 'How do I reset my password?',
'a': 'Go to Settings > Security > Reset Password and enter your email.'},
{'q': 'What is the refund policy?',
'a': 'Refunds are processed within 5-7 business days of approval.'},
{'q': 'How do I cancel my subscription?',
'a': 'Navigate to Billing > Subscription > Cancel Subscription.'},
]
# Embed only the questions; answers are returned verbatim.
kb_embeddings = model.encode([item['q'] for item in kb])
def answer_question(user_q: str):
# Returns (answer_text, similarity) of the closest KB question.
q_emb = model.encode([user_q])
sims = cosine_similarity(q_emb, kb_embeddings)[0]
best = sims.argmax()
return kb[best]['a'], sims[best]
for question in ['Change my account password', 'Get money back for purchase']:
answer, conf = answer_question(question)
print(f'Q: {question}')
print(f'A: {answer} (confidence: {conf:.2f})\n')
except ImportError:
print('pip install sentence-transformers')try:
from transformers import pipeline
summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
article = '''
Artificial intelligence has made remarkable progress over the past decade.
Large language models like GPT-4 and Claude can now generate coherent text,
answer complex questions, write code, and even reason about abstract problems.
These models are trained on vast amounts of internet text using self-supervised
learning, allowing them to develop broad world knowledge. However, challenges
remain around hallucination, bias, and alignment with human values. Researchers
continue to work on making these systems safer and more reliable.
'''
# TODO: use summarizer to generate a short summary (max_length=60)
# TODO: print original word count vs summary word count
except ImportError:
print('pip install transformers torch')Combine NLP components into production-ready pipelines. Learn batching, caching, serving NLP models via API, and evaluation metrics.
# --- Snippet: one-shot text analytics with spaCy (counts, entities, top lemmas) ---
try:
import spacy
from collections import Counter
nlp = spacy.load('en_core_web_sm')
def analyze_text(text: str) -> dict:
# Returns word/sentence counts, NER spans, and top-3 noun/verb lemmas.
doc = nlp(text)
return {
'word_count': len([t for t in doc if not t.is_punct]),
'sentences': len(list(doc.sents)),
'entities': [(e.text, e.label_) for e in doc.ents],
'top_nouns': Counter(t.lemma_ for t in doc if t.pos_ == 'NOUN').most_common(3),
'top_verbs': Counter(t.lemma_ for t in doc if t.pos_ == 'VERB').most_common(3),
}
text = 'Elon Musk launched SpaceX rockets in 2020. Tesla reported record profits in Q4.'
result = analyze_text(text)
for k, v in result.items():
print(f'{k}: {v}')
except OSError:
print('Run: python -m spacy download en_core_web_sm')try:
# --- Snippet: sequential nlp(...) vs batched nlp.pipe(...) timing comparison ---
import spacy
import time
nlp = spacy.load('en_core_web_sm')
texts = [f'Document {i}: Apple and Google are tech giants in Silicon Valley.' for i in range(50)]
# Sequential processing
t0 = time.time()
results_seq = [nlp(t) for t in texts]
t_seq = time.time() - t0
# Batch processing with nlp.pipe
t0 = time.time()
results_batch = list(nlp.pipe(texts, batch_size=16))
t_batch = time.time() - t0
print(f'Sequential: {t_seq:.3f}s')
print(f'Batch pipe: {t_batch:.3f}s')
print(f'Speedup: {t_seq/t_batch:.1f}x')
except OSError:
print('Run: python -m spacy download en_core_web_sm')# Run with: uvicorn app:app --reload
# pip install fastapi uvicorn
# --- Snippet: FastAPI sentiment service held as a source-code string literal;
#     the string content is runtime data and must stay exactly as-is. ---
FASTAPI_APP = '''
from fastapi import FastAPI
from pydantic import BaseModel
import nltk
nltk.download("vader_lexicon", quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
app = FastAPI()
sia = SentimentIntensityAnalyzer()
class TextRequest(BaseModel):
text: str
@app.post("/sentiment")
def analyze_sentiment(req: TextRequest):
scores = sia.polarity_scores(req.text)
label = "positive" if scores["compound"] > 0.05 else "negative" if scores["compound"] < -0.05 else "neutral"
return {"label": label, "scores": scores}
@app.get("/health")
def health():
return {"status": "ok"}
'''
print(FASTAPI_APP)from sklearn.metrics import precision_recall_fscore_support, classification_report
# Span-level NER evaluation: micro precision/recall/F1 over exact
# (text, label) set matches between gold and predicted entities.
true_entities = [
    {('Apple', 'ORG'), ('Tim Cook', 'PERSON'), ('Cupertino', 'GPE')},
    {('Google', 'ORG'), ('Sundar Pichai', 'PERSON')},
]
pred_entities = [
    {('Apple', 'ORG'), ('Tim Cook', 'PERSON')},  # missed Cupertino
    {('Google', 'ORG'), ('Sundar Pichai', 'PERSON'), ('Mountain View', 'GPE')},  # extra FP
]
def ner_metrics(true_list, pred_list):
    """Micro-averaged precision/recall/F1 over exact-span matches per document."""
    tp = fp = fn = 0
    for gold, predicted in zip(true_list, pred_list):
        tp += len(gold & predicted)   # correct spans with correct labels
        fp += len(predicted - gold)   # spurious predictions
        fn += len(gold - predicted)   # missed gold spans
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
    return {'precision': precision, 'recall': recall, 'f1': f1}
print(ner_metrics(true_entities, pred_entities))try:
# --- Snippet: production-style article pipeline — TF-IDF+LogReg topic classifier
#     plus batched spaCy NER; results emitted as JSON. ---
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import json
nlp = spacy.load('en_core_web_sm')
# Train topic classifier
# NOTE(review): one example per class — demo-sized training set only.
train_texts = [
'stock market profits earnings', 'championship game score',
'AI model research launch', 'election vote policy senator',
]
train_labels = ['finance', 'sports', 'tech', 'politics']
topic_clf = Pipeline([('tfidf', TfidfVectorizer()), ('lr', LogisticRegression())])
topic_clf.fit(train_texts, train_labels)
def process_article(text: str) -> dict:
# Combines predicted topic, NER spans and a punctuation-free word count.
doc = nlp(text)
topic = topic_clf.predict([text])[0]
return {
'topic': topic,
'entities': [(e.text, e.label_) for e in doc.ents],
'word_count': len([t for t in doc if not t.is_punct]),
}
articles = [
'Tesla stock surges after record Q4 earnings report.',
'Manchester City wins Premier League with last-minute goal.',
]
results = list(nlp.pipe(articles)) # batch NER
for text, doc in zip(articles, results):
topic = topic_clf.predict([text])[0]
print(json.dumps({'text': text[:40], 'topic': topic,
'entities': [(e.text, e.label_) for e in doc.ents]}, indent=2))
except OSError:
print('Run: python -m spacy download en_core_web_sm')import nltk
# Corpus analytics report; implements the TODOs (word count, TF-IDF keywords,
# average VADER sentiment) and returns a JSON-serializable dict.
nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np, json
documents = [
    'Apple launches revolutionary new iPhone with AI features.',
    'Google DeepMind achieves breakthrough in protein folding.',
    'Microsoft Azure reports 40% growth in cloud services revenue.',
]
def analytics_report(docs: list) -> dict:
    """Summarize a small corpus: size, top TF-IDF keywords, average sentiment.

    Returns a dict of plain Python types so it can go straight to json.dumps.
    """
    sia = SentimentIntensityAnalyzer()
    # Total word count — whitespace tokenization is sufficient for a report.
    total_words = sum(len(d.split()) for d in docs)
    # Top 5 keywords: rank vocabulary terms by summed TF-IDF weight across docs.
    vect = TfidfVectorizer(stop_words='english')
    weights = np.asarray(vect.fit_transform(docs).sum(axis=0)).ravel()
    terms = vect.get_feature_names_out()
    top_keywords = [terms[i] for i in np.argsort(weights)[::-1][:5]]
    # Average compound score mapped to a label with VADER's usual +/-0.05 cutoffs.
    compounds = [sia.polarity_scores(d)['compound'] for d in docs]
    avg = float(np.mean(compounds)) if compounds else 0.0
    label = 'positive' if avg > 0.05 else 'negative' if avg < -0.05 else 'neutral'
    return {
        'documents': len(docs),
        'total_words': total_words,
        'top_keywords': top_keywords,
        'avg_compound_sentiment': round(avg, 4),
        'sentiment_label': label,
    }
print(json.dumps(analytics_report(documents), indent=2))Extract structured facts from unstructured text: named entities, relations, events, and key-value pairs using rule-based and model-based approaches.
# Rule-based relation extraction: one regex captures
# (person, predicate, organization, year) tuples from biography-style text.
import re
text = '''
Elon Musk founded SpaceX in 2002. Jeff Bezos founded Amazon in 1994.
Tim Cook joined Apple in 1998 and became CEO in 2011.
Sam Altman was appointed CEO of OpenAI in 2019.
'''
# Pattern: Person + founded/joined/became + Org + in + Year
founded_pat = re.compile(
    r'([A-Z][a-z]+ [A-Z][a-z]+) (founded|joined|became \w+) ([A-Z][a-zA-Z]+) in (\d{4})'
)
# Build one dict per regex hit; sentences that don't fit the pattern
# (e.g. "was appointed") are simply skipped.
relations = [
    {'person': m.group(1), 'relation': m.group(2), 'org': m.group(3), 'year': m.group(4)}
    for m in founded_pat.finditer(text)
]
for rel in relations:
    print(f"{rel['person']} --[{rel['relation']}]--> {rel['org']} ({rel['year']})")
print(f"{r['person']} --[{r['relation']}]--> {r['org']} ({r['year']})")try:
# --- Snippet: dependency-based relation extraction — find acquisition verbs
#     and read subject/object from the parse tree. ---
import spacy
nlp = spacy.load('en_core_web_sm')
text = 'Apple acquired Beats Electronics for 3 billion dollars in 2014. Google bought YouTube in 2006.'
doc = nlp(text)
print('Named Entities:')
for ent in doc.ents:
print(f' {ent.text!r:25s} -> {ent.label_}')
print('\nAcquisition relations (nsubj + dobj pattern):')
for token in doc:
# Lemma match covers inflected forms (acquired/bought/purchased).
if token.lemma_ in ('acquire', 'buy', 'purchase'):
subj = [t.text for t in token.lefts if t.dep_ in ('nsubj', 'nsubjpass')]
obj = [t.text for t in token.rights if t.dep_ in ('dobj', 'attr')]
if subj and obj:
print(f' {subj[0]} --[{token.text}]--> {obj[0]}')
except OSError:
print('Run: python -m spacy download en_core_web_sm')import re
# Key-value extraction from a semi-structured job posting: labelled-line
# regexes for the scalar fields plus a whitelist scan for skills.
from dataclasses import dataclass, field
from typing import List
@dataclass
class JobPosting:
    title: str = ''
    company: str = ''
    location: str = ''
    salary: str = ''
    skills: List[str] = field(default_factory=list)
def extract_job_info(text: str) -> JobPosting:
    """Parse Title/Company/Location/Salary lines and known skill names."""
    job = JobPosting()
    title_m = re.search(r'(?:Title|Position|Role):\s*(.+)', text, re.I)
    if title_m:
        job.title = title_m.group(1).strip()
    company_m = re.search(r'Company:\s*(.+)', text, re.I)
    if company_m:
        job.company = company_m.group(1).strip()
    location_m = re.search(r'Location:\s*(.+)', text, re.I)
    if location_m:
        job.location = location_m.group(1).strip()
    salary_m = re.search(r'Salary:\s*(\$[\d,]+ ?- ?\$[\d,]+|\$[\d,]+)', text, re.I)
    if salary_m:
        job.salary = salary_m.group(1)
    # Whitelist scan; set() dedupes repeated mentions (order unspecified).
    mentioned = re.findall(r'\b(Python|SQL|Java|TensorFlow|PyTorch|Docker|Kubernetes|AWS|GCP)\b', text)
    job.skills = list(set(mentioned))
    return job
posting = '''
Title: Senior Data Scientist
Company: TechCorp Inc.
Location: San Francisco, CA
Salary: $150,000 - $200,000
Requirements: Python, SQL, TensorFlow, Docker, AWS experience preferred.
'''
job = extract_job_info(posting)
print(f'Title: {job.title}')
print(f'Company: {job.company}')
print(f'Location: {job.location}')
print(f'Salary: {job.salary}')
print(f'Skills: {sorted(job.skills)}')
print(f'Skills: {sorted(job.skills)}')import re
# Trigger-phrase event detection: each event type owns a list of lexical
# triggers; at most one event per (sentence, type) is emitted.
from collections import defaultdict
# Simple event extraction using trigger words
EVENT_TRIGGERS = {
    'acquisition': ['acquired', 'bought', 'purchased', 'merged with', 'took over'],
    'funding': ['raised', 'secured', 'received funding', 'closed round'],
    'launch': ['launched', 'released', 'unveiled', 'announced', 'introduced'],
    'partnership': ['partnered', 'collaborated', 'teamed up', 'joined forces'],
}
MONEY_RE = re.compile(r'\$[\d.,]+[BMK]?\s*(?:billion|million|thousand)?', re.I)
def extract_events(sentences):
    """Return one event dict per (sentence, event-type) trigger hit."""
    found = []
    for sentence in sentences:
        lowered = sentence.lower()
        for event_type, triggers in EVENT_TRIGGERS.items():
            # next() keeps the original stop-at-first-trigger-per-type behavior.
            hit = next((t for t in triggers if t in lowered), None)
            if hit is not None:
                found.append({
                    'type': event_type,
                    'trigger': hit,
                    'money': MONEY_RE.findall(sentence),
                    'text': sentence[:80],
                })
    return found
news = [
    'Google acquired DeepMind for $500 million in 2014.',
    'OpenAI raised $6.6 billion in its latest funding round.',
    'Apple launched its Vision Pro headset at WWDC 2023.',
    'Meta and Microsoft partnered on enterprise AI solutions.',
]
for ev in extract_events(news):
    print(f"[{ev['type'].upper()}] trigger='{ev['trigger']}' money={ev['money']}")
print(f" {ev['text']}")import re
# Contract metadata extraction: parties, dates, dollar amounts, and a rough
# count of obligation keywords — all regex-driven.
from typing import Optional
def extract_contract_metadata(text: str) -> dict:
    """Pull parties, up-to-3 dates, amounts, and obligation-keyword count."""
    meta = {}
    # Party extraction: 'between X and Y'
    parties = re.search(r'between\s+([^,]+?)\s+and\s+([^,\.]+)', text, re.I)
    if parties:
        meta['party_1'] = parties.group(1).strip()
        meta['party_2'] = parties.group(2).strip()
    # Date extraction (numeric or month-name forms), capped at three.
    meta['dates'] = re.findall(
        r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4})\b',
        text, re.I)[:3]
    # Amount extraction
    meta['amounts'] = re.findall(r'\$[\d,]+(?:\.\d{2})?(?:\s*(?:million|billion|thousand))?', text, re.I)
    # Obligation keywords
    meta['obligation_count'] = len(re.findall(r'\b(shall|must|will|agrees to|is required to)\b', text, re.I))
    return meta
contract = '''
This agreement is entered into between Acme Corporation and Beta Ltd.
Effective January 15, 2024. The total value is $500,000.00.
Acme shall deliver the software by March 31, 2024.
Beta must pay within 30 days of invoice.
'''
print(extract_contract_metadata(contract))import re
# Resume field extraction — implements the TODOs with stdlib regexes.
LANGUAGES = ['Python', 'Java', 'C++', 'JavaScript', 'SQL', 'R', 'Go', 'Rust', 'Scala']
def extract_resume(text: str) -> dict:
    """Extract basic fields from a plain-text resume.

    Returns a dict with keys: name, email, phone, years_experience, languages.
    Missing fields come back as '' / None / [].
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    # Name: first non-empty line (common resume convention).
    name = lines[0] if lines else ''
    email_m = re.search(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', text)
    # Phone: optional '+', digits with common separators; class excludes
    # newlines so the match cannot run onto the next line.
    phone_m = re.search(r'\+?\d[\d() .-]{6,}\d', text)
    years_m = re.search(r'(\d+)\+?\s*years?\s+of\s+experience', text, re.I)
    # Languages: lookarounds instead of \b so names like 'C++' also match,
    # and 'Java' does not fire inside 'JavaScript'.
    languages = [lang for lang in LANGUAGES
                 if re.search(r'(?<![\w+])' + re.escape(lang) + r'(?![\w+])', text)]
    return {
        'name': name,
        'email': email_m.group(0) if email_m else '',
        'phone': phone_m.group(0).strip() if phone_m else '',
        'years_experience': int(years_m.group(1)) if years_m else None,
        'languages': languages,
    }
resume = '''
Jane Doe
jane.doe@email.com | +1-555-123-4567
5 years of experience in data engineering.
Skills: Python, SQL, Scala, Apache Spark.
'''
print(extract_resume(resume))Understand encoder-decoder architectures, attention mechanisms, and BLEU scoring. Implement simple character-level and word-level translation concepts.
# BLEU from scratch: clipped n-gram precisions, brevity penalty, and a
# log-domain geometric mean (with a large negative floor standing in for log 0).
from collections import Counter
import math
def ngram_counts(tokens, n):
    """Counter of all order-n n-grams (as tuples) in the token list."""
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
def bleu_score(reference: str, hypothesis: str, max_n: int = 4) -> float:
    """Sentence-level BLEU of hypothesis vs. one reference; also prints a report."""
    ref_tokens = reference.lower().split()
    hyp_tokens = hypothesis.lower().split()
    if not hyp_tokens:
        return 0.0
    # Brevity penalty: < 1 only when the hypothesis is shorter than the reference.
    bp = min(1.0, math.exp(1 - len(ref_tokens) / len(hyp_tokens)))
    scores = []
    for n in range(1, max_n + 1):
        ref_ng = ngram_counts(ref_tokens, n)
        hyp_ng = ngram_counts(hyp_tokens, n)
        matched = sum(min(count, ref_ng[gram]) for gram, count in hyp_ng.items())
        candidates = sum(hyp_ng.values())
        scores.append(matched / candidates if candidates else 0.0)
    # Geometric mean of the precisions; -999 acts as log(0).
    log_avg = sum(math.log(s) if s > 0 else -999 for s in scores) / max_n
    bleu = bp * math.exp(log_avg)
    print(f'Reference: {reference}')
    print(f'Hypothesis: {hypothesis}')
    print(f'N-gram precisions: {[round(s,3) for s in scores]}')
    print(f'BLEU-{max_n}: {bleu:.4f}')
    return bleu
bleu_score(
    'The cat sat on the mat',
    'The cat is on the mat'
)
bleu_score(
'The cat sat on the mat',
'A dog lay on a rug'
)import numpy as np
# Demonstrate encoder-decoder idea without deep learning framework
np.random.seed(42)
# Toy vocabulary
vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, 'hello': 3, 'world': 4,
         'hola': 5, 'mundo': 6, 'bonjour': 7, 'monde': 8}
idx2word = {idx: word for word, idx in vocab.items()}
# Random embeddings (in practice: learned)
EMB_DIM = 8
embeddings = np.random.randn(len(vocab), EMB_DIM) * 0.1
def encode(sentence: str) -> np.ndarray:
    """Mean-pool token embeddings into a single context vector (OOV -> <pad>)."""
    ids = [vocab.get(token, 0) for token in sentence.lower().split()]
    return np.mean([embeddings[i] for i in ids], axis=0)
def cosine_sim(a, b):
    """Cosine similarity; tiny epsilon in the denominator guards zero norms."""
    denom = np.linalg.norm(a) * np.linalg.norm(b) + 1e-8
    return np.dot(a, b) / denom
src = encode('hello world')
print('Encoder output (context vector):', src.round(3))
# In real seq2seq: decoder generates target tokens one by one
# conditioned on context vector + previous generated token
print('\nEncoder-Decoder flow:')
print(' Source: hello world')
print(' Encoder -> context vector (shape:', src.shape, ')')
print(' Decoder: <sos> -> hola -> mundo -> <eos>')
print(' At each step: P(word | context, prev_token)')import numpy as np
# --- Snippet: simulated attention-weight heatmap saved to PNG (Agg backend,
#     so it works headless). Weights are random but made diagonal-dominant. ---
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# Simulate attention weights for 'The cat sat on the mat' -> 'Le chat etait sur le tapis'
np.random.seed(42)
src_words = ['The', 'cat', 'sat', 'on', 'the', 'mat']
tgt_words = ['Le', 'chat', 'etait', 'sur', 'le', 'tapis']
# Simulate attention weights (in practice: softmax(Q @ K.T / sqrt(d_k)))
# Diagonal-dominant = good alignment
raw = np.random.rand(6, 6)
# Make it more diagonal (word alignments)
for i in range(6):
raw[i, i] += 2.0
raw[0, 4] += 1.0 # 'the' aligns with 'le'
# Row-wise softmax so each target word's attention sums to 1.
attention = np.exp(raw) / np.exp(raw).sum(axis=1, keepdims=True)
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(attention, cmap='Blues', vmin=0, vmax=1)
ax.set_xticks(range(6)); ax.set_xticklabels(src_words, rotation=45, ha='right')
ax.set_yticks(range(6)); ax.set_yticklabels(tgt_words)
ax.set_xlabel('Source'); ax.set_ylabel('Target')
ax.set_title('Attention Weights')
plt.colorbar(im, ax=ax); plt.tight_layout()
plt.savefig('attention_weights.png', dpi=80); plt.close()
print('Saved attention_weights.png')
print('Attention row sums:', attention.sum(axis=1).round(3))try:
# --- Snippet: EN->FR translation with a pretrained Helsinki-NLP model ---
from transformers import pipeline
# Zero-shot: use a pretrained translation model
translator = pipeline('translation_en_to_fr', model='Helsinki-NLP/opus-mt-en-fr')
sentences = [
'Machine learning is transforming healthcare.',
'The quick brown fox jumps over the lazy dog.',
'Data science requires statistics, programming, and domain knowledge.',
]
for sent in sentences:
result = translator(sent, max_length=128)[0]['translation_text']
print(f'EN: {sent}')
print(f'FR: {result}\n')
except ImportError:
print('pip install transformers sentencepiece')
print('\nExample output:')
print('EN: Machine learning is transforming healthcare.')
print('FR: L apprentissage automatique transforme les soins de sante.')from collections import Counter
# BLEU-2 quality gate: score candidate translations and flag low scorers
# for human review.
import math
def simple_bleu(ref: str, hyp: str) -> float:
    """BLEU over 1- and 2-grams with brevity penalty (single reference)."""
    ref_t = ref.lower().split()
    hyp_t = hyp.lower().split()
    if not hyp_t:
        return 0.0
    bp = min(1.0, math.exp(1 - len(ref_t) / len(hyp_t)))
    precisions = []
    for order in (1, 2):
        ref_ng = Counter(tuple(ref_t[i:i + order]) for i in range(len(ref_t) - order + 1))
        hyp_ng = Counter(tuple(hyp_t[i:i + order]) for i in range(len(hyp_t) - order + 1))
        clipped = sum(min(cnt, ref_ng[g]) for g, cnt in hyp_ng.items())
        precisions.append(clipped / (sum(hyp_ng.values()) or 1))
    # -9 stands in for log(0) so a zero precision drives the score toward 0.
    log_avg = sum(math.log(p) if p > 0 else -9 for p in precisions) / 2
    return bp * math.exp(log_avg)
# Simulate auto-translations and quality check
pairs = [
    ('Machine learning improves efficiency', 'L apprentissage automatique ameliore l efficacite'),
    ('High quality product at low price', 'Produit cher mauvais qualite'),  # bad
    ('Fast delivery guaranteed', 'Livraison rapide garantie'),
]
THRESHOLD = 0.35
for en, fr in pairs:
    score = simple_bleu(en, fr)
    flag = 'REVIEW' if score < THRESHOLD else 'OK'
print(f'[{flag}] BLEU={score:.3f} | {fr[:50]}')from collections import Counter
# BLEU-n with clipping plus a small corpus evaluation report — implements the TODOs.
import math
def bleu_n(ref: str, hyp: str, n: int) -> float:
    """Clipped n-gram precision of hyp against ref.

    Returns 0.0 when the hypothesis has no n-grams of order n.
    """
    ref_t = ref.lower().split()
    hyp_t = hyp.lower().split()
    ref_ng = Counter(tuple(ref_t[i:i + n]) for i in range(len(ref_t) - n + 1))
    hyp_ng = Counter(tuple(hyp_t[i:i + n]) for i in range(len(hyp_t) - n + 1))
    total = sum(hyp_ng.values())
    if total == 0:
        return 0.0
    # Clip each hypothesis n-gram count by its count in the reference.
    clipped = sum(min(count, ref_ng[gram]) for gram, count in hyp_ng.items())
    return clipped / total
def evaluate_translations(pairs, threshold: float = 0.4):
    # pairs: list of (reference, hypothesis) tuples
    """Score each pair with BLEU-1/BLEU-2, flag low scorers, print corpus average.

    Returns the per-pair result dicts so callers can inspect them.
    """
    results = []
    for ref, hyp in pairs:
        b1 = bleu_n(ref, hyp, 1)
        b2 = bleu_n(ref, hyp, 2)
        flag = 'REVIEW' if (b1 + b2) / 2 < threshold else 'OK'
        results.append({'ref': ref, 'hyp': hyp, 'bleu1': b1, 'bleu2': b2, 'flag': flag})
        print(f'[{flag}] BLEU-1={b1:.3f} BLEU-2={b2:.3f} | {hyp[:50]}')
    if results:
        corpus_avg = sum((r['bleu1'] + r['bleu2']) / 2 for r in results) / len(results)
        print(f'Corpus average BLEU: {corpus_avg:.3f}')
    return results
test_pairs = [
    ('The cat sat on the mat', 'The cat is on the mat'),
    ('Hello world how are you', 'Hi earth what is up'),
    ('Data science is exciting', 'Data science is fascinating and rewarding'),
]
evaluate_translations(test_pairs)Build retrieval systems using TF-IDF, BM25, and dense vector search. Implement a basic Retrieval-Augmented Generation pipeline combining a retriever with a language model.
# --- Snippet: TF-IDF document search — cosine similarity between the query
#     vector and a uni+bigram TF-IDF matrix over the corpus. ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
corpus = [
'Python is a versatile programming language for data science and web development.',
'Machine learning models require large amounts of training data.',
'Neural networks are inspired by the structure of the human brain.',
'Natural language processing enables computers to understand human text.',
'Deep learning achieves state-of-the-art results on image classification tasks.',
'Reinforcement learning trains agents through rewards and penalties.',
'Transfer learning fine-tunes pre-trained models on new tasks.',
'Transformers use self-attention to process sequences in parallel.',
]
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(corpus)
def search(query: str, top_k: int = 3) -> list:
# Returns up to top_k (doc_prefix, score) pairs, best first.
q_vec = vectorizer.transform([query])
sims = cosine_similarity(q_vec, tfidf_matrix).flatten()
top = np.argsort(sims)[::-1][:top_k]
return [(corpus[i][:70], round(sims[i], 4)) for i in top]
for q in ['how do neural networks learn', 'NLP text processing', 'image recognition']:
print(f'Query: {q}')
for doc, score in search(q):
print(f' [{score:.4f}] {doc}')
print()import numpy as np
# Okapi BM25 ranking over a whitespace-tokenized, lowercased in-memory corpus.
from collections import Counter
import math
class BM25:
    """Okapi BM25 scorer; k1 controls TF saturation, b controls length norm."""
    def __init__(self, corpus, k1=1.5, b=0.75):
        # Tokenize once up front; all scoring works on these token lists.
        self.corpus = [doc.lower().split() for doc in corpus]
        self.k1, self.b = k1, b
        self.n = len(self.corpus)
        self.avgdl = np.mean([len(d) for d in self.corpus])
        # Document frequency per term.
        self.df = {}
        for tokens in self.corpus:
            for term in set(tokens):
                self.df[term] = self.df.get(term, 0) + 1
    def score(self, query: str, doc_id: int) -> float:
        """BM25 score of one document for the query (unseen terms contribute 0)."""
        tokens = self.corpus[doc_id]
        counts = Counter(tokens)
        length_norm = 1 - self.b + self.b * len(tokens) / self.avgdl
        total = 0.0
        for term in query.lower().split():
            df = self.df.get(term)
            if df is None:
                continue
            idf = math.log((self.n - df + 0.5) / (df + 0.5) + 1)
            freq = counts.get(term, 0)
            total += idf * freq * (self.k1 + 1) / (freq + self.k1 * length_norm)
        return total
    def retrieve(self, query: str, top_k: int = 3):
        """Top-k (doc_id, score) pairs, best first."""
        ranked = sorted(((i, self.score(query, i)) for i in range(self.n)),
                        key=lambda pair: -pair[1])
        return ranked[:top_k]
docs = ['Python machine learning tutorial', 'Deep neural network architectures', 'Python web scraping guide', 'Transformer models for NLP', 'Data science with Python pandas']
bm25 = BM25(docs)
print('BM25 results for "Python NLP":')
for idx, score in bm25.retrieve('Python NLP'):
print(f' [{score:.3f}] {docs[idx]}')import numpy as np
# --- Snippet: dense retrieval with simulated embeddings — docs 0/3/5 share a
#     direction so they cluster; the query is built near doc 0. ---
from sklearn.metrics.pairwise import cosine_similarity
# Simulate sentence embeddings (in practice: use SentenceTransformer)
np.random.seed(42)
docs = [
'How to train a neural network',
'Python list comprehension tutorial',
'Best practices for REST API design',
'Introduction to gradient descent optimization',
'SQL window functions explained',
'Backpropagation algorithm explained',
]
# Simulate embeddings (normally: model.encode(docs))
# Make 'neural network' and 'gradient descent' semantically similar
EMB_DIM = 16
base_embs = np.random.randn(len(docs), EMB_DIM)
# Make neural network docs cluster together
for i in [0, 3, 5]:
base_embs[i] += np.array([2]*4 + [0]*12) # shared direction
base_embs /= np.linalg.norm(base_embs, axis=1, keepdims=True)
def dense_search(query_emb, doc_embs, top_k=3):
# Cosine ranking of all docs against one query embedding.
sims = cosine_similarity(query_emb.reshape(1,-1), doc_embs).flatten()
top = np.argsort(sims)[::-1][:top_k]
return [(docs[i], sims[i]) for i in top]
# Query embedding (similar to NN docs)
query_emb = base_embs[0] + np.random.randn(EMB_DIM) * 0.1
query_emb /= np.linalg.norm(query_emb)
print('Dense search results for [neural network query]:')
for doc, sim in dense_search(query_emb, base_embs):
print(f' [{sim:.3f}] {doc}')from sklearn.feature_extraction.text import TfidfVectorizer
# --- Snippet: minimal RAG — TF-IDF retriever over a small KB; the "LLM" step
#     is simulated with a string template. ---
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Knowledge base
knowledge_base = [
'Python was created by Guido van Rossum in 1991.',
'NumPy provides N-dimensional array support and math functions.',
'Pandas is built on NumPy and provides DataFrame structures for data analysis.',
'Scikit-learn offers machine learning algorithms for classification, regression, and clustering.',
'Matplotlib is the most popular Python plotting library.',
'TensorFlow and PyTorch are the two leading deep learning frameworks.',
]
vect = TfidfVectorizer(stop_words='english')
kb_matrix = vect.fit_transform(knowledge_base)
def retrieve(query: str, top_k: int = 2) -> list:
# Top-k KB sentences by cosine similarity in TF-IDF space.
q_vec = vect.transform([query])
sims = cosine_similarity(q_vec, kb_matrix).flatten()
top = np.argsort(sims)[::-1][:top_k]
return [knowledge_base[i] for i in top]
def rag_answer(query: str) -> str:
"""Minimal RAG: retrieve context, then format answer."""
context = retrieve(query)
context_str = ' '.join(context)
# In real RAG: pass context + query to LLM (e.g. Claude/GPT)
# Here: template-based answer simulation
answer = f'Based on retrieved context: {context_str[:120]}...'
return answer
queries = ['What is pandas?', 'Who created Python?', 'deep learning frameworks']
for q in queries:
print(f'Q: {q}')
print(f'Retrieved: {retrieve(q)[0][:60]}')
print()from sklearn.feature_extraction.text import TfidfVectorizer
# --- Snippet: company-policy QA — retrieve the two closest policy docs and
#     show them as the answer context. ---
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Company policy documents
documents = {
'vacation': 'Employees are entitled to 20 days of paid vacation per year. Unused days can be carried over to the next year up to 5 days.',
'remote': 'Remote work is allowed up to 3 days per week. Core hours are 10am-3pm in the employee home timezone.',
'expenses': 'Business expenses must be submitted within 30 days with receipts. Meals are reimbursed up to $50 per day.',
'equipment': 'New employees receive a MacBook Pro and $500 equipment budget. Replacements require manager approval.',
'onboarding': 'New employees complete a 2-week onboarding program including security training and team introductions.',
}
doc_texts = list(documents.values())
doc_keys = list(documents.keys())
vect = TfidfVectorizer(stop_words='english')
matrix = vect.fit_transform(doc_texts)
def answer_question(query: str, top_k: int = 2) -> str:
# Joins the top-k matching policy texts into a truncated context string.
sims = cosine_similarity(vect.transform([query]), matrix).flatten()
top = np.argsort(sims)[::-1][:top_k]
context = ' '.join(doc_texts[i] for i in top)
return f'[Context: {context[:200]}...]'
for q in ['How many vacation days do I get?', 'Can I work from home?', 'expense reimbursement policy']:
print(f'Q: {q}')
print(f'A: {answer_question(q)[:120]}')
print()from sklearn.feature_extraction.text import TfidfVectorizer
# FAQ retrieval — implements the TODOs: fit TF-IDF on the FAQ questions and
# answer each user query with the closest FAQ entry.
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
faqs = [
    ('What are your business hours?', 'We are open Monday to Friday, 9am to 6pm EST.'),
    ('How do I reset my password?', 'Click Forgot Password on the login page and follow the email instructions.'),
    ('What payment methods do you accept?', 'We accept Visa, Mastercard, PayPal, and bank transfers.'),
    ('How long does shipping take?', 'Standard shipping takes 5-7 business days. Express ships in 2 days.'),
    ('Can I return a product?', 'Yes, returns are accepted within 30 days with original packaging.'),
]
# Fit on the FAQ questions only; user queries are projected into the same space.
faq_questions = [q for q, _ in faqs]
faq_vect = TfidfVectorizer(stop_words='english')
faq_matrix = faq_vect.fit_transform(faq_questions)
def match_faq(query: str):
    """Return (matched_question, answer, similarity) for the closest FAQ."""
    sims = cosine_similarity(faq_vect.transform([query]), faq_matrix).flatten()
    best = int(np.argmax(sims))
    return faqs[best][0], faqs[best][1], float(sims[best])
user_queries = ['office hours', 'forgot my login', 'how to send back item']
for uq in user_queries:
    matched_q, answer, sim = match_faq(uq)
    print(f'Q: {uq}')
    print(f'Matched: {matched_q} (sim={sim:.3f})')
    print(f'A: {answer}\n')
# --- Snippet: subword tokenization — inspect WordPiece tokens and padded shapes ---
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
texts = [
"The stock market crashed on Monday.",
"Transformers revolutionized NLP in 2017!"
]
encoded = tokenizer(texts, padding=True, truncation=True, max_length=32, return_tensors="pt")
print("Input IDs shape:", encoded["input_ids"].shape)
for i, text in enumerate(texts):
tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][i])
# Drop padding tokens so only real subwords are shown.
tokens = [t for t in tokens if t != "[PAD]"]
print(f"\nText {i+1}: {tokens}")
print("\nVocab size:", tokenizer.vocab_size)from transformers import pipeline
# --- Snippet: batch sentiment analysis with an SST-2-tuned DistilBERT ---
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
reviews = [
"This product exceeded all my expectations! Absolutely fantastic.",
"Terrible quality. Broke after one day. Very disappointed.",
"It is okay, nothing special but gets the job done.",
]
results = classifier(reviews)
for text, result in zip(reviews, results):
label = result["label"]
score = result["score"]
print(f"[{label} {score:.3f}] {text[:50]}...")from transformers import pipeline
# --- Snippet: zero-shot classification — rank candidate labels via NLI ---
classifier = pipeline("zero-shot-classification",
model="facebook/bart-large-mnli")
text = "The Federal Reserve raised interest rates by 25 basis points today."
candidate_labels = ["finance", "sports", "technology", "politics", "health"]
result = classifier(text, candidate_labels)
print("Text:", text[:70])
print("\nClassification scores:")
for label, score in zip(result["labels"], result["scores"]):
bar = "#" * int(score * 30)
print(f" {label:<12} {score:.4f} {bar}")from transformers import pipeline
# --- Snippet: zero-shot support-ticket routing to departments ---
# Zero-shot ticket router
classifier = pipeline("zero-shot-classification",
model="facebook/bart-large-mnli")
tickets = [
"My credit card was charged twice for the same order.",
"The app keeps crashing whenever I try to open settings.",
"I want to return the shoes I bought last week. They don't fit.",
"When will my order arrive? It's been two weeks.",
"I forgot my password and the reset link doesn't work.",
]
departments = ["billing", "technical support", "returns & refunds", "shipping", "account access"]
print("Support Ticket Routing")
print("=" * 60)
for ticket in tickets:
# multi_label=False: department scores form a single softmax distribution.
result = classifier(ticket, departments, multi_label=False)
top_dept = result["labels"][0]
top_score = result["scores"][0]
print(f"Ticket: {ticket[:55]}...")
print(f" -> {top_dept} ({top_score:.3f})")
print()from transformers import pipeline
# News-desk demo — implements the TODOs: zero-shot topic routing plus
# sentiment for each headline, printed as a small table.
# News headlines to classify
headlines = [
    "SpaceX successfully lands reusable rocket for 20th time.",
    "Champions League final set as Real Madrid beats Bayern Munich.",
    "Senate votes to pass new climate legislation bill.",
    "Apple unveils new M4 chip with enhanced neural processing.",
    "GDP growth slows to 1.2% amid rising inflation concerns.",
]
categories = ["politics", "sports", "technology", "science", "entertainment", "business"]
# `pipeline` comes from the file-level transformers import.
topic_clf = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
sentiment_clf = pipeline("sentiment-analysis",
                         model="distilbert-base-uncased-finetuned-sst-2-english")
print(f'{"headline":<48} | {"category":<13} | sentiment')
print('-' * 80)
for headline in headlines:
    topic = topic_clf(headline, categories)
    sentiment = sentiment_clf(headline)[0]
    print(f'{headline[:46]:<48} | {topic["labels"][0]:<13} | '
          f'{sentiment["label"]} (topic={topic["scores"][0]:.2f}, sent={sentiment["score"]:.2f})')
# --- Snippet: out-of-the-box NER on two sample sentences ---
import spacy
nlp = spacy.load("en_core_web_sm")
texts = [
"Apple Inc. CEO Tim Cook announced new products at WWDC in San Francisco.",
"On March 14, 2023, the Fed raised rates by 25bps, affecting $4.5T in bonds.",
]
for text in texts:
doc = nlp(text)
print(f"Text: {text[:65]}...")
print("Entities:")
for ent in doc.ents:
print(f" [{ent.label_:<10}] '{ent.text}'")
print()import spacy
# --- Snippet: token-level Matcher with a REGEX attribute for product codes ---
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Match product codes like "SKU-12345" or "PROD-ABC99"
pattern = [{"TEXT": {"REGEX": r"(SKU|PROD|ITEM)-[A-Z0-9]{3,8}"}}]
matcher.add("PRODUCT_CODE", [pattern])
texts = [
"Customer ordered SKU-48291 and PROD-XR99 but ITEM-ZZ001 was out of stock.",
"Return request for SKU-11100 received from warehouse.",
]
for text in texts:
doc = nlp(text)
matches = matcher(doc)
codes = [doc[start:end].text for _, start, end in matches]
print(f"Text: {text}")
print(f"Product codes found: {codes}\n")import spacy
# --- Snippet: subject-verb-object triples from the ROOT of each sentence ---
nlp = spacy.load("en_core_web_sm")
text = "Elon Musk founded SpaceX in 2002. Jeff Bezos started Amazon in 1994."
doc = nlp(text)
print("Subject-Verb-Object triples:")
for sent in doc.sents:
for token in sent:
if token.dep_ == "ROOT":
subj = [c.text for c in token.children if c.dep_ in ("nsubj","nsubjpass")]
obj = [c.text for c in token.children if c.dep_ in ("dobj","attr","pobj")]
if subj and obj:
print(f" ({subj[0]}) --[{token.text}]--> ({obj[0]})")
print("\nNamed Entities:")
for ent in doc.ents:
print(f" {ent.text:<15} [{ent.label_}]")import spacy
# --- Snippet: contract mining — NER buckets (ORG/DATE/MONEY) plus keyword
#     filtering of sentences that look like obligations. ---
import re
nlp = spacy.load("en_core_web_sm")
contract_text = """
This Service Agreement is entered into on January 15, 2024, between
Acme Corporation, a Delaware company ("Client"), and TechSolutions LLC,
a California limited liability company ("Provider"). Client agrees to pay
Provider $12,500 per month for software development services. The agreement
terminates on December 31, 2024. Acme Corporation is headquartered in
New York, NY. Either party may terminate with 30 days written notice.
"""
doc = nlp(contract_text)
# Extract entities by type
parties = []
dates = []
money = []
for ent in doc.ents:
if ent.label_ == "ORG":
parties.append(ent.text)
elif ent.label_ == "DATE":
dates.append(ent.text)
elif ent.label_ == "MONEY":
money.append(ent.text)
print("Contract Extraction Report")
print(f"Parties: {list(set(parties))}")
print(f"Dates: {dates}")
print(f"Amounts: {money}")
# Extract obligations with regex on sentence level
for sent in doc.sents:
if any(w in sent.text.lower() for w in ["agrees", "shall", "must", "terminates"]):
print(f"Obligation: {sent.text.strip()[:80]}")import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
resume = """
John Smith is a Senior Data Scientist with 8 years of experience at Google and Microsoft.
He specializes in Python, TensorFlow, and SQL. Previously, he was a Machine Learning Engineer
at Amazon, where he led a team of 5 researchers. He holds a PhD from MIT in Computer Science.
"""
matcher = Matcher(nlp.vocab)
# TODO: Pattern for job titles (e.g., "Senior Data Scientist", "Machine Learning Engineer")
# TODO: Pattern for tech skills (capitalized 1-3 word terms)
# TODO: Extract PERSON, ORG, DATE, CARDINAL entities
# TODO: Output structured dict: name, companies, titles, skills, experience_years
from transformers import pipeline
generator = pipeline("text-generation", model="gpt2", max_new_tokens=60)
prompts = [
"The future of artificial intelligence is",
"In 2035, data scientists will",
]
for prompt in prompts:
outputs = generator(prompt, num_return_sequences=2, temperature=0.8, do_sample=True)
print(f"Prompt: {prompt}")
for i, out in enumerate(outputs, 1):
generated = out["generated_text"][len(prompt):]
print(f" [{i}] ...{generated[:80]}")
print()# Demonstrates prompt engineering patterns (no API key needed — shows templates)
import json
def build_extraction_prompt(text, fields):
    """Return an LLM prompt asking for JSON extraction of `fields` from `text`.

    The prompt instructs the model to emit only valid JSON with exactly the
    requested keys, using null for anything it cannot find.
    """
    field_list = ", ".join(f'"{f}"' for f in fields)
    prompt_lines = [
        "Extract the following fields from the text below.",
        f"Return ONLY valid JSON with keys: {field_list}.",
        "If a field is not found, use null.",
        f"Text: {text}",
        "JSON output:",
    ]
    return "\n".join(prompt_lines)
texts = [
"Order #4521 placed by Sarah Johnson on 2024-03-15 for $289.99. Ships to Chicago, IL.",
"Meeting scheduled with Dr. Patel at Boston General Hospital on Tuesday at 2pm."
]
fields_order = ["order_id", "customer_name", "date", "amount", "city"]
fields_meeting = ["person", "organization", "day", "time"]
for text, fields in zip(texts, [fields_order, fields_meeting]):
prompt = build_extraction_prompt(text, fields)
print("=== Prompt Template ===")
print(prompt[:200])
print("...")
print()from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn",
max_length=60, min_length=20, do_sample=False)
article = """
Scientists at MIT have developed a new AI system capable of predicting protein
folding structures with greater accuracy than any previous model. The breakthrough,
published in Nature, combines deep learning with molecular dynamics simulations.
Researchers tested the system on over 10,000 known protein structures and achieved
98.5% accuracy. This development could accelerate drug discovery by enabling
researchers to design proteins that target specific disease pathways. The team plans
to make the model open-source within the next six months.
"""
summary = summarizer(article.strip())[0]["summary_text"]
original_words = len(article.split())
summary_words = len(summary.split())
print(f"Original: {original_words} words")
print(f"Summary ({summary_words} words):")
print(summary)from transformers import pipeline
# Multi-stage NLP pipeline: classify -> summarize -> route
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
posts = [
"This is an amazing product! I love the design and performance.",
"I hate this company. They stole my money and won't respond.",
"How do I reset my password? I can't log in to my account.",
"WARNING: This is a scam. Do NOT buy from this seller!!",
]
severity_labels = ["urgent - requires immediate review", "moderate - review within 24h", "low - can be auto-resolved"]
print("Content Moderation Pipeline")
print("=" * 60)
for post in posts:
sentiment = classifier(post)[0]
severity = zero_shot(post, severity_labels)["labels"][0]
print(f"Post: {post[:55]}...")
print(f" Sentiment: {sentiment['label']} ({sentiment['score']:.2f})")
print(f" Severity: {severity}")
print()from transformers import pipeline
import spacy
nlp_sp = spacy.load("en_core_web_sm")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn",
max_length=80, min_length=30, do_sample=False)
# BUG FIX: each of these lines originally ended with four double quotes
# (`""""`) -- the triple quote closes the literal and the fourth quote opens
# a new, unterminated string, which is a SyntaxError. A triple-quoted string
# must close with exactly three quotes.
article1 = """[Article 1: 100+ words on climate change - fill in]"""
article2 = """[Article 2: 100+ words on climate policy - fill in]"""
article3 = """[Article 3: 100+ words on renewable energy - fill in]"""
articles = [article1, article2, article3]
# TODO: Summarize each article individually
# TODO: Concatenate summaries and create meta-summary
# TODO: Compute compression ratios at each stage
# TODO: Extract noun chunks from meta-summary with spaCy
NER identifies and classifies named entities (persons, organizations, locations, dates) in text using spaCy or Transformers.
import spacy
from collections import Counter
# Load spaCy model (run: python -m spacy download en_core_web_sm)
try:
nlp = spacy.load("en_core_web_sm")
except OSError:
print("Run: python -m spacy download en_core_web_sm")
nlp = None
text = (
"Apple Inc. CEO Tim Cook announced in San Francisco on January 15, 2024 "
"that the company would invest $1 billion in AI research. "
"The partnership with OpenAI and Microsoft was confirmed by Google."
)
if nlp:
doc = nlp(text)
print("Named Entities:")
for ent in doc.ents:
print(f" {ent.text:<30} [{ent.label_}] - {spacy.explain(ent.label_)}")
# Count entity types
type_counts = Counter(ent.label_ for ent in doc.ents)
print("\nEntity type counts:", dict(type_counts))
else:
# Simulate output structure
entities = [
("Apple Inc.", "ORG", "Companies"), ("Tim Cook", "PERSON", "People"),
("San Francisco", "GPE", "Geo-political"), ("January 15, 2024", "DATE", "Dates"),
("$1 billion", "MONEY", "Monetary"), ("OpenAI", "ORG", "Companies"),
("Microsoft", "ORG", "Companies"), ("Google", "ORG", "Companies"),
]
for text_ent, label, explain in entities:
print(f" {text_ent:<30} [{label}] - {explain}")
import spacy
from spacy.language import Language

# Load the small English pipeline; fall back to a blank pipeline (tokenizer
# only) if the model package is not installed.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    nlp = spacy.blank("en")

# Add an EntityRuler so custom patterns take precedence over the statistical
# NER (or stand in for it on a blank pipeline).
# BUG FIX: the original passed before="last" when "ner" was absent, but
# `before=` must name an existing pipeline component -- "last" is not one and
# raises ValueError. Appending (the default position) is the correct fallback.
if "ner" in nlp.pipe_names:
    ruler = nlp.add_pipe("entity_ruler", before="ner")
else:
    ruler = nlp.add_pipe("entity_ruler")

# Custom domain patterns: a plain string matches the exact surface text; the
# token-attribute form (list of dicts) allows case-insensitive matching.
patterns = [
    {"label": "ML_MODEL", "pattern": "BERT"},
    {"label": "ML_MODEL", "pattern": "GPT-4"},
    {"label": "ML_MODEL", "pattern": "ResNet"},
    {"label": "ML_TASK", "pattern": "named entity recognition"},
    {"label": "ML_TASK", "pattern": "sentiment analysis"},
    {"label": "DATASET", "pattern": [{"LOWER": "imagenet"}]},
    {"label": "DATASET", "pattern": "CIFAR-10"},
]
ruler.add_patterns(patterns)

test_texts = [
    "BERT and GPT-4 are popular ML models for named entity recognition.",
    "ResNet was trained on ImageNet and CIFAR-10 datasets.",
    "sentiment analysis using BERT achieves state-of-the-art results.",
]
for text in test_texts:
    doc = nlp(text)
    ents = [(e.text, e.label_) for e in doc.ents]
    print(f"Text: {text[:50]}...")
    print(f" Entities: {ents}\n")
import re
from collections import defaultdict
# Rule-based NER for support tickets (when spaCy not available)
# Regex patterns per entity label. Several patterns contain capturing groups
# (alternations, optional parts); extraction below must therefore use the
# whole match, not the group contents.
PATTERNS = {
    "PRODUCT": [r"\b(Model-[A-Z]\d+|Product-\w+|SKU-\d+)\b"],
    "ERROR_CODE": [r"\bERR-?\d{3,5}\b", r"\bError\s+\d{3,5}\b"],
    "TICKET_ID": [r"\b(TKT|TICKET)-?\d{5,8}\b"],
    "VERSION": [r"\bv\d+\.\d+(\.\d+)?\b"],
}

def extract_entities(text):
    """Extract rule-based entities from `text`.

    Returns a dict mapping each matched label to the list of full matched
    strings, in order of appearance (case-insensitive matching).

    BUG FIX: the original used re.findall, which returns the *group* contents
    when a pattern has capturing groups -- so TICKET_ID yielded "TKT" instead
    of "TKT-123456" and VERSION yielded ".1" instead of "v2.3.1". Using
    finditer with m.group(0) always returns the entire match, and makes the
    tuple-flattening workaround unnecessary.
    """
    entities = defaultdict(list)
    for label, pats in PATTERNS.items():
        for pat in pats:
            for m in re.finditer(pat, text, re.IGNORECASE):
                entities[label].append(m.group(0))
    return dict(entities)
tickets = [
"TKT-123456: Customer reports ERR-4042 on Model-X9 running v2.3.1",
"TICKET-99887: SKU-A2B3C4 throws Error 500 after upgrade to v3.0.0",
"TKT00112233: Product-Premium shows ERR4001 and ERR4002 intermittently",
]
for ticket in tickets:
ents = extract_entities(ticket)
print(f"Ticket: {ticket}")
print(f" Entities: {ents}\n")
import re

# Minimal regex-only "NER": ISO-format dates and Capitalized-word pairs.
DATE_PATTERN = re.compile(r"\d{4}-\d{2}-\d{2}")
NAME_PATTERN = re.compile(r"[A-Z][a-z]+ [A-Z][a-z]+")

text = "John Smith joined Acme Corp on 2024-01-15 and Microsoft on 2024-03-20."
dates = DATE_PATTERN.findall(text)
names = NAME_PATTERN.findall(text)
print("Dates:", dates)
print("Names:", names)
Sentence embeddings convert text to dense vectors capturing semantic meaning, enabling similarity search beyond keyword matching.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Document corpus
corpus = [
"machine learning algorithms for classification",
"deep learning neural networks for image recognition",
"natural language processing text classification",
"computer vision object detection algorithms",
"transformer models for text generation",
"reinforcement learning reward optimization",
]
# Build TF-IDF matrix
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(corpus)
print(f"TF-IDF matrix: {tfidf_matrix.shape}")
# Semantic search function
def search(query, top_k=3):
    """Rank corpus documents by TF-IDF cosine similarity to `query`.

    Returns up to top_k (document, score) pairs, best first, with scores
    rounded to 4 decimal places.
    """
    sims = cosine_similarity(vectorizer.transform([query]), tfidf_matrix)[0]
    ranked = np.argsort(sims)[::-1][:top_k]
    return [(corpus[idx], round(float(sims[idx]), 4)) for idx in ranked]
queries = [
"text classification with deep learning",
"visual recognition algorithms",
]
for q in queries:
print(f"\nQuery: '{q}'")
for doc, score in search(q):
print(f" [{score:.4f}] {doc}")
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Create co-occurrence matrix (simplified Word2Vec concept)
sentences = [
"the cat sat on the mat",
"the dog lay on the rug",
"cats and dogs are pets",
"machine learning models learn patterns",
"deep learning uses neural networks",
"neural networks learn representations",
]
# Build co-occurrence proxy via SVD on count matrix
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(sentences)
# SVD to get dense embeddings (like word2vec)
svd = TruncatedSVD(n_components=8, random_state=42)
embeddings = svd.fit_transform(X)
print("Sentence embeddings shape:", embeddings.shape)
# Find similar sentences
def find_similar(idx, top_k=3):
    """Return the top_k (sentence, similarity) pairs nearest to sentence `idx`."""
    sims = cosine_similarity([embeddings[idx]], embeddings)[0]
    sims[idx] = -1  # force the query sentence out of its own results
    order = np.argsort(sims)[::-1]
    return [(sentences[j], round(float(sims[j]), 4)) for j in order[:top_k]]
for i in [0, 3]:
print(f"\nSimilar to: '{sentences[i]}'")
for sent, sim in find_similar(i):
print(f" [{sim:.4f}] {sent}")
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# FAQ knowledge base
faqs = [
("How do I reset my password?", "Click Forgot Password on login page, enter email, check inbox."),
("How to change account email?", "Go to Settings > Account > Email and enter new address."),
("Why is my payment failing?", "Check card details, billing address, or try a different card."),
("How to cancel my subscription?", "Go to Settings > Billing > Cancel Subscription."),
("How do I download my invoice?", "Settings > Billing > Invoice History > Download PDF."),
("Account locked after failed logins?", "Wait 30 minutes or contact support@example.com."),
]
questions = [q for q, _ in faqs]
answers = [a for _, a in faqs]
vec = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)
faq_matrix = vec.fit_transform(questions)
def answer_query(user_query, threshold=0.1):
    """Answer a free-text question from the FAQ index.

    Picks the FAQ whose question is most TF-IDF-similar to `user_query`;
    below `threshold` similarity a fallback message is returned instead.
    """
    sims = cosine_similarity(vec.transform([user_query]), faq_matrix)[0]
    best = int(np.argmax(sims))
    best_score = sims[best]
    if best_score < threshold:
        # Nothing close enough -- better to defer than answer wrongly.
        return "I couldn't find a relevant answer. Please contact support."
    return f"[score={best_score:.3f}] {answers[best]}"
test_queries = [
"forgot my credentials",
"payment not working",
"stop my plan",
"get billing document",
]
for q in test_queries:
print(f"Q: {q}")
print(f"A: {answer_query(q)}\n")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
docs = ["python programming", "machine learning python", "deep learning neural nets"]
vec = TfidfVectorizer()
X = vec.fit_transform(docs)
query = vec.transform(["python learning"])
scores = cosine_similarity(query, X)[0]
print("Best match:", docs[np.argmax(scores)])
Hugging Face Transformers provides thousands of pretrained models for text classification, generation, Q&A, and more via a unified API.
# pip install transformers torch
# Using Hugging Face pipeline (simulated output shown)
try:
from transformers import pipeline
# Zero-shot classification (no fine-tuning needed)
classifier = pipeline("zero-shot-classification",
model="facebook/bart-large-mnli")
texts = [
"The product quality is excellent and delivery was fast!",
"Terrible service, waited 3 weeks and got wrong item.",
"The item is okay, nothing special but works as expected.",
]
candidate_labels = ["positive", "negative", "neutral"]
for text in texts:
result = classifier(text, candidate_labels)
top_label = result["labels"][0]
top_score = result["scores"][0]
print(f"Text: {text[:50]}...")
print(f" -> {top_label} (score={top_score:.4f})")
except ImportError:
# Simulate output structure
results = [
("positive", 0.9823), ("negative", 0.9541), ("neutral", 0.7234)
]
texts = ["Excellent product!", "Terrible service.", "Works okay."]
for (text, (label, score)) in zip(texts, results):
print(f"Text: {text}")
print(f" -> {label} (score={score:.4f})")
# Token classification and feature extraction patterns
try:
from transformers import pipeline, AutoTokenizer, AutoModel
import torch
# NER pipeline
ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english",
aggregation_strategy="simple")
text = "Apple CEO Tim Cook announced a deal with Microsoft in New York."
entities = ner(text)
for ent in entities:
print(f" {ent['word']:<20} {ent['entity_group']:<10} score={ent['score']:.4f}")
except ImportError:
# Demonstrate the pipeline usage pattern
import numpy as np
print("Simulated NER output (install transformers for real results):")
entities = [
{"word": "Apple", "entity_group": "ORG", "score": 0.9987},
{"word": "Tim Cook", "entity_group": "PER", "score": 0.9945},
{"word": "Microsoft", "entity_group": "ORG", "score": 0.9978},
{"word": "New York", "entity_group": "LOC", "score": 0.9923},
]
for ent in entities:
print(f" {ent['word']:<20} {ent['entity_group']:<10} score={ent['score']:.4f}")
# Simulate sentence embeddings
print("\nSimulating sentence embeddings (mean pooling over tokens):")
batch_size, seq_len, hidden = 2, 128, 768
token_embeddings = np.random.randn(batch_size, seq_len, hidden)
sentence_embeddings = token_embeddings.mean(axis=1)
print(f" Input shape: {token_embeddings.shape}")
print(f" Sentence embedding shape: {sentence_embeddings.shape}")
# Using sklearn to simulate transformer-like text classification
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Simulated reviews dataset
reviews = [
("Great product, fast shipping, very satisfied!", "positive"),
("Amazing quality, exceeded expectations.", "positive"),
("Good value for money, would recommend.", "positive"),
("Works as described, happy with purchase.", "positive"),
("Terrible quality, broke after one day.", "negative"),
("Do not buy this, complete waste of money.", "negative"),
("Very disappointed, nothing like the description.", "negative"),
("Poor build quality, returned immediately.", "negative"),
("Item is okay, nothing special.", "neutral"),
("Average product, does the job.", "neutral"),
("Received the item, it works.", "neutral"),
("Product is fine, shipping was slow.", "neutral"),
]
texts, labels = zip(*reviews)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.33, random_state=42)
# Pipeline (simulates HF pipeline interface)
clf = Pipeline([
("tfidf", TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
("model", LogisticRegression(max_iter=1000, random_state=42)),
])
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))
# Batch prediction (like HF pipeline)
new_reviews = [
"Absolutely love this product!",
"Received damaged, very unhappy.",
"It is what it is, does the job.",
]
for review in new_reviews:
pred = clf.predict([review])[0]
prob = max(clf.predict_proba([review])[0])
print(f" [{pred:>8}] ({prob:.3f}) {review}")
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
texts = ["I love this", "I hate this", "This is great", "This is terrible"]
labels = ["pos", "neg", "pos", "neg"]
clf = Pipeline([("tfidf", TfidfVectorizer()), ("nb", MultinomialNB())])
clf.fit(texts, labels)
print(clf.predict(["This is amazing"]))
Latent Dirichlet Allocation (LDA) discovers hidden topics in a text corpus by modeling documents as mixtures of topics.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
# Sample corpus
documents = [
"machine learning neural networks deep learning artificial intelligence",
"python programming data science numpy pandas matplotlib",
"stock market trading investment portfolio risk management",
"climate change global warming carbon emissions renewable energy",
"machine learning algorithms random forest gradient boosting",
"python web development flask django rest api",
"investment strategy hedge fund returns portfolio optimization",
"solar wind energy renewable green sustainability climate",
"deep learning computer vision image classification convolutional",
"data analysis pandas visualization matplotlib seaborn statistics",
]
# Fit LDA
vectorizer = CountVectorizer(max_df=0.95, min_df=1, stop_words="english")
X = vectorizer.fit_transform(documents)
vocab = vectorizer.get_feature_names_out()
lda = LatentDirichletAllocation(n_components=3, random_state=42, max_iter=20)
lda.fit(X)
# Display top words per topic
print("Discovered Topics:")
for topic_id, topic in enumerate(lda.components_):
top_words = [vocab[i] for i in topic.argsort()[:-8:-1]]
print(f" Topic {topic_id+1}: {', '.join(top_words)}")
# Document-topic distribution
doc_topics = lda.transform(X)
for i, doc in enumerate(documents[:3]):
dominant = doc_topics[i].argmax() + 1
print(f"\nDoc {i+1}: Topic {dominant} dominant ({doc_topics[i].max():.3f})")
print(f" '{doc[:50]}'")
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
# Non-negative Matrix Factorization (often better coherence than LDA)
news_snippets = [
"president signed new economic policy tax reform bill congress",
"federal reserve interest rates inflation monetary policy",
"championship game football team playoffs season victory",
"basketball nba draft player trade contract signed",
"covid vaccine efficacy clinical trial approval fda",
"hospital treatment patient therapy drug clinical",
"election campaign voter poll candidate debate",
"tech company IPO stock shares market valuation",
"championship trophy league season playoffs basketball",
"interest rate hike federal bank economic growth",
]
vectorizer = TfidfVectorizer(max_df=0.95, min_df=1, stop_words="english")
tfidf = vectorizer.fit_transform(news_snippets)
vocab = vectorizer.get_feature_names_out()
# NMF topic modeling
nmf = NMF(n_components=4, random_state=42)
W = nmf.fit_transform(tfidf) # document-topic
H = nmf.components_ # topic-word
print("NMF Topics (typically more coherent):")
topic_names = ["Politics", "Economy", "Sports", "Health"]
for i, (row, name) in enumerate(zip(H, topic_names)):
top_words = [vocab[j] for j in row.argsort()[:-6:-1]]
print(f" Topic {i+1} ({name}): {', '.join(top_words)}")
# Dominant topic per document
for i, doc in enumerate(news_snippets[:4]):
dominant = W[i].argmax()
print(f"\nDoc: '{doc[:45]}...'")
print(f" Dominant topic: {topic_names[dominant]} ({W[i].max():.3f})")
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
articles = [
"SpaceX launched new rocket to international space station moon mission",
"NASA astronauts complete spacewalk satellite deployment orbital",
"Python programming language machine learning framework scikit-learn",
"JavaScript web development React frontend component library",
"Olympic gold medal swimming athletics world record champion",
"NBA basketball playoffs championship team finals victory",
"AI language model chatbot GPT transformer neural network",
"deep learning computer vision object detection classification",
"marathon runner athletics world record broken championship",
"rocket launch satellite orbit space exploration mission",
"JavaScript TypeScript web app development framework React",
"NBA finals championship basketball playoffs season",
]
# Compare LDA vs NMF
count_vec = CountVectorizer(max_df=0.95, min_df=1, stop_words="english")
tfidf_vec = TfidfVectorizer(max_df=0.95, min_df=1, stop_words="english")
X_count = count_vec.fit_transform(articles)
X_tfidf = tfidf_vec.fit_transform(articles)
n_topics = 3
models = {
"LDA": (LatentDirichletAllocation(n_components=n_topics, random_state=42), count_vec),
"NMF": (NMF(n_components=n_topics, random_state=42), tfidf_vec),
}
for model_name, (model, vec) in models.items():
X = vec.transform(articles)
W = model.fit_transform(X)
vocab = vec.get_feature_names_out()
H = model.components_
print(f"\n{model_name} Topics:")
for i, row in enumerate(H):
top_words = [vocab[j] for j in row.argsort()[:-5:-1]]
print(f" Topic {i+1}: {', '.join(top_words)}")
# Assign articles to topics
assignments = W.argmax(axis=1)
for topic in range(n_topics):
docs = [articles[j][:40] for j in range(len(articles)) if assignments[j] == topic]
print(f" Topic {topic+1} docs: {docs}")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
docs = ["cats dogs pets animals", "python code programming", "market stocks trading",
"dog cat pet animal friend", "code software developer", "stock market price"]
vec = CountVectorizer(stop_words="english")
X = vec.fit_transform(docs)
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(X)
vocab = vec.get_feature_names_out()
for i, topic in enumerate(lda.components_):
print(f"Topic {i+1}:", [vocab[j] for j in topic.argsort()[:-4:-1]])
Text summarization condenses long documents into shorter summaries, using extractive (key sentence selection) or abstractive (generation) approaches.
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
def extractive_summarize(text, n_sentences=3):
    """Extractive summary: keep the n_sentences with the highest total
    TF-IDF weight, emitted in original document order.

    Short inputs (<= n_sentences sentences) are returned unchanged.
    """
    sents = re.split(r"(?<=[.!?])\s+", text.strip())
    if len(sents) <= n_sentences:
        return text
    vec = TfidfVectorizer(stop_words="english")
    try:
        matrix = vec.fit_transform(sents)
        weights = np.array(matrix.sum(axis=1)).flatten()
    except ValueError:
        # Empty vocabulary (e.g. all stop words): fall back to leading sentences.
        return " ".join(sents[:n_sentences])
    chosen = sorted(np.argsort(weights)[-n_sentences:])
    return " ".join(sents[i] for i in chosen)
article = (
"Machine learning is a subset of artificial intelligence that gives systems the ability "
"to automatically learn and improve from experience. "
"It focuses on developing computer programs that can access data and use it to learn for themselves. "
"Deep learning is part of machine learning based on artificial neural networks. "
"These networks have multiple layers and can learn representations of data with multiple levels of abstraction. "
"Natural language processing enables computers to understand human language. "
"Applications include chatbots, translation, and sentiment analysis. "
"Computer vision allows machines to interpret and understand visual information. "
"This includes image classification, object detection, and facial recognition. "
"Reinforcement learning trains agents through reward and penalty signals."
)
summary = extractive_summarize(article, n_sentences=3)
original_words = len(article.split())
summary_words = len(summary.split())
print(f"Original: {original_words} words")
print(f"Summary: {summary_words} words ({summary_words/original_words*100:.0f}% compression)")
print(f"\nSummary:\n{summary}")
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
def textrank_summarize(text, n_sentences=3, damping=0.85, iterations=50):
    """TextRank-style extractive summary.

    Builds a sentence-similarity graph from TF-IDF cosine similarity, runs a
    fixed number of PageRank power-iteration steps, and returns the top
    n_sentences in their original document order.
    """
    sents = re.split(r"(?<=[.!?])\s+", text.strip())
    if len(sents) <= n_sentences:
        return " ".join(sents)
    vec = TfidfVectorizer(stop_words="english")
    try:
        sim = cosine_similarity(vec.fit_transform(sents))
    except ValueError:
        # Empty vocabulary -- degrade to the leading sentences.
        return " ".join(sents[:n_sentences])
    np.fill_diagonal(sim, 0)  # a sentence must not vote for itself
    totals = sim.sum(axis=1, keepdims=True)
    totals[totals == 0] = 1  # guard isolated sentences against 0/0
    transition = sim / totals
    count = len(sents)
    rank = np.ones(count) / count
    # Power iteration; a fixed step count converges fine on small graphs.
    for _ in range(iterations):
        rank = (1 - damping) / count + damping * transition.T @ rank
    keep = sorted(np.argsort(rank)[-n_sentences:])
    return " ".join(sents[i] for i in keep)
text = (
"Python is a high-level programming language known for its simplicity. "
"It supports multiple programming paradigms including procedural, object-oriented, and functional. "
"Python is widely used in data science, machine learning, and web development. "
"The language has a rich ecosystem of libraries like NumPy, Pandas, and TensorFlow. "
"Its syntax is designed to be readable and concise, making it beginner-friendly. "
"Python runs on all major platforms and has an active open-source community."
)
for n in [2, 3]:
summary = textrank_summarize(text, n_sentences=n)
ratio = len(summary.split()) / len(text.split()) * 100
print(f"TextRank ({n} sentences, {ratio:.0f}% of original):")
print(f" {summary}\n")
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def hybrid_summarize(text, ratio=0.3):
    """Extractive summary combining salience, position, and diversity.

    Sentences are scored by total TF-IDF weight times a linearly decaying
    position prior (1.0 for the first sentence down to 0.5 for the last),
    then selected greedily best-first, skipping any sentence whose cosine
    similarity to an already-selected one is >= 0.7, until
    max(1, ratio * num_sentences) are chosen. Output preserves document
    order.

    Cleanup: the original computed a `diversity_scores` array that was never
    used (diversity is enforced by the greedy similarity check); it has been
    removed.
    """
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    n = max(1, int(len(sentences) * ratio))
    vec = TfidfVectorizer(stop_words="english")
    try:
        X = vec.fit_transform(sentences)
    except ValueError:
        # Empty vocabulary (all stop words / empty input): best effort.
        return sentences[0] if sentences else ""
    # Salience: TF-IDF mass carried by each sentence.
    tfidf_scores = np.array(X.sum(axis=1)).flatten()
    # Position prior: earlier sentences get a mild boost.
    pos_scores = np.linspace(1.0, 0.5, len(sentences))
    # Pairwise similarity, used only for the diversity filter below.
    sim = cosine_similarity(X)
    final_scores = tfidf_scores * pos_scores
    ranked = np.argsort(final_scores)[::-1]
    selected = []
    for idx in ranked:
        if len(selected) >= n:
            break
        # Greedy diversity filter: reject near-duplicates of chosen sentences.
        if not selected or all(sim[idx][s] < 0.7 for s in selected):
            selected.append(idx)
    return " ".join(sentences[i] for i in sorted(selected))
# Simulate a legal document excerpt
legal_text = (
"This agreement is entered into between Party A and Party B on the date first written above. "
"Party A agrees to provide software development services as described in Schedule A. "
"Party B agrees to pay the fees outlined in Schedule B within 30 days of invoice. "
"All intellectual property developed under this agreement shall belong to Party B. "
"Party A warrants that services will be performed in a professional and workmanlike manner. "
"This agreement shall be governed by the laws of the State of California. "
"Either party may terminate this agreement with 30 days written notice. "
"Confidentiality obligations shall survive termination for a period of 2 years. "
"Any disputes shall be resolved through binding arbitration in San Francisco. "
"This agreement constitutes the entire understanding between the parties."
)
summary = hybrid_summarize(legal_text, ratio=0.4)
print(f"Original: {len(legal_text.split())} words, {len(legal_text.split('. '))} sentences")
print(f"Summary: {len(summary.split())} words")
print(f"\n{summary}")
import re
from collections import Counter
def summarize(text, n=2):
    """Frequency-based extractive summary.

    Scores each sentence as the sum of whole-corpus token frequencies of its
    words, keeps the n best (ties broken by document order, since the sort is
    stable), and rejoins them with ". " in original order.
    """
    parts = [p.strip() for p in re.split(r"[.!?]+", text) if p.strip()]
    freq = Counter(text.lower().split())
    scored = [
        (sum(freq[tok.lower()] for tok in sentence.split()), idx)
        for idx, sentence in enumerate(parts)
    ]
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)[:n]
    keep = sorted(idx for _, idx in ranked)
    return ". ".join(parts[i] for i in keep)
text = "Python is great. It is used in AI. AI is transforming industry. Python is simple."
print(summarize(text, 2))
QA systems find answers to questions within a context passage using span extraction, retrieval-augmented generation (RAG), or knowledge bases.
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
class ExtractiveQA:
    """Toy extractive QA: chunk a text into fixed-size passages (consecutive,
    non-overlapping groups of sentences), then answer questions by TF-IDF
    cosine retrieval over those passages."""

    def __init__(self, passage_size=2):
        self.passage_size = passage_size
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
        self.passages = []
        self.passage_matrix = None

    def index(self, text):
        """Split `text` into passages of `passage_size` sentences and build
        the TF-IDF retrieval matrix."""
        sentences = re.split(r"(?<=[.!?])\s+", text.strip())
        chunks = []
        for start in range(0, len(sentences), self.passage_size):
            joined = " ".join(sentences[start:start + self.passage_size])
            if joined.strip():
                chunks.append(joined)
        self.passages = chunks
        self.passage_matrix = self.vectorizer.fit_transform(chunks)

    def answer(self, question, top_k=1):
        """Return the top_k (passage, score) pairs most similar to `question`."""
        sims = cosine_similarity(
            self.vectorizer.transform([question]), self.passage_matrix
        )[0]
        best = np.argsort(sims)[::-1][:top_k]
        return [(self.passages[i], round(float(sims[i]), 4)) for i in best]
context = (
"Python was created by Guido van Rossum in 1991. "
"The language emphasizes code readability and simplicity. "
"Python 3.0 was released in 2008 with major changes from Python 2. "
"NumPy was created by Travis Oliphant in 2005. "
"Pandas was developed by Wes McKinney in 2008 for data manipulation. "
"Scikit-learn was released in 2007 and provides machine learning tools. "
"TensorFlow was developed by Google Brain team and released in 2015. "
"PyTorch was released by Facebook AI Research in 2016."
)
qa = ExtractiveQA(passage_size=2)
qa.index(context)
questions = [
"When was Python created?",
"Who created pandas?",
"When was TensorFlow released?",
]
for q in questions:
answer, score = qa.answer(q)[0]
print(f"Q: {q}")
print(f"A: {answer} [score={score}]\n")
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
# Simulated knowledge base (RAG knowledge store)
KB = {
"python_history": "Python was created by Guido van Rossum, first released in 1991. Python 3 was released in 2008.",
"ml_libraries": "Scikit-learn (2007), TensorFlow (2015), PyTorch (2016) are major ML libraries.",
"numpy_info": "NumPy provides N-dimensional array objects. It was created by Travis Oliphant in 2005.",
"pandas_info": "Pandas was created by Wes McKinney in 2008. It provides DataFrames for data manipulation.",
"deep_learning": "Deep learning uses neural networks with many layers. CNNs are used for images, RNNs for sequences.",
"transformers": "Transformers use attention mechanisms. BERT (2018) and GPT (2018) are key transformer models.",
}
# Build retrieval index
vectorizer = TfidfVectorizer(stop_words="english")
docs = list(KB.values())
keys = list(KB.keys())
index = vectorizer.fit_transform(docs)
def rag_answer(question, top_k=2):
    """Retrieve the top_k most relevant KB docs and extract the best sentence.

    Returns a dict with keys "answer", "sources" (retrieved KB keys), and
    "context" (first 100 chars of the retrieved text).

    Fix: the original returned a bare string on the no-answer path while the
    success path returned a dict, so callers doing result["answer"] would
    crash; both paths now return the same dict shape.
    """
    q_vec = vectorizer.transform([question])
    scores = cosine_similarity(q_vec, index)[0]
    top_idx = np.argsort(scores)[::-1][:top_k]
    context = " ".join(docs[i] for i in top_idx)
    retrieved_keys = [keys[i] for i in top_idx]
    # Simple extraction: find the retrieved sentence most similar to the
    # question. Drop empty fragments the split can produce.
    sents = [s for s in re.split(r"(?<=[.])\s+", context) if s.strip()]
    if not sents:
        return {"answer": "No answer found.", "sources": retrieved_keys, "context": ""}
    sent_vecs = vectorizer.transform(sents)
    sent_scores = cosine_similarity(q_vec, sent_vecs)[0]
    best_sent = sents[int(sent_scores.argmax())]
    return {"answer": best_sent, "sources": retrieved_keys, "context": context[:100]}
# Run a few sample questions through the retrieval-augmented answerer.
questions = ["Who created NumPy?", "What is deep learning?", "When was BERT released?"]
for question in questions:
    hit = rag_answer(question)
    print(f"Q: {question}")
    print(f" Answer: {hit['answer']}")
    print(f" Sources: {hit['sources']}\n")
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# HR Policy knowledge base
hr_policies = {
    "vacation": "Employees receive 15 vacation days per year. Unused days roll over up to 5 days. Request via HR portal.",
    "sick_leave": "12 sick days per year. Doctor certificate required for absences over 3 consecutive days.",
    "remote_work": "Employees may work remotely up to 3 days per week with manager approval. Core hours: 10am-3pm.",
    "expense_claims": "Submit expenses within 30 days of incurrence. Receipts required for amounts over $25.",
    "performance_review": "Annual reviews in December. Mid-year check-ins in June. Ratings: Exceeds, Meets, Below expectations.",
    "parental_leave": "16 weeks paid parental leave for primary caregivers. 4 weeks for secondary caregivers.",
    "training_budget": "Each employee receives $1,500 annual training budget. Approval from manager required.",
    "overtime": "Overtime must be pre-approved. Compensated at 1.5x rate for hours over 40/week.",
}

keys = list(hr_policies.keys())
docs = list(hr_policies.values())
# Bigram features give phrase-level matching (e.g. "vacation days").
vec = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
index = vec.fit_transform(docs)
def hr_qa(question, threshold=0.05):
    """Return the best-matching HR policy text, or a fallback message.

    A match below `threshold` cosine similarity is treated as "not found"
    so unrelated questions are not answered with a random policy.
    """
    q_vec = vec.transform([question])
    sims = cosine_similarity(q_vec, index)[0]
    best = sims.argmax()
    if sims[best] < threshold:
        return "Policy not found. Please contact HR directly."
    title = keys[best].replace("_", " ").title()
    return f"[{title}] {docs[best]}"
# Exercise the HR Q&A helper on typical employee questions.
questions = [
    "How many vacation days do I get?",
    "Can I work from home?",
    "How do I claim expenses?",
    "How much training budget do I have?",
    "What is the parental leave policy?",
]
for question in questions:
    print(f"Q: {question}")
    print(f"A: {hr_qa(question)}\n")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Minimal retrieval QA: return the passage most similar to the question.
passages = ["Python is easy to learn.", "NumPy is for arrays.", "Pandas is for data."]
vec = TfidfVectorizer()
X = vec.fit_transform(passages)
q = vec.transform(["What is NumPy?"])
scores = cosine_similarity(q, X)[0]
best = int(np.argmax(scores))
print("Answer:", passages[best])
Text generation produces fluent text continuations, completions, or creative content using n-gram models, Markov chains, or pretrained LLMs.
import random
from collections import defaultdict, Counter
import re
class NgramLM:
    """Count-based n-gram language model with proportional sampling.

    Training lowercases and whitespace-tokenizes each text, pads it with
    <s>/</s> markers, and counts next-token frequencies per (n-1)-token
    context. Generation samples the next token with probability
    proportional to its count.
    """

    def __init__(self, n=2):
        self.n = n
        # Maps an (n-1)-token context tuple to a Counter of next tokens.
        self.ngrams = defaultdict(Counter)

    def train(self, texts):
        """Accumulate n-gram counts from an iterable of text strings."""
        for text in texts:
            tokens = text.lower().split()
            tokens = ["<s>"] * (self.n - 1) + tokens + ["</s>"]
            for i in range(len(tokens) - self.n + 1):
                context = tuple(tokens[i:i + self.n - 1])
                next_token = tokens[i + self.n - 1]
                self.ngrams[context][next_token] += 1

    def generate(self, max_len=20, seed=None):
        """Sample up to max_len tokens, stopping at </s> or a dead end.

        Fix: the original used `if seed:`, which silently ignored seed=0
        (as passed by the demo loop); `is not None` makes seed=0
        reproducible too.
        """
        if seed is not None:
            random.seed(seed)
        context = ("<s>",) * (self.n - 1)
        tokens = []
        for _ in range(max_len):
            if context not in self.ngrams:
                break
            candidates = list(self.ngrams[context].items())
            next_tok = random.choices(
                [w for w, _ in candidates],
                weights=[c for _, c in candidates],
            )[0]
            if next_tok == "</s>":
                break
            tokens.append(next_tok)
            context = context[1:] + (next_tok,)
        return " ".join(tokens)
# Train a trigram model on a tiny corpus and sample several outputs.
corpus = [
    "the cat sat on the mat and the dog sat on the rug",
    "machine learning models learn from data and improve over time",
    "deep learning uses neural networks with many layers",
    "natural language processing handles text and speech data",
    "data science involves statistics machine learning and programming",
]
model = NgramLM(n=3)
model.train(corpus)
print("Generated text (trigram LM):")
for idx in range(4):
    print(f" {idx + 1}: {model.generate(max_len=12, seed=idx)}")
import random
from collections import defaultdict
import re
class MarkovChain:
    """Word-level Markov chain text generator of configurable order."""

    def __init__(self, order=2):
        self.order = order
        self.chain = defaultdict(list)  # state tuple -> successor words
        self.starts = []                # observed document-initial states

    def train(self, text):
        """Add one document's word transitions to the chain."""
        words = re.findall(r"\w+[.,!?]?", text.lower())
        # Too short to form even one (state, successor) pair.
        if len(words) < self.order + 1:
            return
        self.starts.append(tuple(words[:self.order]))
        for i in range(len(words) - self.order):
            state = tuple(words[i:i + self.order])
            self.chain[state].append(words[i + self.order])

    def generate(self, n_words=30, seed=42):
        """Generate up to n_words words; seeded for reproducibility."""
        random.seed(seed)
        if not self.starts:
            return ""
        state = random.choice(self.starts)
        out = list(state)
        while len(out) < n_words:
            if state not in self.chain:
                break  # dead end: no observed successor
            out.append(random.choice(self.chain[state]))
            state = tuple(out[-self.order:])
        return " ".join(out).capitalize()
# Train the chain on a few documents and sample with different seeds.
mc = MarkovChain(order=2)
training_data = [
    "Data science combines statistics, machine learning, and domain expertise to extract insights from data.",
    "Machine learning models learn patterns from training data and generalize to new examples.",
    "Deep learning architectures with many layers can learn hierarchical representations.",
    "Natural language processing techniques enable machines to understand and generate human language.",
]
for doc in training_data:
    mc.train(doc)
print("Markov Chain Generated Text:")
for s in [1, 2, 3]:
    print(f" Seed {s}: {mc.generate(n_words=20, seed=s)}")
import random
from collections import defaultdict
import re
# Template-based + Markov hybrid generator for game content
class GameTextGenerator:
    """Generates game text by filling templates with random vocabulary."""

    def __init__(self, order=2):
        # Markov-chain fields (kept for interface compatibility; the
        # template path below does not use them).
        self.order = order
        self.chain = defaultdict(list)
        self.starts = []
        self.templates = {
            "quest": [
                "Retrieve the {item} from {location} and return to {npc}.",
                "Defeat the {enemy} that threatens {location}.",
                "Escort {npc} safely through {location} to {destination}.",
                "Discover the secrets of {location} by finding {item}.",
            ],
            "item": [
                "Ancient {adj} {noun} of {attribute}",
                "{adj} {noun} Forged in {location}",
                "The {npc}'s Sacred {noun}",
            ]
        }
        self.vocab = {
            "item": ["Sword", "Amulet", "Tome", "Crystal", "Shield", "Ring"],
            "location": ["Dark Forest", "Mountain Peak", "Sunken Temple", "Iron Citadel"],
            "npc": ["Elder Mage", "Village Chief", "Wandering Merchant", "Oracle"],
            "enemy": ["Shadow Drake", "Corrupted Knight", "Ancient Golem", "Bandit Lord"],
            "adj": ["Cursed", "Sacred", "Ancient", "Enchanted", "Forgotten"],
            "noun": ["Blade", "Tome", "Relic", "Seal", "Chalice"],
            "attribute": ["Fire", "Ice", "Lightning", "Void", "Light"],
            "destination": ["Capital City", "Hidden Sanctuary", "Mountain Fortress"],
        }

    def generate_from_template(self, template_type, seed=None):
        """Pick a template of the given type and fill each placeholder.

        Slots are filled in vocab insertion order so that a given seed
        always produces the same text.
        """
        if seed is not None:
            random.seed(seed)
        text = random.choice(self.templates[template_type])
        for slot, choices in self.vocab.items():
            token = "{" + slot + "}"
            if token in text:
                text = text.replace(token, random.choice(choices))
        return text
# Produce a handful of quests and items with fixed seeds for repeatability.
random.seed(42)
gen = GameTextGenerator()
print("Generated Quests:")
for n in range(4):
    print(f" Quest {n+1}: {gen.generate_from_template('quest', seed=n)}")
print("\nGenerated Items:")
for n in range(4):
    print(f" Item {n+1}: {gen.generate_from_template('item', seed=n+10)}")
import random
from collections import defaultdict
# Order-1 Markov chain over words, generated from a single start word.
chain = defaultdict(list)
text = "the cat sat on the mat the cat ate the rat the rat ran away"
words = text.split()
for prev, nxt in zip(words, words[1:]):
    chain[prev].append(nxt)

random.seed(42)
word = "the"
result = [word]
for _ in range(10):
    if word not in chain:
        break  # reached a word with no recorded successor
    word = random.choice(chain[word])
    result.append(word)
print(" ".join(result))
A production NLP pipeline integrates preprocessing, vectorization, modeling, and post-processing into a reliable, scalable system.
import re
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer that normalizes raw text strings."""

    def __init__(self, lowercase=True, remove_punct=True, remove_numbers=False):
        self.lowercase = lowercase
        self.remove_punct = remove_punct
        self.remove_numbers = remove_numbers

    def preprocess(self, text):
        """Normalize one document according to the configured flags."""
        if self.lowercase:
            text = text.lower()
        if self.remove_punct:
            # Replace punctuation with spaces so adjacent words don't merge.
            text = re.sub(r"[^\w\s]", " ", text)
        if self.remove_numbers:
            text = re.sub(r"\d+", "", text)
        # Collapse runs of whitespace left behind by the substitutions.
        return re.sub(r"\s+", " ", text).strip()

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        return [self.preprocess(doc) for doc in X]
# Sample dataset
texts = [
    "Great product, fast shipping, very satisfied!",
    "Terrible quality, broke after one week.",
    "Average item, nothing special.",
    "Excellent! Exceeded all expectations!",
    "Disappointed with the purchase.",
    "Does the job, no complaints.",
    "Best purchase I have made this year!",
    "Waste of money, poor customer service.",
]
labels = ["pos", "neg", "neu", "pos", "neg", "neu", "pos", "neg"]

X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.25, random_state=42)

# Preprocess -> vectorize -> classify, wired as a single sklearn Pipeline.
pipeline = Pipeline([
    ("preprocessor", TextPreprocessor(lowercase=True, remove_punct=True)),
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
    ("classifier", LogisticRegression(C=1.0, max_iter=1000, random_state=42)),
])
pipeline.fit(X_train, y_train)
print("Test accuracy:", round(pipeline.score(X_test, y_test), 4))
print(classification_report(y_test, pipeline.predict(X_test), zero_division=0))

# Score a few unseen reviews.
new_texts = ["Amazing value!", "Completely broken on arrival.", "It works."]
for doc, label in zip(new_texts, pipeline.predict(new_texts)):
    print(f" [{label}] {doc}")
import re
import numpy as np
from collections import deque
from datetime import datetime
class NLPProductionSystem:
    """Thin production wrapper around a fitted model and vectorizer.

    Adds input validation, bounded request logging, a running
    prediction-distribution counter, and a simple health report.
    """

    def __init__(self, model, vectorizer):
        self.model = model
        self.vectorizer = vectorizer
        self.request_log = deque(maxlen=1000)  # most recent successful requests
        self.prediction_counts = {}
        self.error_rate = 0.0

    def validate_input(self, text):
        """Check and sanitize raw input; returns the cleaned string.

        Raises ValueError for non-strings and too-short/too-long inputs.
        Injection-looking characters are replaced with spaces rather
        than rejected.
        """
        if not isinstance(text, str):
            raise ValueError("Input must be a string")
        if len(text.strip()) < 3:
            raise ValueError("Input too short (minimum 3 chars)")
        if len(text) > 10000:
            raise ValueError("Input too long (maximum 10000 chars)")
        if re.search(r"[<>{}|\\]", text):
            text = re.sub(r"[<>{}|\\]", " ", text)
        return text.strip()

    def predict(self, text):
        """Validate, classify, and log one request; returns a status dict.

        Never raises: failures come back as {"error": ..., "status": "error"}.
        """
        ts = datetime.now().isoformat()
        try:
            clean_text = self.validate_input(text)
            features = self.vectorizer.transform([clean_text])
            pred = self.model.predict(features)[0]
            prob = self.model.predict_proba(features).max()
            self.request_log.append({
                "ts": ts,
                "text_len": len(clean_text),
                "pred": pred,
                "prob": round(float(prob), 4),
            })
            self.prediction_counts[pred] = self.prediction_counts.get(pred, 0) + 1
            return {"prediction": pred, "confidence": round(float(prob), 4), "status": "ok"}
        except Exception as e:
            # NOTE(review): the log only holds successes, so this is a rough
            # running estimate rather than an exact errors/total ratio.
            self.error_rate = (self.error_rate * len(self.request_log) + 1) / (len(self.request_log) + 1)
            return {"error": str(e), "status": "error"}

    def health_report(self):
        """Summarize recent traffic for monitoring."""
        total = len(self.request_log)
        lengths = [entry["text_len"] for entry in self.request_log]
        return {
            "total_requests": total,
            "prediction_distribution": self.prediction_counts,
            "error_rate": round(self.error_rate, 4),
            "avg_text_length": round(np.mean(lengths) if lengths else 0, 1)
        }
# Setup: train a tiny demo classifier to stand behind the wrapper.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

vec = TfidfVectorizer(ngram_range=(1, 2))
texts = ["great product", "terrible quality", "okay item", "excellent service",
         "bad experience", "good value", "poor quality", "fantastic result"]
labels = ["pos", "neg", "neu", "pos", "neg", "pos", "neg", "pos"]
vec.fit(texts)
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(vec.transform(texts), labels)

system = NLPProductionSystem(model, vec)
# Mix of valid inputs and ones that should trip validation.
test_inputs = ["Amazing product!", "", "a", "Works great!", "Very poor quality...", "It is fine I guess"]
for raw in test_inputs:
    outcome = system.predict(raw)
    print(f" Input: {repr(raw):<40} -> {outcome}")
print("\nHealth Report:", system.health_report())
import re
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import json
# Intent classification dataset
intents = [
    ("I want to buy a new laptop", "purchase"),
    ("Can I get a refund for my order?", "refund"),
    ("How do I track my package?", "tracking"),
    ("I want to cancel my subscription", "cancel"),
    ("What is your return policy?", "policy"),
    ("Add item to my shopping cart", "purchase"),
    ("My order has not arrived yet", "tracking"),
    ("I would like my money back", "refund"),
    ("Stop my monthly plan", "cancel"),
    ("What are your shipping rules?", "policy"),
    ("Buy now and save 20%", "purchase"),
    ("Request a full refund please", "refund"),
    ("Where is my delivery?", "tracking"),
    ("I want to end my account", "cancel"),
    ("Tell me about your privacy policy", "policy"),
]
texts, labels = zip(*intents)

# Production pipeline: bigram TF-IDF features + logistic regression.
clf = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), sublinear_tf=True, max_features=5000)),
    ("lr", LogisticRegression(C=1.0, max_iter=1000, random_state=42)),
])
# Cross-validate (small dataset, so use all data for demo)
clf.fit(texts, labels)

# Test on new inputs
test_inputs = [
    "I need to return this product",
    "Where is my shipment?",
    "Cancel my account immediately",
    "I want to purchase this item",
    "What are the terms of service?",
]

# Classify each input once and reuse the result for both the console
# report and the audit log. (The original predicted every text twice and
# assigned an unused `classes` variable.)
audit = []
print("Intent Classification Results:")
for text in test_inputs:
    pred = clf.predict([text])[0]
    conf = float(max(clf.predict_proba([text])[0]))
    print(f" [{pred:<10}] ({conf:.3f}) {text}")
    audit.append({
        "text": text[:30],
        "intent": pred,
        "confidence": round(conf, 3),
        # Low-confidence predictions are flagged for human review.
        "review": conf < 0.7,
    })
print("\nAudit Log:")
print(json.dumps(audit, indent=2))
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# Minimal intent classifier: TF-IDF features feeding logistic regression.
train = [("buy now", "purchase"), ("cancel plan", "cancel"), ("track order", "tracking")]
texts, labels = zip(*train)
clf = Pipeline([("tfidf", TfidfVectorizer()), ("lr", LogisticRegression(max_iter=100))])
clf.fit(texts, labels)
print(clf.predict(["I want to stop my subscription"]))