Natural Language Processing with Transformers: Practical Guide
Natural Language Processing enables computers to understand and generate human language. Modern NLP is powered by transformer models like BERT, GPT, and specialized architectures for specific tasks.
Text Preprocessing Fundamentals
Tokenization
# Tokenization demo: sentence/word splitting, stopword removal, and a
# regex-based alternative tokenizer.
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

# Fetch tokenizer models and the stopword list (no-op if already cached).
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

text = "Natural Language Processing is fascinating! It's used everywhere."

# Split the text into sentences, then into word tokens.
sentences = sent_tokenize(text)
print(f"Sentences: {sentences}")
words = word_tokenize(text)
print(f"Words: {words}")

# Keep only alphabetic tokens that are not English stopwords.
stop_words = set(stopwords.words('english'))
filtered_words = [tok for tok in words if tok.lower() not in stop_words and tok.isalpha()]
print(f"Filtered words: {filtered_words}")

# A dependency-free regex tokenizer for comparison.
custom_tokens = re.findall(r'\b\w+\b', text.lower())
print(f"Custom tokens: {custom_tokens}")
Text Normalization
# Text normalization: lowercasing, punctuation stripping, stemming vs. lemmatization.
import re
import nltk  # FIX: required for nltk.download('wordnet') below; missing in original
from nltk.stem import PorterStemmer, WordNetLemmatizer

text = "The running dogs are running quickly through the forest"

# Lowercase, then drop every character that is not a word char or whitespace.
text_lower = text.lower()
text_clean = re.sub(r'[^\w\s]', '', text_lower)

# Stemming (aggressive): chops suffixes heuristically and may yield non-words.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(word) for word in text_clean.split()]
print(f"Stemmed: {stemmed}")

# Lemmatization (more accurate): maps tokens to dictionary base forms.
# Needs the WordNet corpus, downloaded on first use.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in text_clean.split()]
print(f"Lemmatized: {lemmatized}")

# Collapse any runs of whitespace into single spaces.
text_normalized = ' '.join(text_clean.split())
print(f"Normalized: {text_normalized}")
Sentiment Analysis
Using Pre-trained Models
# Off-the-shelf sentiment scoring with the default Hugging Face pipeline.
from transformers import pipeline

sentiment_analyzer = pipeline("sentiment-analysis")

examples = [
    "I absolutely love this product!",
    "This is terrible and disappointing.",
    "It's okay, nothing special.",
    "Amazing quality and fast shipping!"
]

# The pipeline returns a one-element list of {label, score} per input.
for text in examples:
    prediction = sentiment_analyzer(text)[0]
    print(f"Text: {text}")
    print(f"Sentiment: {prediction['label']}, Score: {prediction['score']:.4f}\n")
Fine-tuning for Custom Sentiment
# Fine-tuning DistilBERT for binary sentiment on a toy labelled corpus.
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Toy supervised data; label 1 = positive, 0 = negative.
train_data = {
    'text': [
        "This product is amazing!",
        "Terrible quality",
        "Love it!",
        "Waste of money",
        "Excellent service",
        "Very disappointed"
    ],
    'label': [1, 0, 1, 0, 1, 0]
}
dataset = Dataset.from_dict(train_data)

# Base checkpoint plus a freshly initialised 2-class classification head.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def tokenize_function(examples):
    """Tokenize a batch of examples to fixed-length (128) padded inputs."""
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Checkpoint every 10 optimizer steps, keeping at most the 2 most recent.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Uncomment to actually run the fine-tuning loop:
# trainer.train()
Named Entity Recognition (NER)
# Named Entity Recognition: tag people, organisations, and locations.
from transformers import pipeline

# aggregation_strategy="simple" merges word-piece tokens back into whole entities.
ner_pipeline = pipeline("ner", aggregation_strategy="simple")

text = "Apple Inc. was founded by Steve Jobs in Cupertino, California."
for entity in ner_pipeline(text):
    print(f"Entity: {entity['word']}, Type: {entity['entity_group']}, Score: {entity['score']:.4f}")

# Expected output:
# Entity: Apple Inc., Type: ORG, Score: 0.9999
# Entity: Steve Jobs, Type: PER, Score: 0.9998
# Entity: Cupertino, Type: LOC, Score: 0.9997
# Entity: California, Type: LOC, Score: 0.9996
Text Classification
# Zero-shot text classification: score arbitrary candidate labels with an NLI model.
from transformers import pipeline

classifier = pipeline("zero-shot-classification")

# Single-label: scores across candidate labels are softmax-normalised.
text = "I love playing football and basketball"
candidate_labels = ["sports", "politics", "technology", "food"]
result = classifier(text, candidate_labels)
print(f"Text: {text}")
print(f"Classification: {result['labels'][0]} (score: {result['scores'][0]:.4f})")

# Multi-label: each label is scored independently, so several can pass the
# threshold at once.
texts = [
    "This movie is funny and entertaining",
    "The weather is sunny and warm",
    "I'm sad and disappointed"
]
emotions = ["happy", "sad", "angry", "surprised"]
for text in texts:
    # FIX: the parameter is `multi_label`; the old `multi_class` name was
    # deprecated and then removed from the transformers pipeline API.
    result = classifier(text, emotions, multi_label=True)
    print(f"Text: {text}")
    for label, score in zip(result['labels'], result['scores']):
        if score > 0.5:
            print(f" {label}: {score:.4f}")
Question Answering
# Extractive question answering: the answer is a span copied out of the context.
from transformers import pipeline

qa_pipeline = pipeline("question-answering")

context = """
The Great Wall of China is one of the most impressive structures ever built.
It stretches over 13,000 miles across northern China. Construction began in the 7th century BC
and continued for centuries. The wall was built to protect against invasions from northern nomads.
"""

questions = [
    "How long is the Great Wall of China?",
    "When was construction started?",
    "Why was the wall built?"
]

for q in questions:
    hit = qa_pipeline(question=q, context=context)
    print(f"Q: {q}")
    print(f"A: {hit['answer']} (confidence: {hit['score']:.4f})\n")
Text Summarization
# Abstractive summarization: compress a passage to between 25 and 50 tokens.
from transformers import pipeline

summarizer = pipeline("summarization")

text = """
Machine learning is a subset of artificial intelligence that focuses on the development
of algorithms and statistical models that enable computers to improve their performance
on tasks through experience. Unlike traditional programming where explicit instructions
are provided, machine learning systems learn patterns from data. Deep learning, a subset
of machine learning, uses neural networks with multiple layers to learn hierarchical
representations of data. This has led to breakthroughs in computer vision, natural language
processing, and many other domains.
"""

# do_sample=False selects greedy/beam decoding, so output is deterministic.
condensed = summarizer(text, max_length=50, min_length=25, do_sample=False)[0]['summary_text']
print(f"Original length: {len(text.split())} words")
print(f"Summary: {condensed}")
print(f"Summary length: {len(condensed.split())} words")
Text Generation
# Open-ended text generation with GPT-2.
from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")

# Continue each prompt up to 50 tokens, drawing a single sample apiece.
for prompt in ["The future of artificial intelligence",
               "Machine learning is",
               "In the year 2050"]:
    completion = generator(prompt, max_length=50, num_return_sequences=1)[0]['generated_text']
    print(f"Prompt: {prompt}")
    print(f"Generated: {completion}\n")
Semantic Similarity
# Semantic similarity: embed sentences once, then compare with cosine similarity.
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')

sentences = [
    "The cat is on the mat",
    "A feline rests on the rug",
    "The dog is playing in the park",
    "I love programming in Python"
]

# One encode call for the whole batch; pairwise cosine over the result.
embeddings = model.encode(sentences)
similarity_matrix = cosine_similarity(embeddings)
print("Semantic Similarity Matrix:")
print(similarity_matrix)

# Nearest neighbour to a query sentence. argsort is ascending, so [-2] is
# the runner-up — skipping the query itself, which (being in `sentences`)
# is always its own best match.
query = "The cat is on the mat"
query_embedding = model.encode(query)
similarities = cosine_similarity([query_embedding], embeddings)[0]
most_similar_idx = np.argsort(similarities)[-2]
print(f"\nQuery: {query}")
print(f"Most similar: {sentences[most_similar_idx]}")
print(f"Similarity: {similarities[most_similar_idx]:.4f}")
Information Extraction
# Crude relation extraction: zero-shot score a set of relation names and
# take the best one as the sentence's dominant relation.
from transformers import pipeline

classifier = pipeline("zero-shot-classification")

text = "Apple Inc. was founded by Steve Jobs in 1976"
relations = ["founder", "location", "founding_year", "company_name"]

result = classifier(text, relations)
print(f"Text: {text}")
print(f"Primary relation: {result['labels'][0]}")
Building a Complete NLP Pipeline
from transformers import pipeline
import re
class NLPPipeline:
    """Bundle sentiment analysis, NER, and simple text statistics behind one call."""

    def __init__(self):
        # One Hugging Face pipeline per sub-task, loaded up front.
        self.sentiment = pipeline("sentiment-analysis")
        self.ner = pipeline("ner", aggregation_strategy="simple")
        self.qa = pipeline("question-answering")

    def analyze_text(self, text):
        """Comprehensive text analysis: sentiment, entities, and surface stats."""
        # Sentiment: the pipeline returns a one-element list of {label, score}.
        top = self.sentiment(text)[0]
        # Entities: rename pipeline keys into this class's own schema.
        entity_rows = [
            {'text': e['word'], 'type': e['entity_group'], 'score': e['score']}
            for e in self.ner(text)
        ]
        # Surface statistics; sentence count is approximate — re.split leaves
        # a trailing empty string after final punctuation.
        stats = {
            'word_count': len(text.split()),
            'char_count': len(text),
            'sentence_count': len(re.split(r'[.!?]+', text)),
        }
        return {
            'sentiment': {'label': top['label'], 'score': top['score']},
            'entities': entity_rows,
            'stats': stats,
        }
# Usage example: run the combined analysis over one sample sentence.
nlp = NLPPipeline()
text = "Apple Inc. released a new iPhone! The product is amazing."
report = nlp.analyze_text(text)
print("Analysis Results:")
print(f"Sentiment: {report['sentiment']}")
print(f"Entities: {report['entities']}")
print(f"Statistics: {report['stats']}")
Best Practices
- Preprocess consistently: Apply same preprocessing to training and inference data
- Use appropriate models: Choose models based on task and computational constraints
- Handle out-of-vocabulary words: use subword tokenization (BPE, WordPiece) so unseen words decompose into known subword units
- Batch processing: Process multiple texts together for efficiency
- Cache embeddings: Reuse embeddings for similar texts
- Monitor performance: Track metrics on validation set
Common Pitfalls
Bad Practice:
# Don't: Use raw text without preprocessing
model.predict(raw_text)
# Don't: Process one text at a time
for text in texts:
result = pipeline(text) # Inefficient
# Don't: Ignore model limitations
result = pipeline(very_long_text) # May exceed max length
Good Practice:
# Do: Preprocess text
text = text.lower().strip()
text = re.sub(r'[^\w\s]', '', text)
# Do: Batch process
results = pipeline(texts, batch_size=32)
# Do: Handle long texts (note: model limits are measured in tokens, not
# characters — character slicing is only a rough guard; prefer the
# tokenizer's truncation=True / max_length options)
if len(text) > max_length:
    text = text[:max_length]
Conclusion
Modern NLP with transformers enables solving complex language understanding tasks. Leverage pre-trained models from Hugging Face for most applications, fine-tune for domain-specific tasks, and combine multiple NLP techniques for comprehensive text analysis. Understanding tokenization, embeddings, and transformer architectures enables building powerful NLP systems.
Comments