Skip to main content
⚡ Calmops

Natural Language Processing with Transformers: Practical Guide

Natural Language Processing with Transformers: Practical Guide

Natural Language Processing enables computers to understand and generate human language. Modern NLP is powered by transformer models like BERT, GPT, and specialized architectures for specific tasks.

Text Preprocessing Fundamentals

Tokenization

# Tokenization demo: sentence-, word-, stopword-, and regex-based splitting.
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

# Fetch tokenizer models and stopword lists (no-op if already cached)
nltk.download('punkt')
nltk.download('stopwords')

text = "Natural Language Processing is fascinating! It's used everywhere."

# Split the text into sentences
sents = sent_tokenize(text)
print(f"Sentences: {sents}")

# Split into word-level tokens (punctuation becomes separate tokens)
tokens = word_tokenize(text)
print(f"Words: {tokens}")

# Keep only alphabetic tokens that are not English stopwords
stop_set = set(stopwords.words('english'))
kept = []
for tok in tokens:
    if tok.lower() not in stop_set and tok.isalpha():
        kept.append(tok)
print(f"Filtered words: {kept}")

# Simple regex tokenizer over the lowercased text
custom_tokens = re.findall(r'\b\w+\b', text.lower())
print(f"Custom tokens: {custom_tokens}")

Text Normalization

# Text normalization: lowercasing, punctuation removal, stemming, lemmatization.
import re

import nltk  # FIX: was missing — nltk.download() below raised NameError when run standalone
from nltk.stem import PorterStemmer, WordNetLemmatizer

text = "The running dogs are running quickly through the forest"

# Lowercasing
text_lower = text.lower()

# Remove punctuation (keep word characters and whitespace)
text_clean = re.sub(r'[^\w\s]', '', text_lower)

# Stemming (aggressive: chops suffixes, may yield non-words like "quickli")
stemmer = PorterStemmer()
stemmed = [stemmer.stem(word) for word in text_clean.split()]
print(f"Stemmed: {stemmed}")

# Lemmatization (more accurate: maps words to dictionary lemmas)
nltk.download('wordnet')  # lexical database required by WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in text_clean.split()]
print(f"Lemmatized: {lemmatized}")

# Collapse runs of whitespace into single spaces
text_normalized = ' '.join(text_clean.split())
print(f"Normalized: {text_normalized}")

Sentiment Analysis

Using Pre-trained Models

# Off-the-shelf sentiment scoring with the default Hugging Face checkpoint.
from transformers import pipeline

# Ready-to-use classifier; downloads the default model on first call
sentiment_analyzer = pipeline("sentiment-analysis")

samples = [
    "I absolutely love this product!",
    "This is terrible and disappointing.",
    "It's okay, nothing special.",
    "Amazing quality and fast shipping!"
]

for text in samples:
    prediction = sentiment_analyzer(text)[0]
    print(f"Text: {text}")
    print(f"Sentiment: {prediction['label']}, Score: {prediction['score']:.4f}\n")

Fine-tuning for Custom Sentiment

# Fine-tune DistilBERT for binary sentiment on a toy in-memory dataset.
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Toy labelled corpus
train_data = {
    'text': [
        "This product is amazing!",
        "Terrible quality",
        "Love it!",
        "Waste of money",
        "Excellent service",
        "Very disappointed"
    ],
    'label': [1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative
}

# Wrap the raw dict in a Hugging Face Dataset
raw_dataset = Dataset.from_dict(train_data)

# Pre-trained encoder plus a freshly initialised 2-class head
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

def tokenize_function(batch):
    """Pad/truncate every example to a fixed 128-token window."""
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)

tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)

# Optimisation and checkpointing settings
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Uncomment to launch the actual fine-tuning loop
# trainer.train()

Named Entity Recognition (NER)

# Named entity recognition with the default HF NER checkpoint.
from transformers import pipeline

# "simple" aggregation merges sub-word pieces into whole-entity spans
ner_pipeline = pipeline("ner", aggregation_strategy="simple")

text = "Apple Inc. was founded by Steve Jobs in Cupertino, California."

for entity in ner_pipeline(text):
    print(f"Entity: {entity['word']}, Type: {entity['entity_group']}, Score: {entity['score']:.4f}")

# Output:
# Entity: Apple Inc., Type: ORG, Score: 0.9999
# Entity: Steve Jobs, Type: PER, Score: 0.9998
# Entity: Cupertino, Type: LOC, Score: 0.9997
# Entity: California, Type: LOC, Score: 0.9996

Text Classification

# Zero-shot classification: score arbitrary candidate labels with an NLI model.
from transformers import pipeline

classifier = pipeline("zero-shot-classification")

text = "I love playing football and basketball"
candidate_labels = ["sports", "politics", "technology", "food"]

# Single-label mode: the scores across labels sum to 1
result = classifier(text, candidate_labels)
print(f"Text: {text}")
print(f"Classification: {result['labels'][0]} (score: {result['scores'][0]:.4f})")

# Multi-label classification: each label is scored independently (sigmoid per label)
texts = [
    "This movie is funny and entertaining",
    "The weather is sunny and warm",
    "I'm sad and disappointed"
]

emotions = ["happy", "sad", "angry", "surprised"]

for text in texts:
    # FIX: the parameter is `multi_label`; the old `multi_class` name was
    # deprecated and has been removed from transformers, so it raises on
    # current versions.
    result = classifier(text, emotions, multi_label=True)
    print(f"Text: {text}")
    for label, score in zip(result['labels'], result['scores']):
        if score > 0.5:  # report only confidently matched labels
            print(f"  {label}: {score:.4f}")

Question Answering

# Extractive question answering over a fixed context passage.
from transformers import pipeline

qa_pipeline = pipeline("question-answering")

context = """
The Great Wall of China is one of the most impressive structures ever built.
It stretches over 13,000 miles across northern China. Construction began in the 7th century BC
and continued for centuries. The wall was built to protect against invasions from northern nomads.
"""

questions = [
    "How long is the Great Wall of China?",
    "When was construction started?",
    "Why was the wall built?"
]

# The model extracts each answer as a literal span of the context
for question in questions:
    answer = qa_pipeline(question=question, context=context)
    print(f"Q: {question}")
    print(f"A: {answer['answer']} (confidence: {answer['score']:.4f})\n")

Text Summarization

# Abstractive summarization of a paragraph with the default HF model.
from transformers import pipeline

summarizer = pipeline("summarization")

text = """
Machine learning is a subset of artificial intelligence that focuses on the development
of algorithms and statistical models that enable computers to improve their performance
on tasks through experience. Unlike traditional programming where explicit instructions
are provided, machine learning systems learn patterns from data. Deep learning, a subset
of machine learning, uses neural networks with multiple layers to learn hierarchical
representations of data. This has led to breakthroughs in computer vision, natural language
processing, and many other domains.
"""

# Greedy decoding (do_sample=False) keeps the summary deterministic
result = summarizer(text, max_length=50, min_length=25, do_sample=False)
summary_text = result[0]['summary_text']
print(f"Original length: {len(text.split())} words")
print(f"Summary: {summary_text}")
print(f"Summary length: {len(summary_text.split())} words")

Text Generation

# Open-ended text continuation with GPT-2.
from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")

prompts = [
    "The future of artificial intelligence",
    "Machine learning is",
    "In the year 2050"
]

# max_length counts prompt tokens plus the generated continuation
for prompt in prompts:
    completions = generator(prompt, max_length=50, num_return_sequences=1)
    print(f"Prompt: {prompt}")
    print(f"Generated: {completions[0]['generated_text']}\n")

Semantic Similarity

# Sentence embeddings and cosine-similarity search.
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Small, fast general-purpose embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

sentences = [
    "The cat is on the mat",
    "A feline rests on the rug",
    "The dog is playing in the park",
    "I love programming in Python"
]

# One embedding vector per sentence
embeddings = model.encode(sentences)

# Pairwise cosine similarities between every sentence pair
similarity_matrix = cosine_similarity(embeddings)

print("Semantic Similarity Matrix:")
print(similarity_matrix)

# Rank all sentences against a query sentence
query = "The cat is on the mat"
query_embedding = model.encode(query)

similarities = cosine_similarity([query_embedding], embeddings)[0]
ranked = np.argsort(similarities)[::-1]  # indices, best match first
most_similar_idx = ranked[1]  # ranked[0] is the query itself — skip it

print(f"\nQuery: {query}")
print(f"Most similar: {sentences[most_similar_idx]}")
print(f"Similarity: {similarities[most_similar_idx]:.4f}")

Information Extraction

# Crude relation "extraction": rank candidate relation labels via zero-shot NLI.
from transformers import pipeline

classifier = pipeline("zero-shot-classification")

text = "Apple Inc. was founded by Steve Jobs in 1976"

# Candidate relation types to score against the sentence
relations = [
    "founder",
    "location",
    "founding_year",
    "company_name"
]

scored = classifier(text, relations)
print(f"Text: {text}")
print(f"Primary relation: {scored['labels'][0]}")

Building a Complete NLP Pipeline

from transformers import pipeline
import re

class NLPPipeline:
    """Bundle sentiment analysis, NER, and QA pipelines behind one interface."""

    def __init__(self):
        # Each pipeline downloads its default checkpoint on first construction
        self.sentiment = pipeline("sentiment-analysis")
        self.ner = pipeline("ner", aggregation_strategy="simple")
        self.qa = pipeline("question-answering")

    def analyze_text(self, text):
        """Run sentiment, NER, and basic statistics over ``text``.

        Args:
            text: Input string to analyze.

        Returns:
            dict with keys:
              - 'sentiment': {'label', 'score'} from the sentiment pipeline
              - 'entities': list of {'text', 'type', 'score'} from NER
              - 'stats': {'word_count', 'char_count', 'sentence_count'}
        """
        results = {}

        # Sentiment analysis: single label plus confidence
        sentiment = self.sentiment(text)
        results['sentiment'] = {
            'label': sentiment[0]['label'],
            'score': sentiment[0]['score']
        }

        # Named entity recognition (aggregated entity spans)
        entities = self.ner(text)
        results['entities'] = [
            {
                'text': e['word'],
                'type': e['entity_group'],
                'score': e['score']
            }
            for e in entities
        ]

        # Text statistics. FIX: drop empty/whitespace fragments so trailing
        # punctuation does not produce a phantom extra "sentence" (splitting
        # "A. B." on [.!?]+ yields a trailing empty string).
        sentence_fragments = [s for s in re.split(r'[.!?]+', text) if s.strip()]
        results['stats'] = {
            'word_count': len(text.split()),
            'char_count': len(text),
            'sentence_count': len(sentence_fragments)
        }

        return results

# Usage
nlp = NLPPipeline()
text = "Apple Inc. released a new iPhone! The product is amazing."
analysis = nlp.analyze_text(text)

print("Analysis Results:")
print(f"Sentiment: {analysis['sentiment']}")
print(f"Entities: {analysis['entities']}")
print(f"Statistics: {analysis['stats']}")

Best Practices

  1. Preprocess consistently: Apply same preprocessing to training and inference data
  2. Use appropriate models: Choose models based on task and computational constraints
  3. Handle out-of-vocabulary: Use subword tokenization (BPE, WordPiece)
  4. Batch processing: Process multiple texts together for efficiency
  5. Cache embeddings: Reuse embeddings for similar texts
  6. Monitor performance: Track metrics on validation set

Common Pitfalls

Bad Practice:

# Anti-pattern illustrations (names here are placeholders, not defined above).
# Don't: Use raw text without preprocessing
model.predict(raw_text)

# Don't: Process one text at a time
for text in texts:
    result = pipeline(text)  # Inefficient: one forward pass per call, no batching

# Don't: Ignore model limitations
result = pipeline(very_long_text)  # May exceed max length and be truncated or error

Good Practice:

# Recommended patterns (names here are placeholders, not defined above).
# Do: Preprocess text
text = text.lower().strip()
text = re.sub(r'[^\w\s]', '', text)  # strip punctuation, keep word chars/whitespace

# Do: Batch process
results = pipeline(texts, batch_size=32)  # amortizes model overhead across inputs

# Do: Handle long texts
if len(text) > max_length:
    text = text[:max_length]  # naive char-level truncation; token-aware truncation is safer

Conclusion

Modern NLP with transformers enables solving complex language understanding tasks. Leverage pre-trained models from Hugging Face for most applications, fine-tune for domain-specific tasks, and combine multiple NLP techniques for comprehensive text analysis. Understanding tokenization, embeddings, and transformer architectures enables building powerful NLP systems.

Comments