Introduction
Python has become the dominant language for data analysis, largely because of its rich ecosystem of scientific computing libraries. These tools provide functionality similar to MATLAB — matrix operations, statistical functions, and visualization — but as open-source Python packages that integrate seamlessly with each other.
NumPy: The Foundation
NumPy is the core library for numerical computing in Python. It provides the ndarray — a fast, memory-efficient multi-dimensional array — and a comprehensive set of mathematical functions.
pip install numpy
import numpy as np
# Create arrays
a = np.array([1, 2, 3, 4, 5]) # 1-D integer array built from a Python list
b = np.arange(0, 10, 2) # [0, 2, 4, 6, 8] -- start, stop (exclusive), step
c = np.linspace(0, 1, 5) # [0.0, 0.25, 0.5, 0.75, 1.0] -- 5 points, both ends included
m = np.zeros((3, 4)) # 3x4 matrix of zeros
r = np.random.randn(100) # 100 samples from the standard normal distribution
# Vectorized operations (no loops needed): each operator applies element-wise
print(a * 2) # => [ 2  4  6  8 10]
print(a ** 2) # => [ 1  4  9 16 25]
print(np.sqrt(a)) # => approx. [1.0 1.41 1.73 2.0 2.24] (printed at full float precision)
# Matrix operations on two 2x2 matrices
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
print(A @ B) # matrix multiplication => [[19 22] [43 50]]
print(np.linalg.det(A)) # determinant => approx. -2.0 (1*4 - 2*3, computed in floating point)
print(np.linalg.inv(A)) # inverse => approx. [[-2.0 1.0] [1.5 -0.5]]
# Statistics on 1000 draws from a normal distribution with mean 50 and std 10;
# the draws are unseeded, so the printed values differ from run to run
data = np.random.normal(loc=50, scale=10, size=1000)
print(f"Mean: {data.mean():.2f}")
print(f"Std: {data.std():.2f}")
print(f"Min: {data.min():.2f}, Max: {data.max():.2f}")
pandas: Data Manipulation and Analysis
pandas is built on top of NumPy and provides the DataFrame — a labeled, tabular data structure similar to a spreadsheet or SQL table.
pip install pandas
import pandas as pd
# Create a DataFrame from a dict: each key becomes a column, each list its values
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
'age': [25, 30, 35, 28],
'salary': [70000, 85000, 95000, 78000],
'dept': ['Engineering', 'Marketing', 'Engineering', 'HR']
})
# Basic exploration
print(df.head()) # first rows (all 4 here)
print(df.describe()) # summary statistics for the numeric columns
print(df.dtypes) # data type of each column
print(df.shape) # => (4, 4)
# Selection
print(df['name']) # single column
print(df[['name', 'salary']]) # multiple columns
print(df[df['salary'] > 80000]) # filter rows with a boolean mask (Bob and Charlie)
print(df.loc[0]) # row by label (the default index labels are 0..3)
print(df.iloc[0:2]) # rows by position (first two rows; end is exclusive)
# Aggregation: split rows by department, then summarize salary per group
print(df.groupby('dept')['salary'].mean())
print(df.groupby('dept').agg({'salary': ['mean', 'max', 'count']}))
# Sorting (highest salary first)
print(df.sort_values('salary', ascending=False))
# Adding columns: assigning to a new key creates the column in place
df['bonus'] = df['salary'] * 0.1
df['senior'] = df['age'] > 30 # boolean column
# Read/write files (index=False leaves the row index out of the output)
df.to_csv('employees.csv', index=False)
df.to_excel('employees.xlsx', index=False) # NOTE: writing .xlsx needs an Excel engine such as openpyxl installed
df2 = pd.read_csv('employees.csv') # reload the CSV written above
Handling Missing Data
# Small frame with one gap per column; None in a numeric column is stored as NaN.
df = pd.DataFrame({
    'a': [1, 2, None, 4],
    'b': [5, None, 7, 8]
})
print(df.isnull().sum()) # count missing per column
# dropna() and fillna() return NEW objects and leave df unchanged, so the
# result must be bound to a name (or printed) -- a bare call in a script
# silently throws the cleaned data away.
dropped = df.dropna() # remove rows with any NaN
zero_filled = df.fillna(0) # fill NaN with 0
a_mean_filled = df['a'].fillna(df['a'].mean()) # fill with column mean (mean skips NaN)
print(dropped)
print(zero_filled)
print(a_mean_filled)
matplotlib: Foundational Plotting
matplotlib is the base plotting library. It produces publication-quality figures in many formats.
pip install matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Line plot: sin and cos over 100 evenly spaced points on [0, 2*pi]
x = np.linspace(0, 2 * np.pi, 100)
plt.figure(figsize=(10, 4)) # start a new 10x4 inch figure
plt.plot(x, np.sin(x), label='sin(x)', color='blue')
plt.plot(x, np.cos(x), label='cos(x)', color='red', linestyle='--')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Trigonometric Functions')
plt.legend() # uses the label= values given to plot()
plt.grid(True, alpha=0.3) # faint grid lines
plt.tight_layout()
plt.savefig('trig.png', dpi=150) # write the figure to trig.png
plt.show()
# Histogram of 1000 normal samples (mean 50, std 10) in 30 bins
data = np.random.normal(50, 10, 1000)
plt.figure(figsize=(8, 5))
plt.hist(data, bins=30, edgecolor='black', color='steelblue', alpha=0.7)
# dashed vertical line marking the sample mean, labelled for the legend
plt.axvline(data.mean(), color='red', linestyle='--', label=f'Mean: {data.mean():.1f}')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Normal Distribution')
plt.legend()
plt.show()
# Scatter plot
# NOTE: x is rebound here to 100 unsorted random samples; it is no longer the
# evenly spaced grid from the line plot above
x = np.random.randn(100)
y = 2 * x + np.random.randn(100) * 0.5 # linear relation (slope 2) plus noise
plt.scatter(x, y, alpha=0.6, c=y, cmap='viridis') # color each point by its y value
plt.colorbar(label='y value')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Scatter Plot with Color Mapping')
plt.show()
Subplots
# Rebuild the evenly spaced grid first: the scatter example above rebound x to
# unsorted random samples, which would draw the sine/cosine panels as a
# tangled line instead of a smooth curve.
x = np.linspace(0, 2 * np.pi, 100)
# 2x2 grid of panels; axes[row, col] selects one panel
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes[0, 0].plot(x, np.sin(x))
axes[0, 0].set_title('Sine')
axes[0, 1].plot(x, np.cos(x), color='orange')
axes[0, 1].set_title('Cosine')
# The bottom row draws fresh random samples and does not depend on x
axes[1, 0].hist(np.random.randn(500), bins=20)
axes[1, 0].set_title('Histogram')
axes[1, 1].scatter(np.random.randn(50), np.random.randn(50))
axes[1, 1].set_title('Scatter')
plt.tight_layout()
plt.show()
seaborn: Statistical Visualization
seaborn is built on matplotlib and provides a higher-level interface for statistical graphics. It integrates directly with pandas DataFrames.
pip install seaborn
import seaborn as sns
import pandas as pd
# seaborn draws onto matplotlib figures; titles and show() go through pyplot,
# so import it here to keep this snippet runnable on its own
import matplotlib.pyplot as plt
# Load a built-in dataset (returned as a pandas DataFrame)
# NOTE: load_dataset fetches the data online, so it needs network access
tips = sns.load_dataset('tips')
# Distribution plot: histogram with a kernel density estimate overlaid
sns.histplot(tips['total_bill'], kde=True)
plt.title('Distribution of Total Bill')
plt.show()
# Box plot: one box per day, split by the 'sex' column
sns.boxplot(data=tips, x='day', y='total_bill', hue='sex')
plt.title('Total Bill by Day and Gender')
plt.show()
# Scatter with regression line
sns.regplot(data=tips, x='total_bill', y='tip')
plt.title('Tip vs Total Bill')
plt.show()
# Heatmap (correlation matrix); only numeric columns can be correlated
corr = tips.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()
# Pair plot — all pairwise relationships, with KDE curves on the diagonal
sns.pairplot(tips, hue='sex', diag_kind='kde')
plt.show()
Plotly: Interactive Charts
Plotly creates interactive, web-ready charts:
pip install plotly
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
# Interactive scatter plot of the built-in gapminder data, 2007 rows only
df = px.data.gapminder().query("year == 2007")
fig = px.scatter(
    df, x='gdpPercap', y='lifeExp',
    size='pop', color='continent',  # marker size by population, color by continent
    hover_name='country',
    log_x=True,  # logarithmic x-axis for GDP per capita
    title='GDP vs Life Expectancy (2007)'
)
fig.show() # opens in browser when run as a script
# Interactive line chart
# Build the evenly spaced grid here: mode='lines' joins points in array order,
# and x from the earlier scatter example is unsorted random data.
x = np.linspace(0, 2 * np.pi, 100)
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=np.sin(x), name='sin(x)', mode='lines'))
fig.add_trace(go.Scatter(x=x, y=np.cos(x), name='cos(x)', mode='lines'))
fig.update_layout(title='Interactive Trig Functions')
fig.show()
The Data Analysis Stack
NumPy — arrays, math, linear algebra
pandas — DataFrames, data wrangling, I/O
matplotlib — base plotting, full control
seaborn — statistical plots, prettier defaults
Plotly — interactive charts for web/dashboards
scikit-learn — machine learning
SciPy — scientific computing (stats, optimization, signal processing)
Quick Comparison
| Library | Best For | Learning Curve |
|---|---|---|
| NumPy | Array math, linear algebra | Medium |
| pandas | Data wrangling, tabular data | Medium |
| matplotlib | Any plot, full control | High |
| seaborn | Statistical plots, quick EDA | Low |
| Plotly | Interactive/web charts | Low-Medium |
Resources
- NumPy Documentation
- pandas Documentation
- matplotlib Documentation
- seaborn Documentation
- Plotly Python Documentation
- Python Data Science Handbook (free online)
Comments