Python Data Analysis Tools: NumPy, pandas, matplotlib, and More

Introduction

Python has become the dominant language for data analysis, largely because of its rich ecosystem of scientific computing libraries. These tools provide functionality similar to MATLAB — matrix operations, statistical functions, and visualization — but as open-source Python packages that integrate seamlessly with each other.

NumPy: The Foundation

NumPy is the core library for numerical computing in Python. It provides the ndarray — a fast, memory-efficient multi-dimensional array — and a comprehensive set of mathematical functions.

pip install numpy

import numpy as np

# Building arrays
a = np.array([1, 2, 3, 4, 5])    # from a Python list
b = np.arange(0, 10, 2)          # start, stop (exclusive), step -> [0, 2, 4, 6, 8]
c = np.linspace(0, 1, 5)         # 5 evenly spaced points -> [0.0, 0.25, 0.5, 0.75, 1.0]
m = np.zeros((3, 4))             # 3 rows x 4 columns, every entry 0.0
r = np.random.randn(100)         # 100 draws from the standard normal distribution

# Element-wise arithmetic: each expression acts on the whole array at once,
# so no explicit Python loop is required.
doubled = a * 2
squared = a ** 2
roots = np.sqrt(a)
print(doubled)        # values: 2, 4, 6, 8, 10
print(squared)        # values: 1, 4, 9, 16, 25
print(roots)          # approximately 1.0, 1.41, 1.73, 2.0, 2.24

# Linear algebra on 2x2 matrices
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
product = A @ B                   # `@` is matrix multiplication, not element-wise
determinant = np.linalg.det(A)
inverse = np.linalg.inv(A)
print(product)
print(determinant)
print(inverse)

# Descriptive statistics on 1000 samples drawn from N(mean=50, std=10)
data = np.random.normal(loc=50, scale=10, size=1000)
print(f"Mean: {data.mean():.2f}")
print(f"Std:  {data.std():.2f}")
print(f"Min:  {data.min():.2f}, Max: {data.max():.2f}")

pandas: Data Manipulation and Analysis

pandas is built on top of NumPy and provides the DataFrame — a labeled, tabular data structure similar to a spreadsheet or SQL table.

pip install pandas

import pandas as pd

# Create a DataFrame from a dict that maps column names to equal-length lists
df = pd.DataFrame({
    'name':    ['Alice', 'Bob', 'Charlie', 'Diana'],
    'age':     [25, 30, 35, 28],
    'salary':  [70000, 85000, 95000, 78000],
    'dept':    ['Engineering', 'Marketing', 'Engineering', 'HR']
})

# Basic exploration
print(df.head())      # first rows (up to 5)
print(df.describe())  # summary statistics of the numeric columns
print(df.dtypes)      # data type of each column
print(df.shape)       # => (4, 4)

# Selection
print(df['name'])                    # single column
print(df[['name', 'salary']])        # multiple columns
print(df[df['salary'] > 80000])      # filter rows with a boolean mask
print(df.loc[0])                     # row by label
print(df.iloc[0:2])                  # rows by position

# Aggregation
print(df.groupby('dept')['salary'].mean())
print(df.groupby('dept').agg({'salary': ['mean', 'max', 'count']}))

# Sorting
print(df.sort_values('salary', ascending=False))

# Adding columns (computed element-wise from existing columns)
df['bonus'] = df['salary'] * 0.1
df['senior'] = df['age'] > 30

# Read/write files
df.to_csv('employees.csv', index=False)
# Writing .xlsx needs the optional openpyxl package, which `pip install pandas`
# does not pull in (install it with `pip install openpyxl`). pandas raises
# ImportError when it is missing, so skip the export rather than abort.
try:
    df.to_excel('employees.xlsx', index=False)
except ImportError:
    print("Skipping Excel export: run 'pip install openpyxl' to enable it")
df2 = pd.read_csv('employees.csv')

Handling Missing Data

# None in a numeric column is stored as NaN (the column becomes float).
df = pd.DataFrame({
    'a': [1, 2, None, 4],
    'b': [5, None, 7, 8]
})

print(df.isnull().sum())     # count missing per column

# dropna() and fillna() return NEW objects and leave `df` unchanged,
# so the result has to be bound to a name (or assigned back) to be used.
df_dropped = df.dropna()                         # remove rows with any NaN
df_zero_filled = df.fillna(0)                    # fill NaN with 0
a_mean_filled = df['a'].fillna(df['a'].mean())   # fill with column mean (NaN is skipped by mean())

print(df_dropped)
print(df_zero_filled)
print(a_mean_filled)

matplotlib: Foundational Plotting

matplotlib is the base plotting library. It produces publication-quality figures in many formats.

pip install matplotlib

import matplotlib.pyplot as plt
import numpy as np

# Line plot: sine and cosine sampled at 100 points over one full period
x = np.linspace(0, 2 * np.pi, 100)
sine = np.sin(x)
cosine = np.cos(x)
plt.figure(figsize=(10, 4))
plt.plot(x, sine, label='sin(x)', color='blue')
plt.plot(x, cosine, label='cos(x)', color='red', linestyle='--')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Trigonometric Functions')
plt.legend()                        # uses the label= given to each plot() call
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('trig.png', dpi=150)    # save before show()
plt.show()

# Histogram of 1000 normally distributed samples, with the mean marked
data = np.random.normal(50, 10, 1000)
hist_style = dict(bins=30, edgecolor='black', color='steelblue', alpha=0.7)
plt.figure(figsize=(8, 5))
plt.hist(data, **hist_style)
plt.axvline(data.mean(), color='red', linestyle='--', label=f'Mean: {data.mean():.1f}')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Normal Distribution')
plt.legend()
plt.show()

# Scatter plot: a noisy linear relationship, points colored by their y value
x = np.random.randn(100)
noise = np.random.randn(100) * 0.5
y = 2 * x + noise
plt.scatter(x, y, alpha=0.6, c=y, cmap='viridis')
plt.colorbar(label='y value')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Scatter Plot with Color Mapping')
plt.show()

Subplots

# Build a 2x2 grid of axes; `axes` is a 2-D array indexed as [row, column].
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Use a sorted, evenly spaced domain for the two curves. Reusing the random
# `x` left over from the scatter example would make plot() join the points
# in random order and draw a scribble instead of a smooth wave.
x = np.linspace(0, 2 * np.pi, 100)

axes[0, 0].plot(x, np.sin(x))
axes[0, 0].set_title('Sine')

axes[0, 1].plot(x, np.cos(x), color='orange')
axes[0, 1].set_title('Cosine')

axes[1, 0].hist(np.random.randn(500), bins=20)
axes[1, 0].set_title('Histogram')

axes[1, 1].scatter(np.random.randn(50), np.random.randn(50))
axes[1, 1].set_title('Scatter')

plt.tight_layout()  # keep titles and tick labels from overlapping
plt.show()

seaborn: Statistical Visualization

seaborn is built on matplotlib and provides a higher-level interface for statistical graphics. It integrates directly with pandas DataFrames.

pip install seaborn

import matplotlib.pyplot as plt  # seaborn draws onto matplotlib; needed for title()/show()
import pandas as pd
import seaborn as sns

# Load a built-in example dataset (returned as a pandas DataFrame).
# NOTE: load_dataset fetches the data online, so it needs an internet connection.
tips = sns.load_dataset('tips')

# Distribution plot: histogram with a kernel density estimate overlaid
sns.histplot(tips['total_bill'], kde=True)
plt.title('Distribution of Total Bill')
plt.show()

# Box plot: one box per day, split by the `sex` column via hue
sns.boxplot(data=tips, x='day', y='total_bill', hue='sex')
plt.title('Total Bill by Day and Gender')
plt.show()

# Scatter with regression line
sns.regplot(data=tips, x='total_bill', y='tip')
plt.title('Tip vs Total Bill')
plt.show()

# Heatmap (correlation matrix) — restrict to numeric columns before corr()
corr = tips.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

# Pair plot — all pairwise relationships
sns.pairplot(tips, hue='sex', diag_kind='kde')
plt.show()

Plotly: Interactive Charts

Plotly creates interactive, web-ready charts:

pip install plotly

import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Interactive scatter plot (bubble chart) of the bundled gapminder data for 2007
df = px.data.gapminder().query("year == 2007")
fig = px.scatter(
    df, x='gdpPercap', y='lifeExp',
    size='pop', color='continent',
    hover_name='country',
    log_x=True,
    title='GDP vs Life Expectancy (2007)'
)
fig.show()  # opens in browser

# Interactive line chart
# Define the domain here: mode='lines' joins points in array order, so `x`
# must be sorted. Relying on an `x` left over from an earlier example (random,
# unsorted values) would draw a zig-zag instead of smooth curves.
x = np.linspace(0, 2 * np.pi, 100)
fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=np.sin(x), name='sin(x)', mode='lines'))
fig.add_trace(go.Scatter(x=x, y=np.cos(x), name='cos(x)', mode='lines'))
fig.update_layout(title='Interactive Trig Functions')
fig.show()

The Data Analysis Stack

NumPy          — arrays, math, linear algebra
pandas         — DataFrames, data wrangling, I/O
matplotlib     — base plotting, full control
seaborn        — statistical plots, prettier defaults
plotly         — interactive charts for web/dashboards
scikit-learn   — machine learning
scipy          — scientific computing (stats, optimization, signal processing)

Quick Comparison

Library	Best For	Learning Curve
NumPy	Array math, linear algebra	Medium
pandas	Data wrangling, tabular data	Medium
matplotlib	Any plot, full control	High
seaborn	Statistical plots, quick EDA	Low
Plotly	Interactive/web charts	Low-Medium