Tutorials

Correlation Analysis with SciPy

Correlation analysis measures the strength and direction of association between variables. SciPy supports several correlation coefficients, each with different assumptions and use cases.

### Pearson Correlation

import numpy as np
from scipy import stats

# Seed the global NumPy generator so the synthetic sample is reproducible.
np.random.seed(15)

# Build a linear relationship: y is 0.8 * x plus independent Gaussian noise.
n_samples = 120
x = np.random.normal(loc=0, scale=1, size=n_samples)
noise = np.random.normal(loc=0, scale=0.5, size=n_samples)
y = 0.8 * x + noise

# pearsonr yields the correlation coefficient and its p-value.
r, p_value = stats.pearsonr(x, y)
print(f"Pearson r: {r:.3f}\nP-value: {p_value:.6f}")

Output:

```text
Pearson r: 0.875
P-value: 0.000000
```

### Comparing Pearson, Spearman, and Kendall

import numpy as np
from scipy import stats

# Same seeded sample as the Pearson example: y depends linearly on x plus noise.
np.random.seed(15)

n_samples = 120
x = np.random.normal(loc=0, scale=1, size=n_samples)
noise = np.random.normal(loc=0, scale=0.5, size=n_samples)
y = 0.8 * x + noise

# Compute the three coefficients on the identical data.
pearson = stats.pearsonr(x, y)
spearman = stats.spearmanr(x, y)
kendall = stats.kendalltau(x, y)

# One report line per method: (method name, coefficient symbol, result object).
report_rows = (
    ("Pearson", "r", pearson),
    ("Spearman", "r", spearman),
    ("Kendall", "tau", kendall),
)
for method, symbol, result in report_rows:
    print(f"{method}: {symbol} = {result.statistic:.3f}, p = {result.pvalue:.6f}")

Output:

```text
Pearson: r = 0.875, p = 0.000000
Spearman: r = 0.860, p = 0.000000
Kendall: tau = 0.684, p = 0.000000
```

### Scatter Plot of the Relationship

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Seeded synthetic sample: y is 0.8 * x plus Gaussian noise.
np.random.seed(15)

n_samples = 120
x = np.random.normal(loc=0, scale=1, size=n_samples)
noise = np.random.normal(loc=0, scale=0.5, size=n_samples)
y = 0.8 * x + noise

# Only the coefficient is needed for the plot title; the p-value is discarded.
r = stats.pearsonr(x, y)[0]

# Draw the scatter plot on an explicit Axes object.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(x, y, alpha=0.6)
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_title(f"Scatter Plot (Pearson r = {r:.3f})")
ax.grid(alpha=0.3)
plt.show()
### Correlation Matrix

import numpy as np
import matplotlib.pyplot as plt

# Seeded synthetic data: y depends on x, while z is a separate random draw.
np.random.seed(15)

n_samples = 120
x = np.random.normal(loc=0, scale=1, size=n_samples)
noise = np.random.normal(loc=0, scale=0.5, size=n_samples)
y = 0.8 * x + noise
z = np.random.normal(loc=0, scale=1, size=n_samples)

# One column per variable; rowvar=False tells corrcoef that columns are the variables.
samples = np.column_stack([x, y, z])
matrix = np.corrcoef(samples, rowvar=False)

labels = ["x", "y", "z"]
positions = list(range(len(labels)))

# Heatmap with the colour scale pinned to the full correlation range [-1, 1].
fig, ax = plt.subplots(figsize=(6, 5))
image = ax.imshow(matrix, cmap="coolwarm", vmin=-1, vmax=1)
ax.set_xticks(positions)
ax.set_xticklabels(labels)
ax.set_yticks(positions)
ax.set_yticklabels(labels)

# Annotate every cell with its coefficient (text x-position is the column index).
for (row, col), value in np.ndenumerate(matrix):
    ax.text(col, row, f"{value:.2f}", ha="center", va="center")

fig.colorbar(image, ax=ax, label="Correlation")
ax.set_title("Correlation Matrix")
plt.show()
### Practical Example: Study Time and Exam Score

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Seed the generator so the simulated students are reproducible.
np.random.seed(29)

# Simulate 80 students: hours uniform on [2, 12), score linear in hours plus noise.
n_students = 80
study_hours = np.random.uniform(2, 12, n_students)
score_noise = np.random.normal(0, 6, n_students)
exam_scores = 52 + 3.5 * study_hours + score_noise

pearson = stats.pearsonr(study_hours, exam_scores)
spearman = stats.spearmanr(study_hours, exam_scores)

# Report both coefficients in the same format.
for method, result in (("Pearson", pearson), ("Spearman", spearman)):
    print(f"{method} r: {result.statistic:.3f}, p-value: {result.pvalue:.6f}")

# Scatter plot of the simulated data on an explicit Axes object.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(study_hours, exam_scores, alpha=0.65)
ax.set_xlabel("Study hours")
ax.set_ylabel("Exam score")
ax.set_title("Study Time vs Exam Score")
ax.grid(alpha=0.3)
plt.show()

Output:

```text
Pearson r: 0.846, p-value: 0.000000
Spearman r: 0.847, p-value: 0.000000
```

### Conclusion

SciPy provides several correlation measures so you can match the statistic to the structure of your data. Combining the coefficient with a plot makes the relationship much easier to interpret.