Introduction to Descriptive Statistics

Descriptive statistics provide tools to summarize, organize, and describe data without making inferences about a larger population. They help us understand the basic features of our dataset through numerical summaries and visualizations.

Measures of Central Tendency

Central tendency describes the center or typical value of a dataset.

Mean (Arithmetic Average)

The mean is the sum of all values divided by the number of observations.



python

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Dataset: ten students' test scores
scores = [78, 85, 92, 76, 88, 94, 89, 82, 87, 91]

# Mean via NumPy
mean_score = np.mean(scores)
print(f"Mean: {mean_score:.2f}")

# Cross-check by hand: the mean is just the total divided by the count
total, count = sum(scores), len(scores)
mean_manual = total / count
print(f"Mean (manual): {mean_manual:.2f}")

Median

The median is the middle value when data is arranged in order.



python

# Calculate median: the middle value of the sorted data
median_score = np.median(scores)
print(f"Median: {median_score:.2f}")

# For an even number of values, the median is the average of the two
# middle values. Compute the middle indices from the length instead of
# hard-coding 4 and 5 so the demonstration works for any even-sized list.
sorted_scores = sorted(scores)
mid = len(sorted_scores) // 2
print(f"Sorted scores: {sorted_scores}")
print(f"Middle values: {sorted_scores[mid - 1]} and {sorted_scores[mid]}")

Mode

The mode is the most frequently occurring value.



python

from statistics import mode, multimode

# Scores containing a repeated value (85 occurs three times)
scores_with_duplicates = [78, 85, 92, 85, 88, 94, 89, 82, 85, 91]

# Single most frequent value
mode_score = mode(scores_with_duplicates)
print(f"Mode: {mode_score}")

# Every value tied for most frequent (handles multimodal data)
all_modes = multimode(scores_with_duplicates)
print(f"All modes: {all_modes}")

Measures of Variability

Variability measures describe how spread out the data points are.

Range

The difference between the maximum and minimum values.



python

# Range: distance between the smallest and largest observation
low, high = np.min(scores), np.max(scores)
data_range = high - low
print(f"Range: {data_range}")
print(f"Min: {low}, Max: {high}")

Variance and Standard Deviation

Variance measures the average squared deviation from the mean.



python

# Sample variance and standard deviation; ddof=1 applies Bessel's
# correction (divide by n - 1) for an unbiased sample estimate.
variance = np.var(scores, ddof=1)
std_deviation = np.std(scores, ddof=1)

print(f"Sample Variance: {variance:.2f}")
print(f"Sample Standard Deviation: {std_deviation:.2f}")

# Same computation spelled out: sum of squared deviations over n - 1
mean_val = np.mean(scores)
squared_deviations = [(x - mean_val) ** 2 for x in scores]
manual_variance = sum(squared_deviations) / (len(scores) - 1)
print(f"Manual Variance: {manual_variance:.2f}")

Interquartile Range (IQR)

The range between the 25th and 75th percentiles.



python

# Quartiles and the interquartile range (spread of the middle 50%)
q1, q3 = np.percentile(scores, [25, 75])
iqr = q3 - q1

print(f"Q1 (25th percentile): {q1}")
print(f"Q3 (75th percentile): {q3}")
print(f"IQR: {iqr}")

# np.quantile is the same computation, taking fractions instead of percentages
quartiles = np.quantile(scores, [0.25, 0.5, 0.75])
print(f"Quartiles: {quartiles}")

Shape of Distribution

Skewness

Measures the asymmetry of the distribution.



python

from scipy.stats import skew, kurtosis

# Skewness: sign indicates which side of the distribution has the longer tail
skewness = skew(scores)
print(f"Skewness: {skewness:.3f}")

if skewness > 0:
    direction = "right-skewed (positive skew)"
elif skewness < 0:
    direction = "left-skewed (negative skew)"
else:
    direction = "symmetric"
print(f"Distribution is {direction}")

Kurtosis

Measures the "tailedness" of the distribution.



python

# Excess kurtosis (scipy's Fisher default): 0 corresponds to a normal curve
kurt = kurtosis(scores)
print(f"Kurtosis: {kurt:.3f}")

if kurt > 0:
    tail_description = "has heavy tails (leptokurtic)"
elif kurt < 0:
    tail_description = "has light tails (platykurtic)"
else:
    tail_description = "is normal-like (mesokurtic)"
print(f"Distribution {tail_description}")

Common Probability Distributions

Normal Distribution

The bell-shaped distribution that's fundamental in statistics.



python

# Draw 1000 samples from a normal distribution
mu, sigma = 85, 10  # mean and standard deviation
normal_data = np.random.normal(mu, sigma, 1000)

# Histogram of the samples, normalized so bar areas integrate to 1
plt.figure(figsize=(10, 6))
plt.hist(normal_data, bins=30, density=True, alpha=0.7, color='skyblue')

# Overlay the theoretical density across +/- 4 standard deviations
grid = np.linspace(mu - 4 * sigma, mu + 4 * sigma, 100)
density = stats.norm.pdf(grid, mu, sigma)
plt.plot(grid, density, 'r-', linewidth=2, label='Theoretical Normal')

plt.title('Normal Distribution (μ=85, σ=10)')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Probability of a value below 80, from the cumulative distribution function
prob_below_80 = stats.norm.cdf(80, mu, sigma)
print(f"P(X < 80): {prob_below_80:.3f}")

Binomial Distribution

Models the number of successes in a fixed number of independent trials, each with the same probability of success.



python

# Binomial example: number of heads in 20 fair coin flips
n, p = 20, 0.5  # 20 trials, 50% probability of success

# Probability mass at every possible success count, 0 through n
x_values = np.arange(n + 1)
binomial_probs = stats.binom.pmf(x_values, n, p)

# Bar chart of the probability mass function
plt.figure(figsize=(10, 6))
plt.bar(x_values, binomial_probs, alpha=0.7, color='green')
plt.title(f'Binomial Distribution (n={n}, p={p})')
plt.xlabel('Number of Successes')
plt.ylabel('Probability')
plt.grid(True, alpha=0.3)
plt.show()

# A point probability, and an upper-tail probability via the CDF complement
prob_exactly_10 = stats.binom.pmf(10, n, p)
prob_at_least_15 = 1 - stats.binom.cdf(14, n, p)
print(f"P(X = 10): {prob_exactly_10:.3f}")
print(f"P(X ≥ 15): {prob_at_least_15:.3f}")

Poisson Distribution

Models the number of events occurring in a fixed interval of time or space, given a constant average rate.



python

# Poisson example: customer arrivals at an average rate of 3.5 per period
lambda_param = 3.5  # average rate

# Probability mass for event counts 0 through 14
x_values = np.arange(0, 15)
poisson_probs = stats.poisson.pmf(x_values, lambda_param)

# Bar chart of the probability mass function
plt.figure(figsize=(10, 6))
plt.bar(x_values, poisson_probs, alpha=0.7, color='orange')
plt.title(f'Poisson Distribution (λ={lambda_param})')
plt.xlabel('Number of Events')
plt.ylabel('Probability')
plt.grid(True, alpha=0.3)
plt.show()

# A point probability, and a lower-tail probability (P(X < 3) = P(X <= 2))
prob_exactly_5 = stats.poisson.pmf(5, lambda_param)
prob_less_than_3 = stats.poisson.cdf(2, lambda_param)
print(f"P(X = 5): {prob_exactly_5:.3f}")
print(f"P(X < 3): {prob_less_than_3:.3f}")

Practical Data Analysis Example



python

# End-to-end descriptive analysis of simulated sales figures
np.random.seed(42)  # fixed seed keeps the simulation reproducible
sales_data = np.random.normal(1000, 200, 100)  # 100 sales figures

# Numerical summary of center, spread, and shape
p25, p75 = np.percentile(sales_data, [25, 75])
print("Sales Data Analysis")
print("=" * 30)
print(f"Count: {len(sales_data)}")
print(f"Mean: ${np.mean(sales_data):.2f}")
print(f"Median: ${np.median(sales_data):.2f}")
print(f"Standard Deviation: ${np.std(sales_data, ddof=1):.2f}")
print(f"Range: ${np.ptp(sales_data):.2f}")
print(f"IQR: ${p75 - p25:.2f}")
print(f"Skewness: {skew(sales_data):.3f}")
print(f"Kurtosis: {kurtosis(sales_data):.3f}")

# Four complementary views of the same dataset
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 10))

# Histogram: overall shape of the distribution
ax1.hist(sales_data, bins=20, density=True, alpha=0.7, color='lightblue')
ax1.set_title('Sales Distribution')
ax1.set_xlabel('Sales ($)')
ax1.set_ylabel('Density')

# Box plot: median, quartiles, and outliers at a glance
ax2.boxplot(sales_data)
ax2.set_title('Sales Box Plot')
ax2.set_ylabel('Sales ($)')

# Q-Q plot: points close to the line suggest approximate normality
stats.probplot(sales_data, dist="norm", plot=ax3)
ax3.set_title('Q-Q Plot (Normality Check)')

# Empirical CDF: fraction of observations at or below each value
ordered_sales = np.sort(sales_data)
ecdf = np.arange(1, len(ordered_sales) + 1) / len(ordered_sales)
ax4.plot(ordered_sales, ecdf, marker='.', linestyle='-')
ax4.set_title('Empirical Cumulative Distribution')
ax4.set_xlabel('Sales ($)')
ax4.set_ylabel('Cumulative Probability')

plt.tight_layout()
plt.show()

Key Takeaways

  1. Central tendency measures (mean, median, mode) describe the center of your data
  2. Variability measures (range, variance, standard deviation, IQR) describe data spread
  3. Shape measures (skewness, kurtosis) describe distribution characteristics
  4. Common distributions (normal, binomial, Poisson) model different types of data
  5. Visual analysis complements numerical summaries for complete understanding

Understanding these concepts is crucial for data analysis, as they form the foundation for more advanced statistical techniques and help you make informed decisions based on your data patterns.