Introduction to Probability
Probability quantifies uncertainty and measures the likelihood of events occurring. It forms the mathematical foundation for statistics, machine learning, and data science.
Basic Probability Concepts
Sample Space and Events
- Sample Space (S): The set of all possible outcomes
- Event (E): A subset of the sample space
python
import numpy as np
import matplotlib.pyplot as plt
from fractions import Fraction
import pandas as pd
# Worked example: a single roll of a fair six-sided die.
# The sample space holds every face; each event is a subset of it.
sample_space = set(range(1, 7))
event_even = {face for face in sample_space if face % 2 == 0}
event_greater_than_4 = {face for face in sample_space if face > 4}

print(f"Sample Space: {sample_space}")
print(f"Event (Even numbers): {event_even}")
print(f"Event (Greater than 4): {event_greater_than_4}")

# With equally likely outcomes, P(E) = |E| / |S|.
# The float is shown next to the exact fraction for comparison.
prob_even = len(event_even) / len(sample_space)
print(f"P(Even) = {prob_even} = {Fraction(len(event_even), len(sample_space))}")
Probability Rules
Rule 1 (Probability Range): 0 ≤ P(E) ≤ 1 for any event E
Rule 2 (Total Probability): P(S) = 1, i.e. the probability of the entire sample space is 1
Rule 3 (Complement Rule): P(E') = 1 - P(E), where E' is the complement of E
python
# Demonstrating probability rules
def demonstrate_probability_rules(n_trials=10000):
    """Simulate coin flips and check the basic probability rules empirically.

    Heads and tails are counted independently of each other, so the printed
    "Sum of probabilities" is a genuine check that the relative frequencies
    of the two complementary outcomes add up to 1, rather than being 1 by
    construction.

    Args:
        n_trials: Number of coin flips to simulate. Must be positive.

    Returns:
        NumPy array of length ``n_trials`` holding the flips as 'H' / 'T'.

    Raises:
        ValueError: If ``n_trials`` is not positive.
    """
    if n_trials <= 0:
        raise ValueError("n_trials must be a positive integer")

    # Simulate coin flips (uses NumPy's global random state)
    coin_flips = np.random.choice(['H', 'T'], size=n_trials)

    # Count each outcome separately instead of deriving tails as 1 - heads;
    # otherwise the sum printed below could never differ from 1.
    heads_count = np.sum(coin_flips == 'H')
    tails_count = np.sum(coin_flips == 'T')
    prob_heads = heads_count / n_trials
    prob_tails = tails_count / n_trials

    print(f"Simulated P(Heads) = {prob_heads:.3f}")
    print(f"Simulated P(Tails) = {prob_tails:.3f}")
    print(f"Sum of probabilities = {prob_heads + prob_tails:.3f}")

    return coin_flips


coin_data = demonstrate_probability_rules()
Set Operations and Probability
Union, Intersection, and Complement
python
# Two-dice example: every outcome is a (die1, die2, die1 + die2) triple,
# and events are collections of such triples.
die1_outcomes = set(range(1, 7))
die2_outcomes = set(range(1, 7))


def get_sum_outcomes():
    """Return every (die1, die2, die1 + die2) triple for the two dice."""
    return [
        (first, second, first + second)
        for first in die1_outcomes
        for second in die2_outcomes
    ]


all_outcomes = get_sum_outcomes()
total_outcomes = len(all_outcomes)

# Event A: the two faces add up to 7
event_sum_7 = [triple for triple in all_outcomes if triple[2] == 7]
# Event B: the first die lands on 3
event_first_3 = [triple for triple in all_outcomes if triple[0] == 3]

print(f"P(Sum = 7) = {len(event_sum_7)}/{total_outcomes} = {len(event_sum_7)/total_outcomes:.3f}")
print(f"P(First die = 3) = {len(event_first_3)}/{total_outcomes} = {len(event_first_3)/total_outcomes:.3f}")

# Union A ∪ B: outcomes in either event; the set drops the shared triple
union_events = {*event_sum_7, *event_first_3}
print(f"P(Sum = 7 OR First die = 3) = {len(union_events)}/{total_outcomes} = {len(union_events)/total_outcomes:.3f}")

# Intersection A ∩ B: outcomes of A that also belong to B
intersection_events = [triple for triple in event_sum_7 if triple in event_first_3]
print(f"P(Sum = 7 AND First die = 3) = {len(intersection_events)}/{total_outcomes} = {len(intersection_events)/total_outcomes:.3f}")
Conditional Probability
Conditional probability measures the probability of an event given that another event has occurred.
P(A|B) = P(A ∩ B) / P(B), provided P(B) > 0
python
# Medical diagnosis example
def medical_diagnosis_example(total_population=10000, prevalence=0.01,
                              sensitivity=0.95, specificity=0.90):
    """Build a diagnostic-test confusion matrix and print conditional probabilities.

    The population is split into diseased / healthy people according to
    ``prevalence``, then each group is split by test result using the test's
    sensitivity and specificity. The confusion matrix, P(Disease | Test+) and
    P(No Disease | Test-) are printed.

    Args:
        total_population: Number of people tested. Must be positive.
        prevalence: Fraction of the population that has the disease.
        sensitivity: P(Test+ | Disease+).
        specificity: P(Test- | Disease-).

    Returns:
        pandas DataFrame with rows 'Test+', 'Test-', 'Total' and columns
        'Disease+', 'Disease-', 'Total' holding head counts.

    Raises:
        ValueError: If ``total_population`` is not positive or any rate lies
            outside [0, 1].
    """
    if total_population <= 0:
        raise ValueError("total_population must be positive")
    for rate_name, rate in (("prevalence", prevalence),
                            ("sensitivity", sensitivity),
                            ("specificity", specificity)):
        if not 0 <= rate <= 1:
            raise ValueError(f"{rate_name} must be between 0 and 1")

    # Disease prevalence
    has_disease = round(total_population * prevalence)
    no_disease = total_population - has_disease

    # Head counts per cell. round() is used instead of int() because float
    # products can land just below the intended integer (100 * 0.29 evaluates
    # to 28.999999999999996), which int() would truncate to the wrong count.
    true_positive = round(has_disease * sensitivity)
    false_negative = has_disease - true_positive

    true_negative = round(no_disease * specificity)
    false_positive = no_disease - true_negative

    test_positive = true_positive + false_positive
    test_negative = true_negative + false_negative

    # Create confusion matrix
    confusion_matrix = pd.DataFrame({
        'Disease+': [true_positive, false_negative, has_disease],
        'Disease-': [false_positive, true_negative, no_disease],
        'Total': [test_positive, test_negative, total_population]
    }, index=['Test+', 'Test-', 'Total'])

    print("Medical Test Confusion Matrix:")
    print(confusion_matrix)
    print()

    # Conditional probabilities read off the matrix rows. A row with no
    # people makes the conditional probability undefined, reported as NaN.
    undefined = float("nan")
    prob_disease_given_positive = (
        true_positive / test_positive if test_positive else undefined
    )
    prob_no_disease_given_negative = (
        true_negative / test_negative if test_negative else undefined
    )

    print(f"P(Disease | Test+) = {prob_disease_given_positive:.3f}")
    print(f"P(No Disease | Test-) = {prob_no_disease_given_negative:.3f}")

    return confusion_matrix


medical_data = medical_diagnosis_example()
Independence
Two events A and B are independent if P(A ∩ B) = P(A) × P(B); equivalently, when P(B) > 0, P(A|B) = P(A).
python
# Testing independence with dice rolls
def test_independence():
    """Check by simulation whether two dice events behave as independent.

    Rolls two dice 100000 times, estimates P(first die = 6), P(second die is
    even) and the probability of both, and compares the joint estimate with
    the product of the two marginals. Prints the estimates and a verdict
    based on a fixed 0.01 tolerance. Returns nothing.
    """
    n_simulations = 100000

    # Draw die 1 before die 2 so results are reproducible under a fixed seed.
    die1_rolls = np.random.randint(1, 7, n_simulations)
    die2_rolls = np.random.randint(1, 7, n_simulations)

    # Boolean masks, one entry per simulated pair of rolls
    first_is_six = die1_rolls == 6          # Event A
    second_is_even = die2_rolls % 2 == 0    # Event B
    both_happen = first_is_six & second_is_even

    # The mean of a boolean mask is the relative frequency of the event
    prob_a = first_is_six.mean()
    prob_b = second_is_even.mean()
    prob_joint = both_happen.mean()

    # Under independence the joint probability factorises
    expected_joint = prob_a * prob_b
    gap = abs(prob_joint - expected_joint)

    print(f"P(First die = 6) = {prob_a:.3f}")
    print(f"P(Second die even) = {prob_b:.3f}")
    print(f"P(Both events) = {prob_joint:.3f}")
    print(f"Expected if independent = {expected_joint:.3f}")
    print(f"Difference = {abs(prob_joint - expected_joint):.4f}")

    # Verdict: treat the events as independent when the gap is below tolerance
    verdict = (
        "Events appear to be independent!"
        if gap < 0.01
        else "Events appear to be dependent!"
    )
    print(verdict)


test_independence()
Bayes' Theorem
Bayes' theorem relates conditional probabilities and is fundamental to Bayesian statistics.
Formula: P(A|B) = P(B|A) × P(A) / P(B)
Where:
- P(A|B): Posterior probability
- P(B|A): Likelihood
- P(A): Prior probability
- P(B): Evidence
python
# Spam email classification example
def spam_classification_bayes():
    """Apply Bayes' theorem to the word "FREE" in a spam / ham email model.

    Uses fixed priors (30% spam, 70% ham) and fixed likelihoods of the word
    "FREE" (80% in spam, 10% in ham), prints every quantity involved and the
    two posteriors.

    Returns:
        The posterior probability P(Spam | FREE).
    """
    # Priors: share of all emails in each class
    prob_spam, prob_ham = 0.3, 0.7

    # Likelihoods: chance that "FREE" appears in an email of each class
    prob_free_given_spam, prob_free_given_ham = 0.8, 0.1

    # Joint probabilities P(FREE ∩ class) = P(FREE | class) × P(class)
    joint_spam = prob_free_given_spam * prob_spam
    joint_ham = prob_free_given_ham * prob_ham

    # Evidence P(FREE) by the law of total probability
    prob_free = joint_spam + joint_ham

    # Bayes' theorem: posterior = joint / evidence
    prob_spam_given_free = joint_spam / prob_free
    prob_ham_given_free = joint_ham / prob_free

    report = (
        "Spam Email Classification using Bayes' Theorem",
        "=" * 50,
        f"Prior P(Spam) = {prob_spam}",
        f"Prior P(Ham) = {prob_ham}",
        f"Likelihood P(FREE|Spam) = {prob_free_given_spam}",
        f"Likelihood P(FREE|Ham) = {prob_free_given_ham}",
        f"Evidence P(FREE) = {prob_free:.3f}",
        "",
        "Posterior Probabilities:",
        f"P(Spam|FREE) = {prob_spam_given_free:.3f}",
        f"P(Ham|FREE) = {prob_ham_given_free:.3f}",
    )
    print("\n".join(report))

    return prob_spam_given_free


spam_prob = spam_classification_bayes()
Bayesian Updating
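This example demonstrates how a belief about an unknown quantity is updated step by step as new evidence arrives: after each observation, the current posterior serves as the prior for the next one.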
python
# Coin fairness testing
def bayesian_coin_testing():
    """Update a belief about a coin's P(heads) one flip at a time on a grid.

    Starts from a uniform prior over 1001 evenly spaced candidate values of
    P(heads) in [0, 1]. For each observed flip the current belief is
    multiplied by that flip's likelihood and renormalised to sum to 1. The
    belief after 1, 3, 5 and 10 flips is plotted, then summary statistics
    are printed.

    Returns:
        The posterior mean of P(heads) after all observations.
    """
    # Hypothesis grid: every candidate value for P(heads)
    theta_grid = np.linspace(0, 1, 1001)

    # Uniform prior: every candidate value gets the same weight
    flat_prior = np.full(theta_grid.shape, 1 / theta_grid.size)

    # Observed data: sequence of coin flips
    observations = ['H', 'T', 'H', 'H', 'T', 'H', 'H', 'H', 'T', 'H']

    # Likelihood of one flip under each hypothesis: theta for heads,
    # 1 - theta for anything else
    heads_likelihood = theta_grid
    tails_likelihood = 1 - theta_grid

    # Flip counts after which the current posterior is drawn
    plot_after = (1, 3, 5, 10)

    plt.figure(figsize=(12, 8))

    posterior = flat_prior.copy()
    for flips_seen, obs in enumerate(observations, start=1):
        likelihood = heads_likelihood if obs == 'H' else tails_likelihood

        # posterior ∝ likelihood × prior. A new array is built at every step
        # rather than updating in place, so arrays already handed to
        # plt.plot are never modified afterwards.
        unnormalised = posterior * likelihood
        posterior = unnormalised / np.sum(unnormalised)

        if flips_seen in plot_after:
            plt.plot(theta_grid, posterior,
                     label=f'After {flips_seen} flips', linewidth=2)

    plt.axvline(x=0.5, color='red', linestyle='--', alpha=0.7, label='Fair coin')
    plt.xlabel('P(Heads)')
    plt.ylabel('Posterior Probability Density')
    plt.title('Bayesian Updating of Coin Bias Belief')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    # Posterior summary: the weights sum to 1, so weighted sums are expectations
    posterior_mean = np.sum(theta_grid * posterior)
    squared_deviation = (theta_grid - posterior_mean)**2
    posterior_std = np.sqrt(np.sum(squared_deviation * posterior))

    print(f"Observations: {observations}")
    print(f"Heads count: {observations.count('H')}/{len(observations)}")
    print(f"Posterior mean P(heads): {posterior_mean:.3f}")
    print(f"Posterior std: {posterior_std:.3f}")

    return posterior_mean


coin_bias = bayesian_coin_testing()
Real-World Application: A/B Testing
python
# A/B testing with Bayesian approach
def bayesian_ab_testing(visitors_a=1000, conversions_a=50,
                        visitors_b=1000, conversions_b=65,
                        n_samples=10000, seed=42):
    """Compare two variants' conversion rates with a Beta-Binomial model.

    Each variant starts from a uniform Beta(1, 1) prior, so its posterior is
    Beta(1 + conversions, 1 + visitors - conversions). Random draws from the
    two posteriors are used to estimate P(B > A) and a 95% credible interval
    for the difference (B - A). Two histograms are shown and a summary is
    printed. Nothing is returned.

    Note: the function reseeds NumPy's global random state with ``seed``.

    Args:
        visitors_a: Number of visitors who saw version A. Must be positive.
        conversions_a: Conversions among the version A visitors.
        visitors_b: Number of visitors who saw version B. Must be positive.
        conversions_b: Conversions among the version B visitors.
        n_samples: Number of posterior draws per variant. Must be positive.
        seed: Seed passed to ``np.random.seed`` for reproducible draws.

    Raises:
        ValueError: If a visitor count or ``n_samples`` is not positive, or
            a conversion count is not between 0 and its visitor count.
    """
    for variant, visitors, conversions in (("A", visitors_a, conversions_a),
                                           ("B", visitors_b, conversions_b)):
        if visitors <= 0:
            raise ValueError(f"visitors_{variant.lower()} must be positive")
        if not 0 <= conversions <= visitors:
            raise ValueError(
                f"conversions_{variant.lower()} must be between 0 and "
                f"visitors_{variant.lower()}"
            )
    if n_samples <= 0:
        raise ValueError("n_samples must be positive")

    # The observed counts are fixed inputs; the seed only makes the
    # posterior sampling below reproducible.
    np.random.seed(seed)

    # Beta-binomial conjugate prior (uniform: Beta(1,1))
    alpha_prior = 1
    beta_prior = 1

    # Posterior parameters (Beta distribution): successes add to alpha,
    # failures add to beta
    alpha_a = alpha_prior + conversions_a
    beta_a = beta_prior + visitors_a - conversions_a

    alpha_b = alpha_prior + conversions_b
    beta_b = beta_prior + visitors_b - conversions_b

    # Generate posterior samples (A first, then B, to keep the draw order)
    samples_a = np.random.beta(alpha_a, beta_a, n_samples)
    samples_b = np.random.beta(alpha_b, beta_b, n_samples)

    # Share of paired draws in which B's rate exceeds A's
    prob_b_better = np.mean(samples_b > samples_a)
    difference = samples_b - samples_a

    # Plot posterior distributions
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    plt.hist(samples_a, bins=50, alpha=0.7, label='Version A', density=True)
    plt.hist(samples_b, bins=50, alpha=0.7, label='Version B', density=True)
    plt.xlabel('Conversion Rate')
    plt.ylabel('Density')
    plt.title('Posterior Distributions')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.subplot(1, 2, 2)
    # density=True so the bars match the 'Density' axis label (and the left
    # panel); without it the histogram shows raw sample counts.
    plt.hist(difference, bins=50, alpha=0.7, color='green', density=True)
    plt.axvline(x=0, color='red', linestyle='--', label='No difference')
    plt.xlabel('Difference in Conversion Rate (B - A)')
    plt.ylabel('Density')
    plt.title('Distribution of Difference')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("A/B Testing Results:")
    print("=" * 20)
    print(f"Version A: {conversions_a}/{visitors_a} = {conversions_a/visitors_a:.3f}")
    print(f"Version B: {conversions_b}/{visitors_b} = {conversions_b/visitors_b:.3f}")
    print(f"Probability B > A: {prob_b_better:.3f}")
    print(f"95% Credible interval for difference: [{np.percentile(difference, 2.5):.4f}, {np.percentile(difference, 97.5):.4f}]")


bayesian_ab_testing()
Key Takeaways
- Probability fundamentals provide the mathematical framework for reasoning about uncertainty
- Conditional probability helps us update beliefs given new information
- Independence is crucial for many statistical methods and assumptions
- Bayes' theorem enables systematic belief updating and decision making
- Bayesian methods offer powerful tools for data analysis and inference
Understanding these concepts is essential for advanced statistics, machine learning, and data-driven decision making.