4.scipy Stats


title: "4.scipy-stats" date: 2026-05-24T14:11:12Z

1import numpy as np
2import pylab as pl
3from scipy import stats
1import matplotlib.pyplot as plt
2import matplotlib as mpl
3mpl.rcParams['font.sans-serif'] = ['SimHei']

统计-stats

连续概率分布

1from scipy import stats
2[k for k, v in stats.__dict__.items() if isinstance(v, stats.rv_continuous)]
['ksone',
 'kstwobign',
 'norm',
 'alpha',
 'anglit',
 'arcsine',
 'beta',
 'betaprime',
 'bradford',
 'burr',
 'burr12',
 'fisk',
 'cauchy',
 'chi',
 'chi2',
 'cosine',
 'dgamma',
 'dweibull',
 'expon',
 'exponnorm',
 'exponweib',
 'exponpow',
 'fatiguelife',
 'foldcauchy',
 'f',
 'foldnorm',
 'weibull_min',
 'weibull_max',
 'frechet_r',
 'frechet_l',
 'genlogistic',
 'genpareto',
 'genexpon',
 'genextreme',
 'gamma',
 'erlang',
 'gengamma',
 'genhalflogistic',
 'gompertz',
 'gumbel_r',
 'gumbel_l',
 'halfcauchy',
 'halflogistic',
 'halfnorm',
 'hypsecant',
 'gausshyper',
 'invgamma',
 'invgauss',
 'invweibull',
 'johnsonsb',
 'johnsonsu',
 'laplace',
 'levy',
 'levy_l',
 'levy_stable',
 'logistic',
 'loggamma',
 'loglaplace',
 'lognorm',
 'gilbrat',
 'maxwell',
 'mielke',
 'kappa4',
 'kappa3',
 'nakagami',
 'ncx2',
 'ncf',
 't',
 'nct',
 'pareto',
 'lomax',
 'pearson3',
 'powerlaw',
 'powerlognorm',
 'powernorm',
 'rdist',
 'rayleigh',
 'reciprocal',
 'rice',
 'recipinvgauss',
 'semicircular',
 'skewnorm',
 'trapz',
 'triang',
 'truncexpon',
 'truncnorm',
 'tukeylambda',
 'uniform',
 'vonmises',
 'vonmises_line',
 'wald',
 'wrapcauchy',
 'gennorm',
 'halfgennorm',
 'crystalball',
 'argus']
1stats.norm.stats()
(array(0.), array(1.))
1X = stats.norm(loc=1.0, scale=2.0)
2X.stats()
(array(1.), array(4.))
1x = X.rvs(size=10000) # 对随机变量取10000个值
2np.mean(x), np.var(x) # 期望值和方差
(1.0048352738823323, 3.9372117720073554)
1stats.norm.fit(x) # 得到随机序列期望值和标准差
(1.0048352738823323, 1.984240855341749)
1pdf, t = np.histogram(x, bins=100, normed=True)  #❶
2t = (t[:-1] + t[1:]) * 0.5  #❷
3cdf = np.cumsum(pdf) * (t[1] - t[0])  #❸
4p_error = pdf - X.pdf(t)
5c_error = cdf - X.cdf(t)
6print ("max pdf error: {}, max cdf error: {}".format(
7    np.abs(p_error).max(),
8    np.abs(c_error).max()))
max pdf error: 0.018998755595167102, max cdf error: 0.018503342378306975


C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: VisibleDeprecationWarning: Passing `normed=True` on non-uniform bins has always been broken, and computes neither the probability density function nor the probability mass function. The result is only correct if the bins are uniform, when density=True will produce the same result anyway. The argument will be removed in a future version of numpy.
  """Entry point for launching an IPython kernel.
1#%figonly=正态分布的概率密度函数(左)和累积分布函数(右)
2fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7, 2))
3ax1.plot(t, pdf, label=u"统计值")
4ax1.plot(t, X.pdf(t), label=u"理论值", alpha=0.6)
5ax1.legend(loc="best")
6ax2.plot(t, cdf)
7ax2.plot(t, X.cdf(t), alpha=0.6); 

png

1print(stats.gamma.stats(1.0))
2print(stats.gamma.stats(2.0))
(array(1.), array(1.))
(array(2.), array(2.))
1stats.gamma.stats(2.0, scale=2)
(array(4.), array(8.))
1x = stats.gamma.rvs(2, scale=2, size=4)
2x   
array([4.40563983, 6.17699951, 3.65503843, 3.28052152])
1stats.gamma.pdf(x, 2, scale=2)
array([0.12169605, 0.07037188, 0.14694352, 0.15904745])
1X = stats.gamma(2, scale=2) 
2X.pdf(x)
array([0.12169605, 0.07037188, 0.14694352, 0.15904745])

离散概率分布

1x = range(1, 7)    
2p = (0.4, 0.2, 0.1, 0.1, 0.1, 0.1)
1dice = stats.rv_discrete(values=(x, p))
2dice.rvs(size=20)
array([2, 5, 2, 6, 1, 6, 6, 5, 3, 1, 5, 2, 1, 1, 1, 1, 1, 2, 1, 6])
1np.random.seed(42)
2samples = dice.rvs(size=(20000, 50))
3samples_mean = np.mean(samples, axis=1)

核密度估计

1#%fig=核密度估计能更准确地表示随机变量的概率密度函数
2_, bins, step = pl.hist(
3    samples_mean, bins=100, normed=True, histtype="step", label=u"直方图统计")
4kde = stats.kde.gaussian_kde(samples_mean)
5x = np.linspace(bins[0], bins[-1], 100)
6pl.plot(x, kde(x), label=u"核密度估计")
7mean, std = stats.norm.fit(samples_mean)
8pl.plot(x, stats.norm(mean, std).pdf(x), alpha=0.8, label=u"正态分布拟合")
9pl.legend()
<matplotlib.legend.Legend at 0x20556d5f5c0>

png

1#%fig=`bw_method`参数越大核密度估计曲线越平滑
2for bw in [0.2, 0.3, 0.6, 1.0]:
3    kde = stats.gaussian_kde([-1, 0, 1], bw_method=bw)
4    x = np.linspace(-5, 5, 1000)
5    y = kde(x)
6    pl.plot(x, y, lw=2, label="bw={}".format(bw), alpha=0.6)
7pl.legend(loc="best");

png

二项、泊松、伽玛分布

1stats.binom.pmf(range(6), 5, 1/6.0)
array([4.01877572e-01, 4.01877572e-01, 1.60751029e-01, 3.21502058e-02,
       3.21502058e-03, 1.28600823e-04])
 1#%fig=当n足够大时二项分布和泊松分布近似相等
 2lambda_ = 10.0
 3x = np.arange(20)
 4
 5n1, n2 = 100, 1000
 6
 7y_binom_n1 = stats.binom.pmf(x, n1, lambda_ / n1)
 8y_binom_n2 = stats.binom.pmf(x, n2, lambda_ / n2)
 9y_poisson = stats.poisson.pmf(x, lambda_)
10print(np.max(np.abs(y_binom_n1 - y_poisson)))
11print(np.max(np.abs(y_binom_n2 - y_poisson)))
12#%hide
13fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7.5, 2.5))
14
15ax1.plot(x, y_binom_n1, label=u"binom", lw=2)
16ax1.plot(x, y_poisson, label=u"poisson", lw=2, color="red")
17ax2.plot(x, y_binom_n2, label=u"binom", lw=2)
18ax2.plot(x, y_poisson, label=u"poisson", lw=2, color="red")
19for n, ax in zip((n1, n2), (ax1, ax2)):
20    ax.set_xlabel(u"次数")
21    ax.set_ylabel(u"概率")
22    ax.set_title("n={}".format(n))
23    ax.legend()
24fig.subplots_adjust(0.1, 0.15, 0.95, 0.90, 0.2, 0.1)
0.00675531110335309
0.0006301754049777564

png

 1#%fig=模拟泊松分布
 2np.random.seed(42)
 3
 4
 5def sim_poisson(lambda_, time):
 6    t = np.random.uniform(0, time, size=lambda_ * time)  #❶
 7    count, time_edges = np.histogram(t, bins=time, range=(0, time))  #❷
 8    dist, count_edges = np.histogram(
 9        count, bins=20, range=(0, 20), density=True)  #❸
10    x = count_edges[:-1]
11    poisson = stats.poisson.pmf(x, lambda_)
12    return x, poisson, dist
13
14
15lambda_ = 10
16times = 1000, 50000
17x1, poisson1, dist1 = sim_poisson(lambda_, times[0])
18x2, poisson2, dist2 = sim_poisson(lambda_, times[1])
19max_error1 = np.max(np.abs(dist1 - poisson1))
20max_error2 = np.max(np.abs(dist2 - poisson2))
21print("time={}, max_error={}".format(times[0], max_error1))
22print("time={}, max_error={}".format(times[1], max_error2))
23#%hide
24fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7.5, 2.5))
25
26ax1.plot(x1, dist1, "-o", lw=2, label=u"统计结果")
27ax1.plot(x1, poisson1, "->", lw=2, label=u"泊松分布", color="red", alpha=0.6)
28ax2.plot(x2, dist2, "-o", lw=2, label=u"统计结果")
29ax2.plot(x2, poisson2, "->", lw=2, label=u"泊松分布", color="red", alpha=0.6)
30
31for ax, time in zip((ax1, ax2), times):
32    ax.set_xlabel(u"次数")
33    ax.set_ylabel(u"概率")
34    ax.set_title(u"time = {}".format(time))
35    ax.legend(loc="lower center")
36
37fig.subplots_adjust(0.1, 0.15, 0.95, 0.90, 0.2, 0.1)
time=1000, max_error=0.01964230201602718
time=50000, max_error=0.001798012894964722

png

 1#%fig=模拟伽玛分布
 2def sim_gamma(lambda_, time, k):
 3    t = np.random.uniform(0, time, size=lambda_ * time) #❶
 4    t.sort()  #❷
 5    interval = t[k:] - t[:-k] #❸
 6    dist, interval_edges = np.histogram(interval, bins=100, density=True) #❹
 7    x = (interval_edges[1:] + interval_edges[:-1])/2  #❺
 8    gamma = stats.gamma.pdf(x, k, scale=1.0/lambda_) #❺
 9    return x, gamma, dist
10
11lambda_ = 10
12time = 1000
13ks = 1, 2
14x1, gamma1, dist1 = sim_gamma(lambda_, time, ks[0])
15x2, gamma2, dist2 = sim_gamma(lambda_, time, ks[1])
16#%hide
17fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7.5, 2.5))
18
19ax1.plot(x1, dist1,  lw=2, label=u"统计结果")
20ax1.plot(x1, gamma1, lw=2, label=u"伽玛分布", color="red", alpha=0.6)
21ax2.plot(x2, dist2,  lw=2, label=u"统计结果")
22ax2.plot(x2, gamma2, lw=2, label=u"伽玛分布", color="red", alpha=0.6)
23
24for ax, k in zip((ax1, ax2), ks):
25    ax.set_xlabel(u"时间间隔")
26    ax.set_ylabel(u"概率密度")
27    ax.set_title(u"k = {}".format(k))
28    ax.legend(loc="upper right")
29    
30fig.subplots_adjust(0.1, 0.15, 0.95, 0.90, 0.2, 0.1);

png

 1T = 100000
 2A_count = int(T / 5)
 3B_count = int(T / 10)
 4
 5A_time = np.random.uniform(0, T, A_count) #❶
 6B_time = np.random.uniform(0, T, B_count)
 7
 8bus_time = np.concatenate((A_time, B_time)) #❷
 9bus_time.sort()
10
11N = 200000
12passenger_time = np.random.uniform(bus_time[0], bus_time[-1], N) #❸
13
14idx = np.searchsorted(bus_time, passenger_time) #❹
15np.mean(bus_time[idx] - passenger_time) * 60    #❺
202.3388747879705
1np.mean(np.diff(bus_time)) * 60
199.99833251643057
 1#%figonly=观察者偏差
 2import matplotlib.gridspec as gridspec
 3pl.figure(figsize=(7.5, 3))
 4
 5G = gridspec.GridSpec(10, 1)
 6ax1 = pl.subplot(G[:2,  0])
 7ax2 = pl.subplot(G[3:, 0])
 8
 9ax1.vlines(bus_time[:10], 0, 1, lw=2, color="blue", label=u"公交车")
10ptime = np.random.uniform(bus_time[0], bus_time[9], 100)
11ax1.vlines(ptime, 0, 1, lw=1, color="red", alpha=0.2, label=u"乘客")
12ax1.legend()
13count, bins = np.histogram(passenger_time, bins=bus_time)
14ax2.plot(np.diff(bins), count, ".", alpha=0.3, rasterized=True)
15ax2.set_xlabel(u"公交车的时间间隔")
16ax2.set_ylabel(u"等待人数");

png

1from scipy import integrate
2t = 10.0 / 3  # 两辆公交车之间的平均时间间隔
3bus_interval = stats.gamma(1, scale=t)
4n, _ = integrate.quad(lambda x: 0.5 * x * x * bus_interval.pdf(x), 0, 1000)
5d, _ = integrate.quad(lambda x: x * bus_interval.pdf(x), 0, 1000)
6n / d * 60
200.0

学生t-分布与t检验

 1#%fig=模拟学生t-分布
 2mu = 0.0
 3n = 10
 4samples = stats.norm(mu).rvs(size=(100000, n))  #❶
 5t_samples = (np.mean(samples, axis=1) - mu) / np.std(
 6    samples, ddof=1, axis=1) * n**0.5  #❷
 7sample_dist, x = np.histogram(t_samples, bins=100, density=True)  #❸
 8x = 0.5 * (x[:-1] + x[1:])
 9t_dist = stats.t(n - 1).pdf(x)
10print("max error:", np.max(np.abs(sample_dist - t_dist)))
11#%hide
12pl.plot(x, sample_dist, lw=2, label=u"样本分布")
13pl.plot(x, t_dist, lw=2, alpha=0.6, label=u"t分布")
14pl.xlim(-5, 5)
15pl.legend(loc="best")
max error: 0.006832108369761447





<matplotlib.legend.Legend at 0x20556d89e48>

png

 1#%figonly=当`df`增大,学生t-分布趋向于正态分布
 2fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7.5, 2.5))
 3ax1.plot(x, stats.t(6-1).pdf(x), label=u"df=5", lw=2)
 4ax1.plot(x, stats.t(40-1).pdf(x), label=u"df=39", lw=2, alpha=0.6)
 5ax1.plot(x, stats.norm.pdf(x), "k--", label=u"norm")
 6ax1.legend()
 7
 8ax2.plot(x, stats.t(6-1).sf(x), label=u"df=5", lw=2)
 9ax2.plot(x, stats.t(40-1).sf(x), label=u"df=39", lw=2, alpha=0.6)
10ax2.plot(x, stats.norm.sf(x), "k--", label=u"norm")
11ax2.legend();

png

1n = 30
2np.random.seed(42)
3s = stats.norm.rvs(loc=1, scale=0.8, size=n)
1t = (np.mean(s) - 0.5) / (np.std(s, ddof=1) / np.sqrt(n))
2print (t, stats.ttest_1samp(s, 0.5))
2.658584340882224 Ttest_1sampResult(statistic=2.658584340882224, pvalue=0.01263770225709123)
1print ((np.mean(s) - 1) / (np.std(s, ddof=1) / np.sqrt(n)))
2print (stats.ttest_1samp(s, 1), stats.ttest_1samp(s, 0.9))
-1.1450173670383303
Ttest_1sampResult(statistic=-1.1450173670383303, pvalue=0.26156414618801477) Ttest_1sampResult(statistic=-0.3842970254542196, pvalue=0.7035619103425202)
 1#%fig=红色部分为`ttest_1samp()`计算的p值
 2x = np.linspace(-5, 5, 500)
 3y = stats.t(n-1).pdf(x)
 4plt.plot(x, y, lw=2)
 5t, p = stats.ttest_1samp(s, 0.5)
 6mask = x > np.abs(t)
 7plt.fill_between(x[mask], y[mask], color="red", alpha=0.5)
 8mask = x < -np.abs(t)
 9plt.fill_between(x[mask], y[mask], color="red", alpha=0.5)
10plt.axhline(color="k", lw=0.5)
11plt.xlim(-5, 5);

png

1from scipy import integrate
2x = np.linspace(-10, 10, 100000)
3y = stats.t(n-1).pdf(x)
4mask = x >= np.abs(t)
5integrate.trapz(y[mask], x[mask])*2
0.012633433707685974
1m = 200000
2mean = 0.5
3r = stats.norm.rvs(loc=mean, scale=0.8, size=(m, n))
4ts = (np.mean(s) - mean) / (np.std(s, ddof=1) / np.sqrt(n))
5tr = (np.mean(r, axis=1) - mean) / (np.std(r, ddof=1, axis=1) / np.sqrt(n))
6np.mean(np.abs(tr) > np.abs(ts))
0.012695
1np.random.seed(42)
2
3s1 = stats.norm.rvs(loc=1, scale=1.0, size=20)
4s2 = stats.norm.rvs(loc=1.5, scale=0.5, size=20)
5s3 = stats.norm.rvs(loc=1.5, scale=0.5, size=25)
6
7print (stats.ttest_ind(s1, s2, equal_var=False)) #❶
8print (stats.ttest_ind(s2, s3, equal_var=True))  #❷
Ttest_indResult(statistic=-2.2391470627176755, pvalue=0.033250866086743665)
Ttest_indResult(statistic=-0.5946698521856172, pvalue=0.5551805875810539)

卡方分布和卡方检验

 1#%fig=使用随机数验证卡方分布
 2a = np.random.normal(size=(300000, 4))
 3cs = np.sum(a**2, axis=1)
 4
 5sample_dist, bins = np.histogram(cs, bins=100, range=(0, 20), density=True)
 6x = 0.5 * (bins[:-1] + bins[1:])
 7chi2_dist = stats.chi2.pdf(x, 4)
 8print("max error:", np.max(np.abs(sample_dist - chi2_dist)))
 9#%hide
10pl.plot(x, sample_dist, lw=2, label=u"样本分布")
11pl.plot(x, chi2_dist, lw=2, alpha=0.6, label=u"$\chi ^{2}$分布")
12pl.legend(loc="best")
max error: 0.0030732520533635066





<matplotlib.legend.Legend at 0x20556ede198>

png

 1#%fig=模拟卡方分布
 2repeat_count = 60000
 3n, k = 100, 5
 4
 5np.random.seed(42)
 6ball_ids = np.random.randint(0, k, size=(repeat_count, n)) #❶
 7counts = np.apply_along_axis(np.bincount, 1, ball_ids, minlength=k) #❷
 8cs2 = np.sum((counts - n/k)**2.0/(n/k), axis=1) #❸
 9k = stats.kde.gaussian_kde(cs2) #❹
10x = np.linspace(0, 10, 200)
11pl.plot(x, stats.chi2.pdf(x, 4), lw=2, label=u"$\chi ^{2}$分布")
12pl.plot(x, k(x), lw=2, color="red", alpha=0.6, label=u"样本分布")
13pl.legend(loc="best")
14pl.xlim(0, 10);

png

 1def choose_balls(probabilities, size):
 2    r = stats.rv_discrete(values=(range(len(probabilities)), probabilities))
 3    s = r.rvs(size=size)
 4    counts = np.bincount(s)    
 5    return counts
 6
 7np.random.seed(42)
 8counts1 = choose_balls([0.18, 0.24, 0.25, 0.16, 0.17], 400)
 9counts2 = choose_balls([0.2]*5, 400)
10
11print(counts1)
12print(counts2)
[80 93 97 64 66]
[89 76 79 71 85]
1chi1, p1 = stats.chisquare(counts1)
2chi2, p2 = stats.chisquare(counts2)
3
4print ("chi1 =", chi1, "p1 =", p1)
5print ("chi2 =", chi2, "p2 =", p2)
chi1 = 11.375 p1 = 0.022657601239769634
chi2 = 2.55 p2 = 0.6357054527037017
 1#%figonly=卡方检验计算的概率为阴影部分的面积
 2x = np.linspace(0, 30, 200)
 3CHI2 = stats.chi2(4)
 4pl.plot(x, CHI2.pdf(x), "k", lw=2)
 5pl.vlines(chi1, 0, CHI2.pdf(chi1))
 6pl.vlines(chi2, 0, CHI2.pdf(chi2))
 7pl.fill_between(x[x>chi1], 0, CHI2.pdf(x[x>chi1]), color="red", alpha=1.0)
 8pl.fill_between(x[x>chi2], 0, CHI2.pdf(x[x>chi2]), color="green", alpha=0.5)
 9pl.text(chi1, 0.015, r"$\chi^2_1$", fontsize=14)
10pl.text(chi2, 0.015, r"$\chi^2_2$", fontsize=14)
11pl.ylim(0, 0.2)
12pl.xlim(0, 20);

png

1table = [[43, 9], [44, 4]]
2chi2, p, dof, expected = stats.chi2_contingency(table)
3print(chi2)
4print(p)
1.0724852071005921
0.300384770390566
1stats.fisher_exact(table)
(0.43434343434343436, 0.23915695682224306)