4.scipy Stats
title: "4.scipy-stats" date: 2026-05-24T14:11:12Z
1import numpy as np
2import pylab as pl
3from scipy import stats
1import matplotlib.pyplot as plt
2import matplotlib as mpl
3mpl.rcParams['font.sans-serif'] = ['SimHei']
统计-stats
连续概率分布
1from scipy import stats
2[k for k, v in stats.__dict__.items() if isinstance(v, stats.rv_continuous)]
['ksone',
'kstwobign',
'norm',
'alpha',
'anglit',
'arcsine',
'beta',
'betaprime',
'bradford',
'burr',
'burr12',
'fisk',
'cauchy',
'chi',
'chi2',
'cosine',
'dgamma',
'dweibull',
'expon',
'exponnorm',
'exponweib',
'exponpow',
'fatiguelife',
'foldcauchy',
'f',
'foldnorm',
'weibull_min',
'weibull_max',
'frechet_r',
'frechet_l',
'genlogistic',
'genpareto',
'genexpon',
'genextreme',
'gamma',
'erlang',
'gengamma',
'genhalflogistic',
'gompertz',
'gumbel_r',
'gumbel_l',
'halfcauchy',
'halflogistic',
'halfnorm',
'hypsecant',
'gausshyper',
'invgamma',
'invgauss',
'invweibull',
'johnsonsb',
'johnsonsu',
'laplace',
'levy',
'levy_l',
'levy_stable',
'logistic',
'loggamma',
'loglaplace',
'lognorm',
'gilbrat',
'maxwell',
'mielke',
'kappa4',
'kappa3',
'nakagami',
'ncx2',
'ncf',
't',
'nct',
'pareto',
'lomax',
'pearson3',
'powerlaw',
'powerlognorm',
'powernorm',
'rdist',
'rayleigh',
'reciprocal',
'rice',
'recipinvgauss',
'semicircular',
'skewnorm',
'trapz',
'triang',
'truncexpon',
'truncnorm',
'tukeylambda',
'uniform',
'vonmises',
'vonmises_line',
'wald',
'wrapcauchy',
'gennorm',
'halfgennorm',
'crystalball',
'argus']
1stats.norm.stats()
(array(0.), array(1.))
1X = stats.norm(loc=1.0, scale=2.0)
2X.stats()
(array(1.), array(4.))
1x = X.rvs(size=10000) # 对随机变量取10000个值
2np.mean(x), np.var(x) # 期望值和方差
(1.0048352738823323, 3.9372117720073554)
1stats.norm.fit(x) # 得到随机序列期望值和标准差
(1.0048352738823323, 1.984240855341749)
1pdf, t = np.histogram(x, bins=100, normed=True) #❶
2t = (t[:-1] + t[1:]) * 0.5 #❷
3cdf = np.cumsum(pdf) * (t[1] - t[0]) #❸
4p_error = pdf - X.pdf(t)
5c_error = cdf - X.cdf(t)
6print ("max pdf error: {}, max cdf error: {}".format(
7 np.abs(p_error).max(),
8 np.abs(c_error).max()))
max pdf error: 0.018998755595167102, max cdf error: 0.018503342378306975
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: VisibleDeprecationWarning: Passing `normed=True` on non-uniform bins has always been broken, and computes neither the probability density function nor the probability mass function. The result is only correct if the bins are uniform, when density=True will produce the same result anyway. The argument will be removed in a future version of numpy.
"""Entry point for launching an IPython kernel.
1#%figonly=正态分布的概率密度函数(左)和累积分布函数(右)
2fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7, 2))
3ax1.plot(t, pdf, label=u"统计值")
4ax1.plot(t, X.pdf(t), label=u"理论值", alpha=0.6)
5ax1.legend(loc="best")
6ax2.plot(t, cdf)
7ax2.plot(t, X.cdf(t), alpha=0.6);
1print(stats.gamma.stats(1.0))
2print(stats.gamma.stats(2.0))
(array(1.), array(1.))
(array(2.), array(2.))
1stats.gamma.stats(2.0, scale=2)
(array(4.), array(8.))
1x = stats.gamma.rvs(2, scale=2, size=4)
2x
array([4.40563983, 6.17699951, 3.65503843, 3.28052152])
1stats.gamma.pdf(x, 2, scale=2)
array([0.12169605, 0.07037188, 0.14694352, 0.15904745])
1X = stats.gamma(2, scale=2)
2X.pdf(x)
array([0.12169605, 0.07037188, 0.14694352, 0.15904745])
离散概率分布
1x = range(1, 7)
2p = (0.4, 0.2, 0.1, 0.1, 0.1, 0.1)
1dice = stats.rv_discrete(values=(x, p))
2dice.rvs(size=20)
array([2, 5, 2, 6, 1, 6, 6, 5, 3, 1, 5, 2, 1, 1, 1, 1, 1, 2, 1, 6])
1np.random.seed(42)
2samples = dice.rvs(size=(20000, 50))
3samples_mean = np.mean(samples, axis=1)
核密度估计
1#%fig=核密度估计能更准确地表示随机变量的概率密度函数
2_, bins, step = pl.hist(
3 samples_mean, bins=100, normed=True, histtype="step", label=u"直方图统计")
4kde = stats.kde.gaussian_kde(samples_mean)
5x = np.linspace(bins[0], bins[-1], 100)
6pl.plot(x, kde(x), label=u"核密度估计")
7mean, std = stats.norm.fit(samples_mean)
8pl.plot(x, stats.norm(mean, std).pdf(x), alpha=0.8, label=u"正态分布拟合")
9pl.legend()
<matplotlib.legend.Legend at 0x20556d5f5c0>
1#%fig=`bw_method`参数越大核密度估计曲线越平滑
2for bw in [0.2, 0.3, 0.6, 1.0]:
3 kde = stats.gaussian_kde([-1, 0, 1], bw_method=bw)
4 x = np.linspace(-5, 5, 1000)
5 y = kde(x)
6 pl.plot(x, y, lw=2, label="bw={}".format(bw), alpha=0.6)
7pl.legend(loc="best");
二项、泊松、伽玛分布
1stats.binom.pmf(range(6), 5, 1/6.0)
array([4.01877572e-01, 4.01877572e-01, 1.60751029e-01, 3.21502058e-02,
3.21502058e-03, 1.28600823e-04])
1#%fig=当n足够大时二项分布和泊松分布近似相等
2lambda_ = 10.0
3x = np.arange(20)
4
5n1, n2 = 100, 1000
6
7y_binom_n1 = stats.binom.pmf(x, n1, lambda_ / n1)
8y_binom_n2 = stats.binom.pmf(x, n2, lambda_ / n2)
9y_poisson = stats.poisson.pmf(x, lambda_)
10print(np.max(np.abs(y_binom_n1 - y_poisson)))
11print(np.max(np.abs(y_binom_n2 - y_poisson)))
12#%hide
13fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7.5, 2.5))
14
15ax1.plot(x, y_binom_n1, label=u"binom", lw=2)
16ax1.plot(x, y_poisson, label=u"poisson", lw=2, color="red")
17ax2.plot(x, y_binom_n2, label=u"binom", lw=2)
18ax2.plot(x, y_poisson, label=u"poisson", lw=2, color="red")
19for n, ax in zip((n1, n2), (ax1, ax2)):
20 ax.set_xlabel(u"次数")
21 ax.set_ylabel(u"概率")
22 ax.set_title("n={}".format(n))
23 ax.legend()
24fig.subplots_adjust(0.1, 0.15, 0.95, 0.90, 0.2, 0.1)
0.00675531110335309
0.0006301754049777564
1#%fig=模拟泊松分布
2np.random.seed(42)
3
4
5def sim_poisson(lambda_, time):
6 t = np.random.uniform(0, time, size=lambda_ * time) #❶
7 count, time_edges = np.histogram(t, bins=time, range=(0, time)) #❷
8 dist, count_edges = np.histogram(
9 count, bins=20, range=(0, 20), density=True) #❸
10 x = count_edges[:-1]
11 poisson = stats.poisson.pmf(x, lambda_)
12 return x, poisson, dist
13
14
15lambda_ = 10
16times = 1000, 50000
17x1, poisson1, dist1 = sim_poisson(lambda_, times[0])
18x2, poisson2, dist2 = sim_poisson(lambda_, times[1])
19max_error1 = np.max(np.abs(dist1 - poisson1))
20max_error2 = np.max(np.abs(dist2 - poisson2))
21print("time={}, max_error={}".format(times[0], max_error1))
22print("time={}, max_error={}".format(times[1], max_error2))
23#%hide
24fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7.5, 2.5))
25
26ax1.plot(x1, dist1, "-o", lw=2, label=u"统计结果")
27ax1.plot(x1, poisson1, "->", lw=2, label=u"泊松分布", color="red", alpha=0.6)
28ax2.plot(x2, dist2, "-o", lw=2, label=u"统计结果")
29ax2.plot(x2, poisson2, "->", lw=2, label=u"泊松分布", color="red", alpha=0.6)
30
31for ax, time in zip((ax1, ax2), times):
32 ax.set_xlabel(u"次数")
33 ax.set_ylabel(u"概率")
34 ax.set_title(u"time = {}".format(time))
35 ax.legend(loc="lower center")
36
37fig.subplots_adjust(0.1, 0.15, 0.95, 0.90, 0.2, 0.1)
time=1000, max_error=0.01964230201602718
time=50000, max_error=0.001798012894964722
1#%fig=模拟伽玛分布
2def sim_gamma(lambda_, time, k):
3 t = np.random.uniform(0, time, size=lambda_ * time) #❶
4 t.sort() #❷
5 interval = t[k:] - t[:-k] #❸
6 dist, interval_edges = np.histogram(interval, bins=100, density=True) #❹
7 x = (interval_edges[1:] + interval_edges[:-1])/2 #❺
8 gamma = stats.gamma.pdf(x, k, scale=1.0/lambda_) #❺
9 return x, gamma, dist
10
11lambda_ = 10
12time = 1000
13ks = 1, 2
14x1, gamma1, dist1 = sim_gamma(lambda_, time, ks[0])
15x2, gamma2, dist2 = sim_gamma(lambda_, time, ks[1])
16#%hide
17fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7.5, 2.5))
18
19ax1.plot(x1, dist1, lw=2, label=u"统计结果")
20ax1.plot(x1, gamma1, lw=2, label=u"伽玛分布", color="red", alpha=0.6)
21ax2.plot(x2, dist2, lw=2, label=u"统计结果")
22ax2.plot(x2, gamma2, lw=2, label=u"伽玛分布", color="red", alpha=0.6)
23
24for ax, k in zip((ax1, ax2), ks):
25 ax.set_xlabel(u"时间间隔")
26 ax.set_ylabel(u"概率密度")
27 ax.set_title(u"k = {}".format(k))
28 ax.legend(loc="upper right")
29
30fig.subplots_adjust(0.1, 0.15, 0.95, 0.90, 0.2, 0.1);
1T = 100000
2A_count = int(T / 5)
3B_count = int(T / 10)
4
5A_time = np.random.uniform(0, T, A_count) #❶
6B_time = np.random.uniform(0, T, B_count)
7
8bus_time = np.concatenate((A_time, B_time)) #❷
9bus_time.sort()
10
11N = 200000
12passenger_time = np.random.uniform(bus_time[0], bus_time[-1], N) #❸
13
14idx = np.searchsorted(bus_time, passenger_time) #❹
15np.mean(bus_time[idx] - passenger_time) * 60 #❺
202.3388747879705
1np.mean(np.diff(bus_time)) * 60
199.99833251643057
1#%figonly=观察者偏差
2import matplotlib.gridspec as gridspec
3pl.figure(figsize=(7.5, 3))
4
5G = gridspec.GridSpec(10, 1)
6ax1 = pl.subplot(G[:2, 0])
7ax2 = pl.subplot(G[3:, 0])
8
9ax1.vlines(bus_time[:10], 0, 1, lw=2, color="blue", label=u"公交车")
10ptime = np.random.uniform(bus_time[0], bus_time[9], 100)
11ax1.vlines(ptime, 0, 1, lw=1, color="red", alpha=0.2, label=u"乘客")
12ax1.legend()
13count, bins = np.histogram(passenger_time, bins=bus_time)
14ax2.plot(np.diff(bins), count, ".", alpha=0.3, rasterized=True)
15ax2.set_xlabel(u"公交车的时间间隔")
16ax2.set_ylabel(u"等待人数");
1from scipy import integrate
2t = 10.0 / 3 # 两辆公交车之间的平均时间间隔
3bus_interval = stats.gamma(1, scale=t)
4n, _ = integrate.quad(lambda x: 0.5 * x * x * bus_interval.pdf(x), 0, 1000)
5d, _ = integrate.quad(lambda x: x * bus_interval.pdf(x), 0, 1000)
6n / d * 60
200.0
学生t-分布与t检验
1#%fig=模拟学生t-分布
2mu = 0.0
3n = 10
4samples = stats.norm(mu).rvs(size=(100000, n)) #❶
5t_samples = (np.mean(samples, axis=1) - mu) / np.std(
6 samples, ddof=1, axis=1) * n**0.5 #❷
7sample_dist, x = np.histogram(t_samples, bins=100, density=True) #❸
8x = 0.5 * (x[:-1] + x[1:])
9t_dist = stats.t(n - 1).pdf(x)
10print("max error:", np.max(np.abs(sample_dist - t_dist)))
11#%hide
12pl.plot(x, sample_dist, lw=2, label=u"样本分布")
13pl.plot(x, t_dist, lw=2, alpha=0.6, label=u"t分布")
14pl.xlim(-5, 5)
15pl.legend(loc="best")
max error: 0.006832108369761447
<matplotlib.legend.Legend at 0x20556d89e48>
1#%figonly=当`df`增大,学生t-分布趋向于正态分布
2fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7.5, 2.5))
3ax1.plot(x, stats.t(6-1).pdf(x), label=u"df=5", lw=2)
4ax1.plot(x, stats.t(40-1).pdf(x), label=u"df=39", lw=2, alpha=0.6)
5ax1.plot(x, stats.norm.pdf(x), "k--", label=u"norm")
6ax1.legend()
7
8ax2.plot(x, stats.t(6-1).sf(x), label=u"df=5", lw=2)
9ax2.plot(x, stats.t(40-1).sf(x), label=u"df=39", lw=2, alpha=0.6)
10ax2.plot(x, stats.norm.sf(x), "k--", label=u"norm")
11ax2.legend();
1n = 30
2np.random.seed(42)
3s = stats.norm.rvs(loc=1, scale=0.8, size=n)
1t = (np.mean(s) - 0.5) / (np.std(s, ddof=1) / np.sqrt(n))
2print (t, stats.ttest_1samp(s, 0.5))
2.658584340882224 Ttest_1sampResult(statistic=2.658584340882224, pvalue=0.01263770225709123)
1print ((np.mean(s) - 1) / (np.std(s, ddof=1) / np.sqrt(n)))
2print (stats.ttest_1samp(s, 1), stats.ttest_1samp(s, 0.9))
-1.1450173670383303
Ttest_1sampResult(statistic=-1.1450173670383303, pvalue=0.26156414618801477) Ttest_1sampResult(statistic=-0.3842970254542196, pvalue=0.7035619103425202)
1#%fig=红色部分为`ttest_1samp()`计算的p值
2x = np.linspace(-5, 5, 500)
3y = stats.t(n-1).pdf(x)
4plt.plot(x, y, lw=2)
5t, p = stats.ttest_1samp(s, 0.5)
6mask = x > np.abs(t)
7plt.fill_between(x[mask], y[mask], color="red", alpha=0.5)
8mask = x < -np.abs(t)
9plt.fill_between(x[mask], y[mask], color="red", alpha=0.5)
10plt.axhline(color="k", lw=0.5)
11plt.xlim(-5, 5);
1from scipy import integrate
2x = np.linspace(-10, 10, 100000)
3y = stats.t(n-1).pdf(x)
4mask = x >= np.abs(t)
5integrate.trapz(y[mask], x[mask])*2
0.012633433707685974
1m = 200000
2mean = 0.5
3r = stats.norm.rvs(loc=mean, scale=0.8, size=(m, n))
4ts = (np.mean(s) - mean) / (np.std(s, ddof=1) / np.sqrt(n))
5tr = (np.mean(r, axis=1) - mean) / (np.std(r, ddof=1, axis=1) / np.sqrt(n))
6np.mean(np.abs(tr) > np.abs(ts))
0.012695
1np.random.seed(42)
2
3s1 = stats.norm.rvs(loc=1, scale=1.0, size=20)
4s2 = stats.norm.rvs(loc=1.5, scale=0.5, size=20)
5s3 = stats.norm.rvs(loc=1.5, scale=0.5, size=25)
6
7print (stats.ttest_ind(s1, s2, equal_var=False)) #❶
8print (stats.ttest_ind(s2, s3, equal_var=True)) #❷
Ttest_indResult(statistic=-2.2391470627176755, pvalue=0.033250866086743665)
Ttest_indResult(statistic=-0.5946698521856172, pvalue=0.5551805875810539)
卡方分布和卡方检验
1#%fig=使用随机数验证卡方分布
2a = np.random.normal(size=(300000, 4))
3cs = np.sum(a**2, axis=1)
4
5sample_dist, bins = np.histogram(cs, bins=100, range=(0, 20), density=True)
6x = 0.5 * (bins[:-1] + bins[1:])
7chi2_dist = stats.chi2.pdf(x, 4)
8print("max error:", np.max(np.abs(sample_dist - chi2_dist)))
9#%hide
10pl.plot(x, sample_dist, lw=2, label=u"样本分布")
11pl.plot(x, chi2_dist, lw=2, alpha=0.6, label=u"$\chi ^{2}$分布")
12pl.legend(loc="best")
max error: 0.0030732520533635066
<matplotlib.legend.Legend at 0x20556ede198>
1#%fig=模拟卡方分布
2repeat_count = 60000
3n, k = 100, 5
4
5np.random.seed(42)
6ball_ids = np.random.randint(0, k, size=(repeat_count, n)) #❶
7counts = np.apply_along_axis(np.bincount, 1, ball_ids, minlength=k) #❷
8cs2 = np.sum((counts - n/k)**2.0/(n/k), axis=1) #❸
9k = stats.kde.gaussian_kde(cs2) #❹
10x = np.linspace(0, 10, 200)
11pl.plot(x, stats.chi2.pdf(x, 4), lw=2, label=u"$\chi ^{2}$分布")
12pl.plot(x, k(x), lw=2, color="red", alpha=0.6, label=u"样本分布")
13pl.legend(loc="best")
14pl.xlim(0, 10);
1def choose_balls(probabilities, size):
2 r = stats.rv_discrete(values=(range(len(probabilities)), probabilities))
3 s = r.rvs(size=size)
4 counts = np.bincount(s)
5 return counts
6
7np.random.seed(42)
8counts1 = choose_balls([0.18, 0.24, 0.25, 0.16, 0.17], 400)
9counts2 = choose_balls([0.2]*5, 400)
10
11print(counts1)
12print(counts2)
[80 93 97 64 66]
[89 76 79 71 85]
1chi1, p1 = stats.chisquare(counts1)
2chi2, p2 = stats.chisquare(counts2)
3
4print ("chi1 =", chi1, "p1 =", p1)
5print ("chi2 =", chi2, "p2 =", p2)
chi1 = 11.375 p1 = 0.022657601239769634
chi2 = 2.55 p2 = 0.6357054527037017
1#%figonly=卡方检验计算的概率为阴影部分的面积
2x = np.linspace(0, 30, 200)
3CHI2 = stats.chi2(4)
4pl.plot(x, CHI2.pdf(x), "k", lw=2)
5pl.vlines(chi1, 0, CHI2.pdf(chi1))
6pl.vlines(chi2, 0, CHI2.pdf(chi2))
7pl.fill_between(x[x>chi1], 0, CHI2.pdf(x[x>chi1]), color="red", alpha=1.0)
8pl.fill_between(x[x>chi2], 0, CHI2.pdf(x[x>chi2]), color="green", alpha=0.5)
9pl.text(chi1, 0.015, r"$\chi^2_1$", fontsize=14)
10pl.text(chi2, 0.015, r"$\chi^2_2$", fontsize=14)
11pl.ylim(0, 0.2)
12pl.xlim(0, 20);
1table = [[43, 9], [44, 4]]
2chi2, p, dof, expected = stats.chi2_contingency(table)
3print(chi2)
4print(p)
1.0724852071005921
0.300384770390566
1stats.fisher_exact(table)
(0.43434343434343436, 0.23915695682224306)