import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Generate a simulated data set with pandas and numpy: 500 samples from
# np.random.randn stored in a single column named 'value'.
s = pd.DataFrame(np.random.randn(500), columns=['value'])
print(s.shape)  # (500, 1)

# Create the figure that holds both subplots.
fig = plt.figure(figsize=(10, 6))

# Subplot 1 (top row): scatter plot of each sample against its index.
ax1 = fig.add_subplot(2, 1, 1)
ax1.scatter(s.index, s.values)
plt.grid()  # add a grid to the current axes

# Subplot 2 (bottom row): histogram with a density (KDE) curve overlaid.
ax2 = fig.add_subplot(2, 1, 2)
s.hist(bins=30, alpha=0.5, ax=ax2)
# The density curve is drawn against a secondary y-axis so that its scale
# is independent of the histogram counts.
s.plot(kind='kde', secondary_y=True, ax=ax2)
# NOTE(review): plt.grid() acts on whatever the current axes are at this
# point, which may be the secondary axis created above - confirm which
# y-axis the grid lines are meant to follow.
plt.grid()  # add a grid

# Display the figure; without this call a plain script never shows it.
plt.show()
def kstest(rvs, cdf, args=(), N=20, alternative='two-sided', mode='approx'): """ Perform the Kolmogorov-Smirnov test for goodness of fit. This performs a test of the distribution F(x) of an observed random variable against a given distribution G(x). Under the null hypothesis the two distributions are identical, F(x)=G(x). The alternative hypothesis can be either 'two-sided' (default), 'less' or 'greater'. The KS test is only valid for continuous distributions. Parameters ---------- rvs : str, array or callable If a string, it should be the name of a distribution in `scipy.stats`. If an array, it should be a 1-D array of observations of random variables. If a callable, it should be a function to generate random variables; it is required to have a keyword argument `size`. cdf : str or callable If a string, it should be the name of a distribution in `scipy.stats`. If `rvs` is a string then `cdf` can be False or the same as `rvs`. If a callable, that callable is used to calculate the cdf. args : tuple, sequence, optional Distribution parameters, used if `rvs` or `cdf` are strings. N : int, optional Sample size if `rvs` is string or callable. Default is 20. alternative : {'two-sided', 'less','greater'}, optional Defines the alternative hypothesis (see explanation above). Default is 'two-sided'. mode : 'approx' (default) or 'asymp', optional Defines the distribution used for calculating the p-value. - 'approx' : use approximation to exact distribution of test statistic - 'asymp' : use asymptotic distribution of test statistic Returns ------- statistic : float KS test statistic, either D, D+ or D-. pvalue : float One-tailed or two-tailed p-value.
def normaltest(a, axis=0, nan_policy='propagate'): """ Test whether a sample differs from a normal distribution. This function tests the null hypothesis that a sample comes from a normal distribution. It is based on D'Agostino and Pearson's [1]_, [2]_ test that combines skew and kurtosis to produce an omnibus test of normality. Parameters ---------- a : array_like The array containing the sample to be tested. axis : int or None, optional Axis along which to compute test. Default is 0. If None, compute over the whole array `a`. nan_policy : {'propagate', 'raise', 'omit'}, optional Defines how to handle when input contains nan. 'propagate' returns nan, 'raise' throws an error, 'omit' performs the calculations ignoring nan values. Default is 'propagate'. Returns ------- statistic : float or array ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and ``k`` is the z-score returned by `kurtosistest`. pvalue : float or array A 2-sided chi squared probability for the hypothesis test.
def shapiro(x): """ Perform the Shapiro-Wilk test for normality. The Shapiro-Wilk test tests the null hypothesis that the data was drawn from a normal distribution. Parameters ---------- x : array_like Array of sample data. Returns ------- W : float The test statistic. p-value : float The p-value for the hypothesis test.
下面我们使用第一部分生成的仿真数据,分别用这三种统计检验函数检验生成的样本是否服从正态分布(若 p > 0.05,则不能拒绝“样本服从正态分布”的原假设),代码如下:
# The three normality tests below live in scipy.stats; the import is done
# here because nothing earlier in the script brings `stats` into scope.
from scipy import stats

# Sample mean of the 'value' column.
u = s['value'].mean()
# Sample standard deviation of the 'value' column.
std = s['value'].std()

# Kolmogorov-Smirnov goodness-of-fit test against the 'norm' distribution,
# with the sample mean and standard deviation passed as its parameters.
print('scipy.stats.kstest统计检验结果:----------------------------------------------------')
print(stats.kstest(s['value'], 'norm', (u, std)))
# D'Agostino-Pearson omnibus test, combining skew and kurtosis.
print('scipy.stats.normaltest统计检验结果:----------------------------------------------------')
print(stats.normaltest(s['value']))
# Shapiro-Wilk test for normality.
print('scipy.stats.shapiro统计检验结果:----------------------------------------------------')
print(stats.shapiro(s['value']))
KstestResult(statistic=0.01596290473494305, pvalue=0.9995623150120069)
NormaltestResult(statistic=0.5561685865675511, pvalue=0.7572329891688141)
(0.9985257983207703, 0.9540967345237732)