# Source code for benford.stats

from numpy import abs as nabs, errstate, linspace, log, sqrt, where, zeros_like

from .constants import CRIT_CHI2, CRIT_KS, MAD_CONFORM, DIGS


def Z_score(frame, N):
    """Compute the Z statistic for each of the proportions studied.

    Args:
        frame: DataFrame carrying the Expected proportions and the
            pre-computed AbsDif (absolute difference between the found
            and the expected proportions).
        N: sample size.

    Returns:
        Series of computed Z scores, one per digit/proportion.
    """
    # Continuity correction of 1/(2N) applied to the absolute difference,
    # divided by the standard error of the expected proportion.
    corrected_dif = frame.AbsDif - (1 / (2 * N))
    std_error = sqrt((frame.Expected * (1. - frame.Expected)) / N)
    return corrected_dif / std_error
def chi_sq(frame, ddf, confidence, verbose=True):
    """Compute the chi-square statistic of the found distribution and
    compare it with the critical chi-square for the sample, looked up by
    the chosen confidence level and the degrees of freedom.

    Args:
        frame: DataFrame with Found, Expected and their difference columns.
        ddf: Degrees of freedom to consider.
        confidence: Confidence level to look up the critical value.
        verbose: prints the chi-square result and compares it to the
            critical chi-square for the sample. Defaults to True.

    Returns:
        Tuple of the computed chi-square statistic and the critical
        chi-square (per degrees of freedom and confidence level), for
        comparison. None if confidence is None.
    """
    # Cannot pick a critical value without a confidence level.
    if confidence is None:
        print('\nChi-square test needs confidence other than None.')
        return
    # Expected absolute counts, then squared deviations scaled by them.
    exp_counts = frame.Counts.sum() * frame.Expected
    found_chi = ((frame.Counts - exp_counts) ** 2 / exp_counts).sum()
    crit_chi = CRIT_CHI2[ddf][confidence]
    if verbose:
        print(f"\nThe Chi-square statistic is {found_chi:.4f}.\n"
              f"Critical Chi-square for this series: {crit_chi}.")
    return (found_chi, crit_chi)
def chi_sq_2(frame):
    """Compute the chi-square statistic of the found distribution.

    Args:
        frame: DataFrame with Found, Expected and their difference columns.

    Returns:
        The computed chi-square statistic.
    """
    # Expected absolute counts from the expected proportions.
    expected_counts = frame.Counts.sum() * frame.Expected
    deviations = frame.Counts - expected_counts
    return (deviations ** 2 / expected_counts).sum()
def kolmogorov_smirnov(frame, confidence, N, verbose=True):
    """Compute the Kolmogorov-Smirnov statistic of the found distribution
    and compare it with the critical value for the sample, according to
    the chosen confidence level.

    Args:
        frame: DataFrame with the Found and Expected distributions.
        confidence: Confidence level to look up the critical value.
        N: Sample size.
        verbose: prints the KS result and the critical value for the
            sample. Defaults to True.

    Returns:
        Tuple of the supremum (the greatest absolute difference between
        the found and the expected cumulative proportions) and the KS
        critical value for the confidence level, for comparison.
        None if confidence is None.
    """
    # Cannot pick a critical value without a confidence level.
    if confidence is None:
        print('\nKolmogorov-Smirnov test needs confidence other than None.')
        return
    # Empirical CDFs of the found and expected proportions.
    cumul = frame.sort_index()[['Found', 'Expected']].cumsum()
    # Supremum: largest absolute difference between the two CDFs.
    suprem = (cumul.Found - cumul.Expected).abs().max()
    # Critical value scales with 1/sqrt(N).
    crit_KS = CRIT_KS[confidence] / sqrt(N)
    if verbose:
        print(f"\nThe Kolmogorov-Smirnov statistic is {suprem:.4f}.\n"
              f"Critical K-S for this series: {crit_KS:.4f}")
    return (suprem, crit_KS)
def kolmogorov_smirnov_2(frame):
    """Compute the Kolmogorov-Smirnov statistic of the found distribution.

    Args:
        frame: DataFrame with the Found and Expected distributions.

    Returns:
        The supremum: the greatest absolute difference between the found
        and the expected cumulative proportions.
    """
    # Empirical CDFs of the found and expected proportions.
    cumul = frame.sort_index()[['Found', 'Expected']].cumsum()
    return (cumul.Found - cumul.Expected).abs().max()
def _two_dist_ks_(dist1, dist2, cummulative=True): """Computes the Kolmogorov-Smirnov statistic between two distributions, a found one (dist2) and an expected one (dist1). Args: dist1 (np.arrat): array with the expected distribution dist2 (np.array): array with the found distribution cummulative (bool): makes apply cummulutative sum to the distributions (empirical cdf). Returns: tuple(floats): the KS statistic """ dist2.sort(); dist1.sort() if not cummulative: return nabs(dist2 - dist1).max() return nabs(dist2.cumsum() - dist1.cumsum()).max() def _mantissas_ks_(mant_dist, confidence, sample_size): """Computes the Kolmogorov-Smirnof statistic for the Mantissas, also providing the KS critical value according the the sample size and confidence level provided Args: mant_dist (np.array): array with the mantissas distribution found confidence (float, int): level of confidence to compute the critical value Returns: tuple(floats): the KS statistic and the critical value """ crit_ks = CRIT_KS[confidence] * sqrt(2 * sample_size / sample_size ** 2)\ if confidence else None # non-cummulative, uniformly distributed expected = linspace(0, 1, len(mant_dist), endpoint=False) ks = _two_dist_ks_(expected, mant_dist, cummulative=False) return ks, crit_ks
def mad(frame, test, verbose=True):
    """Compute the Mean Absolute Deviation (MAD) between the found and
    the expected proportions.

    Args:
        frame: DataFrame with the Absolute Deviations already calculated.
        test: Test to compute the MAD from (F1D, SD, F2D...).
        verbose: prints the MAD result and compares it to the conformity
            limit values. Defaults to True.

    Returns:
        The mean of the absolute deviations between the found and the
        expected proportions.
    """
    result = frame.AbsDif.mean()
    if verbose:
        print(f"\nThe Mean Absolute Deviation is {result}")
        # Conformity ranges only exist for the regular digit tests;
        # test == -2 (mantissas) has no published limits.
        if test != -2:
            print(f"For the {MAD_CONFORM[DIGS[test]]}:\n"
                  f"- 0.0000 to {MAD_CONFORM[test][0]}: Close Conformity\n"
                  f"- {MAD_CONFORM[test][0]} to {MAD_CONFORM[test][1]}: "
                  f"Acceptable Conformity\n"
                  f"- {MAD_CONFORM[test][1]} to {MAD_CONFORM[test][2]}: "
                  f"Marginally Acceptable Conformity\n"
                  f"- Above {MAD_CONFORM[test][2]}: Nonconformity")
    return result
def mse(frame, verbose=True):
    """Compute the test's Mean Square Error.

    Args:
        frame: DataFrame with the already computed absolute deviations
            between the found and expected proportions.
        verbose: Prints the MSE. Defaults to True.

    Returns:
        Mean of the squared differences between the found and the
        expected proportions.
    """
    squared_deviations = frame.AbsDif ** 2
    result = squared_deviations.mean()
    if verbose:
        print(f"\nMean Square Error = {result}")
    return result
def _bhattacharyya_coefficient(dist_1, dist_2): """Computes the Bhattacharyya Coeficient between two probability distributions, to be letar used to compute the Bhattacharyya Distance Args: dist_1 (np.array): The newly gathered distribution, to be compared with an older / established distribution. dist_2 (np.array): The older/ establhished distribution with which the new one will be compared. Returns: bhat_coef (float) """ return sqrt(dist_1 * dist_2).sum() def _bhattacharyya_distance_(dist_1, dist_2): """Computes the Bhattacharyya Dsitance between two probability distributions Args: dist_1 (np.array): The newly gathered distribution, to be compared with an older / established distribution. dist_2 (np.array): The older/ establhished distribution with which the new one will be compared. Returns: bhat_dist (float) """ with errstate(divide='ignore'): bhat_dist = -log(_bhattacharyya_coefficient(dist_1, dist_2)) return bhat_dist def _kullback_leibler_divergence_(dist_1, dist_2): """Computes the Kullback-Leibler Divergence between two probability distributions. Args: dist_1 (np.array): The newly gathered distribution, to be compared with an older / established distribution. dist_2 (np.array): The older/ establhished distribution with which the new one will be compared. Returns: kulb_leib_diverg (float) """ # ignore divide by zero warning in np.where with errstate(divide='ignore'): kl_d = (log((dist_1 / dist_2), where=(dist_1 != 0)) * dist_1).sum() return kl_d