# Source code for benford.stats

from numpy import abs as nabs, errstate, linspace, log, sqrt, where, zeros_like

from .constants import CRIT_CHI2, CRIT_KS, MAD_CONFORM, DIGS


def Z_score(frame, N):
    """Compute the Z statistic for each of the proportions studied.

    Args:
        frame: DataFrame carrying the Expected proportions and the
            pre-computed AbsDif (absolute difference between the found
            and the expected proportions).
        N: sample size.

    Returns:
        Series of computed Z scores, one per digit/proportion.
    """
    # Continuity correction of 1/(2N) applied to the absolute difference,
    # divided by the standard error of the expected proportion.
    corrected_dif = frame.AbsDif - (1 / (2 * N))
    std_error = sqrt((frame.Expected * (1. - frame.Expected)) / N)
    return corrected_dif / std_error
def chi_sq(frame, ddf, confidence, verbose=True):
    """Compute the chi-square statistic of the found distribution and
    compare it with the critical chi-square for the sample, looked up by
    the chosen confidence level and the degrees of freedom.

    Args:
        frame: DataFrame with Found, Expected and their difference columns.
        ddf: Degrees of freedom to consider.
        confidence: Confidence level to look up the critical value.
        verbose: prints the chi-square result and compares it to the
            critical chi-square for the sample. Defaults to True.

    Returns:
        Tuple of the computed chi-square statistic and the critical
        chi-square (per degrees of freedom and confidence level), for
        comparison. None if confidence is None.
    """
    # Cannot pick a critical value without a confidence level.
    if confidence is None:
        print('\nChi-square test needs confidence other than None.')
        return
    # Expected absolute counts, then squared deviations scaled by them.
    exp_counts = frame.Counts.sum() * frame.Expected
    found_chi = ((frame.Counts - exp_counts) ** 2 / exp_counts).sum()
    crit_chi = CRIT_CHI2[ddf][confidence]
    if verbose:
        print(f"\nThe Chi-square statistic is {found_chi:.4f}.\n"
              f"Critical Chi-square for this series: {crit_chi}.")
    return (found_chi, crit_chi)
def chi_sq_2(frame):
    """Compute the chi-square statistic of the found distribution.

    Args:
        frame: DataFrame with Found, Expected and their difference columns.

    Returns:
        The computed chi-square statistic.
    """
    # Expected absolute counts from the expected proportions.
    expected_counts = frame.Counts.sum() * frame.Expected
    deviations = frame.Counts - expected_counts
    return (deviations ** 2 / expected_counts).sum()
def kolmogorov_smirnov(frame, confidence, N, verbose=True):
    """Compute the Kolmogorov-Smirnov statistic of the found distribution
    and compare it with the critical value for the sample, according to
    the chosen confidence level.

    Args:
        frame: DataFrame with the Found and Expected distributions.
        confidence: Confidence level to look up the critical value.
        N: Sample size.
        verbose: prints the KS result and the critical value for the
            sample. Defaults to True.

    Returns:
        Tuple of the supremum (the greatest absolute difference between
        the found and the expected cumulative proportions) and the KS
        critical value for the confidence level, for comparison.
        None if confidence is None.
    """
    # Cannot pick a critical value without a confidence level.
    if confidence is None:
        print('\nKolmogorov-Smirnov test needs confidence other than None.')
        return
    # Empirical CDFs of the found and expected proportions.
    cumul = frame.sort_index()[['Found', 'Expected']].cumsum()
    # Supremum: largest absolute difference between the two CDFs.
    suprem = (cumul.Found - cumul.Expected).abs().max()
    # Critical value scales with 1/sqrt(N).
    crit_KS = CRIT_KS[confidence] / sqrt(N)
    if verbose:
        print(f"\nThe Kolmogorov-Smirnov statistic is {suprem:.4f}.\n"
              f"Critical K-S for this series: {crit_KS:.4f}")
    return (suprem, crit_KS)
def kolmogorov_smirnov_2(frame):
    """Compute the Kolmogorov-Smirnov statistic of the found distribution.

    Args:
        frame: DataFrame with the Found and Expected distributions.

    Returns:
        The supremum: the greatest absolute difference between the found
        and the expected cumulative proportions.
    """
    # Empirical CDFs of the found and expected proportions.
    cumul = frame.sort_index()[['Found', 'Expected']].cumsum()
    return (cumul.Found - cumul.Expected).abs().max()
def _two_dist_ks_(dist1, dist2, cummulative=True): """Computes the Kolmogorov-Smirnov statistic between two distributions, a found one (dist2) and an expected one (dist1). Args: dist1 (np.arrat): array with the expected distribution dist2 (np.array): array with the found distribution cummulative (bool): makes apply cummulutative sum to the distributions (empirical cdf). Returns: tuple(floats): the KS statistic """ dist2.sort(); dist1.sort() if not cummulative: return nabs(dist2 - dist1).max() return nabs(dist2.cumsum() - dist1.cumsum()).max() def _mantissas_ks_(mant_dist, confidence, sample_size): """Computes the Kolmogorov-Smirnof statistic for the Mantissas, also providing the KS critical value according the the sample size and confidence level provided Args: mant_dist (np.array): array with the mantissas distribution found confidence (float, int): level of confidence to compute the critical value Returns: tuple(floats): the KS statistic and the critical value """ crit_ks = CRIT_KS[confidence] * sqrt(2 * sample_size / sample_size ** 2)\ if confidence else None # non-cummulative, uniformly distributed expected = linspace(0, 1, len(mant_dist), endpoint=False) ks = _two_dist_ks_(expected, mant_dist, cummulative=False) return ks, crit_ks
def mad(frame, test, verbose=True):
    """Compute the Mean Absolute Deviation (MAD) between the found and
    the expected proportions.

    Args:
        frame: DataFrame with the Absolute Deviations already calculated.
        test: Test to compute the MAD from (F1D, SD, F2D...).
        verbose: prints the MAD result and compares it to the conformity
            limit values. Defaults to True.

    Returns:
        The mean of the absolute deviations between the found and the
        expected proportions.
    """
    result = frame.AbsDif.mean()
    if verbose:
        print(f"\nThe Mean Absolute Deviation is {result}")
        # Conformity ranges only exist for the regular digit tests;
        # test == -2 (mantissas) has no published limits.
        if test != -2:
            print(f"For the {MAD_CONFORM[DIGS[test]]}:\n"
                  f"- 0.0000 to {MAD_CONFORM[test][0]}: Close Conformity\n"
                  f"- {MAD_CONFORM[test][0]} to {MAD_CONFORM[test][1]}: "
                  f"Acceptable Conformity\n"
                  f"- {MAD_CONFORM[test][1]} to {MAD_CONFORM[test][2]}: "
                  f"Marginally Acceptable Conformity\n"
                  f"- Above {MAD_CONFORM[test][2]}: Nonconformity")
    return result
def mse(frame, verbose=True):
    """Compute the test's Mean Square Error.

    Args:
        frame: DataFrame with the already computed absolute deviations
            between the found and expected proportions.
        verbose: Prints the MSE. Defaults to True.

    Returns:
        Mean of the squared differences between the found and the
        expected proportions.
    """
    squared_deviations = frame.AbsDif ** 2
    result = squared_deviations.mean()
    if verbose:
        print(f"\nMean Square Error = {result}")
    return result
def _bhattacharyya_coefficient(dist_1, dist_2): """Computes the Bhattacharyya Coeficient between two probability distributions, to be letar used to compute the Bhattacharyya Distance Args: dist_1 (np.array): The newly gathered distribution, to be compared with an older / established distribution. dist_2 (np.array): The older/ establhished distribution with which the new one will be compared. Returns: bhat_coef (float) """ return sqrt(dist_1 * dist_2).sum() def _bhattacharyya_distance_(dist_1, dist_2): """Computes the Bhattacharyya Dsitance between two probability distributions Args: dist_1 (np.array): The newly gathered distribution, to be compared with an older / established distribution. dist_2 (np.array): The older/ establhished distribution with which the new one will be compared. Returns: bhat_dist (float) """ with errstate(divide='ignore'): bhat_dist = -log(_bhattacharyya_coefficient(dist_1, dist_2)) return bhat_dist def _kullback_leibler_divergence_(dist_1, dist_2): """Computes the Kullback-Leibler Divergence between two probability distributions. Args: dist_1 (np.array): The newly gathered distribution, to be compared with an older / established distribution. dist_2 (np.array): The older/ establhished distribution with which the new one will be compared. Returns: kulb_leib_diverg (float) """ # ignore divide by zero warning in np.where with errstate(divide='ignore'): kl_d = (log((dist_1 / dist_2), where=(dist_1 != 0)) * dist_1).sum() return kl_d