Source code for benford.benford

import warnings
from pandas import Series, DataFrame
from numpy import arange, log10, ones, abs, cos, sin, pi, mean
from .constants import CONFS, DIGS, SEC_ORDER_DIGS, REV_DIGS, TEST_NAMES, \
    MAD_CONFORM, CRIT_CHI2, CRIT_KS
from .checks import _check_digs_, _check_confidence_, _check_test_, \
    _check_num_array_, _check_high_Z_
from .utils import _set_N_, input_data, prepare, \
    subtract_sorted, prep_to_roll, mad_to_roll, mse_to_roll, \
    get_mantissas
from .expected import _get_expected_digits_ # First, Second, LastTwo
from .viz import _get_plot_args, plot_digs, plot_sum, plot_ordered_mantissas,\
    plot_mantissa_arc_test, plot_roll_mse, plot_roll_mad
from .reports import _inform_, _report_mad_, _report_test_, _deprecate_inform_,\
    _report_mantissa_
from .stats import Z_score, chi_sq, chi_sq_2, kolmogorov_smirnov,\
    kolmogorov_smirnov_2, _bhattacharyya_distance_, _bhattacharyya_coefficient,\
    _kullback_leibler_divergence_, _mantissas_ks_


class Base(DataFrame):
    """Internalizes and prepares the data for Analysis.

    The input becomes the ``seq`` column. Rows are filtered by sign, missing
    values are dropped, and the following integer columns are derived:

    - ``ZN``: absolute values as integers (scaled by ``10 ** decimals`` when
      the input is float);
    - ``F1D``, ``F2D``, ``F3D``: leading digits of ``ZN``;
    - ``SD``: second digit of ``ZN``;
    - ``L2D``: last two digits of ``ZN``.

    A digit column holds -1 on rows whose ``ZN`` is too small for that test,
    so those rows can be discarded later.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a pandas DataFrame column, with values being
            integers or floats.
        decimals: number of decimal places to consider. Ignored for integer
            data. If set to 'infer', it will remove the zeros and consider
            up to the fifth decimal place to the right, but will lose
            performance.
        sign: tells which portion of the data to consider. pos: only the
            positive entries; neg: only negative entries; all: all entries
            but zeros. Defaults to all. Any other value is treated as neg.
        sec_order: accepted for interface compatibility; not used here.

    Raises:
        TypeError: if not receiving `int` or `float` as input.
    """

    def __init__(self, data, decimals, sign='all', sec_order=False):

        # Filter on a plain Series *before* building the final frame.
        # Assigning a filtered Series back to an existing column re-aligns it
        # on the index and fills the removed rows with NaN, which silently
        # upcasts integer data to float and bypasses the integer branch below.
        seq = DataFrame({'seq': data})['seq']

        if seq.dtype != 'float' and seq.dtype != 'int':
            raise TypeError("The sequence dtype was neither int nor "
                            "float. Convert it to either int or float, "
                            "and try again.")

        if sign == 'all':
            keep = seq != 0
        elif sign == 'pos':
            keep = seq > 0
        else:
            keep = seq < 0

        # Missing values are dropped together with the sign filter.
        DataFrame.__init__(self, {'seq': seq.loc[keep & seq.notna()]})

        ab = self.seq.abs()

        if self.seq.dtype == 'int':
            self['ZN'] = ab
        elif decimals == 'infer':
            # Strip the decimal point and leading zeros, then keep at most
            # five significant characters.
            self['ZN'] = ab.astype(str).str\
                           .replace('.', '', regex=False)\
                           .str.lstrip('0')\
                           .str[:5].astype(int)
        else:
            self['ZN'] = (ab * (10 ** decimals)).astype(int)

        # First digits
        for col in ['F1D', 'F2D', 'F3D']:
            # REV_DIGS[col] is used as the number of leading digits wanted.
            n_digs = REV_DIGS[col]
            temp = self.ZN.loc[self.ZN >= 10 ** (n_digs - 1)]
            self[col] = (temp // 10 ** ((log10(temp).astype(int)) -
                                        (n_digs - 1)))
            # fill NANs with -1, which is a non-usable value for digits,
            # to be discarded later.
            self[col] = self[col].fillna(-1).astype(int)

        # Second digit: needs at least two digits in ZN
        temp_sd = self.loc[self.ZN >= 10]
        self['SD'] = (temp_sd.ZN // 10**((log10(temp_sd.ZN)).astype(int) -
                                         1)) % 10
        self['SD'] = self['SD'].fillna(-1).astype(int)

        # Last two digits: only taken from ZN values of 1000 and above
        temp_l2d = self.loc[self.ZN >= 1000]
        self['L2D'] = temp_l2d.ZN % 100
        self['L2D'] = self['L2D'].fillna(-1).astype(int)


class Test(DataFrame):
    """Transforms the original number sequence into a DataFrame reduced
    by the occurrences of the chosen digits, creating other computed
    columns.

    The frame starts from the expected distribution for ``digs`` and gains
    the columns ``Counts``, ``Found``, ``Dif``, ``AbsDif`` and ``Z_score``.

    Args:
        base: The Base object with the data prepared for Analysis
        digs: Tells which test to perform: 1: first digit; 2: first two digits;
            3: first three digits; 22: second digit; -2: last two digits.
        confidence (int, float): confidence level to draw lower and upper limits when
            plotting and to limit the top deviations to show.
        limit_N (int): sets a limit to N as the sample size for the calculation of
            the Z scores if the sample is too big. Defaults to None.
        sec_order (bool): marks the test as a Second Order one, which only
            changes the test name. Defaults to False.

    Attributes:
        limit_N: Number of records in the sample to consider in computations
        ddf: Degrees of Freedom to look up for the critical chi-square value
        chi_square: Chi-square statistic for the given test
        KS: Kolmogorov-Smirnov statistic for the given test
        MAD: Mean Absolute Deviation for the given test
        MSE: Mean Squared Error for the given test
        bhattacharyya_coefficient: Bhattacharyya coefficient between the
            found and the expected proportions
        bhattacharyya_distance: Bhattacharyya distance between the found and
            the expected proportions
        kullback_leibler_divergence: Kullback-Leibler divergence between the
            found and the expected proportions
        confidence: Confidence level to consider when setting some critical values
        digs (int): numerical representation of the test at hand. 1: F1D; 2: F2D;
            3: F3D; 22: SD; -2: L2D.
        sec_order (bool): True if the test is a Second Order one
        name (str): the name of the test
    """

    def __init__(self, base, digs, confidence, limit_N=None, sec_order=False):
        # create a separated Expected distributions object
        super().__init__(_get_expected_digits_(digs))
        digits = base[DIGS[digs]]
        # create column with occurrences of the digits in the base
        self['Counts'] = digits.value_counts()
        # create column with relative frequencies
        self['Found'] = digits.value_counts(normalize=True)
        # digits that never occurred get zero counts and frequencies
        self.fillna(0, inplace=True)
        # create columns with signed and absolute differences
        self['Dif'] = self.Found - self.Expected
        self['AbsDif'] = self.Dif.abs()
        self.limit_N = _set_N_(len(base), limit_N)
        self['Z_score'] = Z_score(self, self.limit_N)
        self.ddf = len(self) - 1
        self.chi_square = chi_sq_2(self)
        self.KS = kolmogorov_smirnov_2(self)
        self.MAD = self.AbsDif.mean()
        self.MSE = (self.AbsDif ** 2).mean()
        found, expected = self.Found.values, self.Expected.values
        self.bhattacharyya_coefficient = _bhattacharyya_coefficient(
            found, expected)
        self.bhattacharyya_distance = _bhattacharyya_distance_(
            found, expected)
        self.kullback_leibler_divergence = _kullback_leibler_divergence_(
            found, expected)
        self.confidence = confidence
        self.digs = digs
        self.sec_order = sec_order

        if sec_order:
            self.name = TEST_NAMES[SEC_ORDER_DIGS[digs]]
        else:
            self.name = TEST_NAMES[DIGS[digs]]

    def update_confidence(self, new_conf, check=True):
        """Sets a new confidence level for the test, so as to be used to
        produce critical values for it.

        Args:
            new_conf: new confidence level to draw lower and upper limits when
                plotting and to limit the top deviations to show, as well as to
                calculate critical values for the tests' statistics.
            check: checks the value provided for the confidence. Defaults to True
        """
        if check:
            self.confidence = _check_confidence_(new_conf)
        else:
            self.confidence = new_conf

    @property
    def critical_values(self):
        """dict: a dictionary with the critical values for the test at hand,
            according to the current confidence level."""
        # No KS critical value is computed when the confidence is unset.
        if self.confidence:
            crit_ks = CRIT_KS[self.confidence] / (self.limit_N ** 0.5)
        else:
            crit_ks = None
        return {'Z': CONFS[self.confidence],
                'KS': crit_ks,
                'chi2': CRIT_CHI2[self.ddf][self.confidence],
                'MAD': MAD_CONFORM[self.digs]}

    def show_plot(self, save_plot=None, save_plot_kwargs=None):
        """Draws the test plot.

        Args:
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is inferred by the file name extension.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when save_plot is a string with the figure file
                path/name.
        """
        x, figsize, text_x = _get_plot_args(self.digs)
        plot_digs(self, x=x, y_Exp=self.Expected, y_Found=self.Found,
                  N=self.limit_N, figsize=figsize,
                  conf_Z=CONFS[self.confidence], text_x=text_x,
                  save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)

    def report(self, high_Z='pos', show_plot=True,
               save_plot=None, save_plot_kwargs=None):
        """Handles the report specific to the test, considering its statistics
        and according to the current confidence level.

        Args:
            high_Z (str or int): chooses which Z scores to be used when
                displaying results, according to the confidence level chosen.
                Defaults to 'pos', which will highlight only values higher
                than the expected frequencies; 'all' will highlight both
                extremes (positive and negative); and an integer, which will
                use the first n entries, positive and negative, regardless of
                whether Z is higher than the critical value or not.
            show_plot: calls the show_plot method, to draw the test plot
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is inferred by the file name extension. Only available when
                show_plot=True.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when show_plot=True and save_plot is a string
                with the figure file path/name.
        """
        high_Z = _check_high_Z_(high_Z)
        _report_test_(self, high_Z, self.critical_values)
        if show_plot:
            self.show_plot(save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)


class Summ(DataFrame):
    """Gets the base object and outputs a Summation test object.

    The frame is indexed by the digits of the chosen test and holds the
    columns ``Sum`` (sum of the absolute values of the records starting with
    those digits), ``Percent`` (share of the grand total) and ``AbsDif``
    (absolute distance of that share from the uniform expectation).

    Args:
       base: The Base object with the data prepared for Analysis
       test: The test for which to compute the summation

    Attributes:
        expected: the expected share for each digit group, ``1 / len(self)``
        MAD: Mean Absolute Deviation for the test
        MSE: Mean Squared Error for the test
        confidence: always None for Summation tests
        digs (int): numerical representation of the test at hand
        name (str): the name of the Summation test
    """

    def __init__(self, base, test):
        # Rows flagged with -1 were discarded for this test. They must be
        # removed before grouping; taking abs() of the whole frame would turn
        # the -1 flag into 1 and fold those rows into digit group 1.
        valid = base.loc[base[test] != -1, [test, 'seq']]
        sums = (valid.assign(seq=valid['seq'].abs())
                     .groupby(test)['seq']
                     .sum())
        super().__init__({'Sum': sums})
        self['Percent'] = self['Sum'] / self['Sum'].sum()
        self.expected = 1 / len(self)
        self['AbsDif'] = (self['Percent'] - self.expected).abs()
        self.index = self.index.astype(int)
        #: Mean Absolute Deviation for the test
        self.MAD = self.AbsDif.mean()
        self.MSE = (self.AbsDif ** 2).mean()
        #: Confidence level to consider when setting some critical values
        self.confidence = None
        # (int): numerical representation of the test at hand
        self.digs = REV_DIGS[test]
        # (str): the name of the Summation test.
        self.name = TEST_NAMES[f'{test}_Summ']

    def show_plot(self, save_plot=None, save_plot_kwargs=None):
        """Draws the Summation test plot

        Args:
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is inferred by the file name extension.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when save_plot is a string with the figure file
                path/name.
        """
        # The figure grows with the number of digits of the test.
        scale = self.digs ** 2 + 5
        figsize = (2 * scale, 1.5 * scale)
        plot_sum(self, figsize, self.expected,
                 save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)

    def report(self, high_diff=None, show_plot=True,
               save_plot=None, save_plot_kwargs=None):
        """Gives the report on the Summation test.

        Args:
            high_diff: Number of records to show after ordering by the absolute
                differences between the found and the expected proportions
            show_plot: calls the show_plot method, to draw the Summation test plot
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is inferred by the file name extension. Only available when
                show_plot=True.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when show_plot=True and save_plot is a string
                with the figure file path/name.
        """
        _report_test_(self, high_diff)
        if show_plot:
            self.show_plot(save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)


class Mantissas:
    """Computes and holds the mantissas of the logarithms of the records

    Args:
        data: sequence to compute mantissas from. numpy 1D array, pandas
            Series or pandas DataFrame column.
        confidence: confidence level for computing the critical values to
            compare with some statistics
        limit_N (int): sets a limit to N as the sample size used for the
            KS critical value. Defaults to None.

    Attributes:
        data (DataFrame): frame with the ``Mantissa`` column; ``arc_test``
            adds the ``mant_x`` and ``mant_y`` columns.
        confidence: current confidence level
        limit_N: sample size to use in computations
        gravity_center: ``(x, y)`` tuple set by ``arc_test``; None before
            the arc test is run.
    """

    def __init__(self, data, confidence=95, limit_N=None):

        data = Series(_check_num_array_(data))
        # Missing values and zeros are removed; the sign is ignored.
        data = data.dropna().loc[data != 0].abs()
        self.limit_N = _set_N_(len(data), limit_N)
        #: (DataFrame): pandas DataFrame with the mantissas
        self.data = DataFrame({'Mantissa': get_mantissas(data)})
        self.confidence = confidence
        # Defined up front so the attribute exists before arc_test is called.
        self.gravity_center = None

    @property
    def stats(self):
        """dict: the mantissas statistics (mean, variance, skewness and
        kurtosis), plus the KS statistic and its critical value for the
        current confidence level."""
        mant = self.data.Mantissa
        ks, crit_ks = _mantissas_ks_(mant.values,
                                     self.confidence, self.limit_N)
        return {'Mean': mant.mean(),
                'Var': mant.var(),
                'Skew': mant.skew(),
                'Kurt': mant.kurt(),
                'KS': ks,
                'KS_critical': crit_ks}

    def update_confidence(self, new_conf, check=True):
        """Sets a new confidence level for the Mantissas object, so as to be
        used to produce critical values for the tests

        Args:
            new_conf: new confidence level to draw lower and upper limits when
                plotting and to limit the top deviations to show, as well as to
                calculate critical values for the tests' statistics.
            check: checks the value provided for the confidence. Defaults to True
        """
        if check:
            self.confidence = _check_confidence_(new_conf)
        else:
            self.confidence = new_conf

    def report(self, show_plot=True, save_plot=None, save_plot_kwargs=None):
        """Displays the Mantissas test stats.

        Args:
            show_plot: shows the Ordered Mantissas plot and the Arc Test plot.
                Defaults to True.
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is inferred by the file name extension. Only available when
                show_plot=True.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when show_plot=True and save_plot is a string
                with the figure file path/name.
        """
        _report_mantissa_(self.stats, confidence=self.confidence)

        if show_plot:
            # NOTE(review): both plots receive the same save_plot path, so
            # the second one presumably overwrites the first - confirm.
            self.show_plot(save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)
            self.arc_test(save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)

    def show_plot(self, figsize=(12, 6), save_plot=None, save_plot_kwargs=None):
        """Plots the ordered mantissas and a line with the expected
        inclination.

        Args:
            figsize (tuple): figure size dimensions
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is inferred by the file name extension.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when save_plot is a string with the figure file
                path/name.
        """
        plot_ordered_mantissas(self.data.Mantissa, figsize=figsize,
                               save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)

    def arc_test(self, grid=True, figsize=12,
                 save_plot=None, save_plot_kwargs=None):
        """Adds two columns to Mantissas's DataFrame equal to their "X" and "Y"
        coordinates, plots them in a scatter plot and calculates the gravity
        center of the circle.

        Args:
            grid: show grid of the plot. Defaults to True.
            figsize (int): size of the figure to be displayed. Since it is a square,
                there is no need to provide a tuple, like is usually the case with
                matplotlib.
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is inferred by the file name extension.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when save_plot is a string with the figure file
                path/name.
        """
        # Each mantissa in [0, 1) is mapped to an angle on the unit circle.
        angle = 2 * pi * self.data.Mantissa
        self.data['mant_x'] = cos(angle)
        self.data['mant_y'] = sin(angle)
        self.gravity_center = (self.data.mant_x.mean(), self.data.mant_y.mean())

        plot_mantissa_arc_test(self.data, self.gravity_center,
                               grid=grid, figsize=figsize,
                               save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)


class Benford(object):
    """Initializes a Benford Analysis object and computes the proportions for
    the digits. The test DataFrames are attributes, i.e., obj.F1D is the First
    Digit DataFrame, obj.F2D the First Two Digits one, and so on: F3D for
    First Three Digits, SD for Second Digit and L2D for Last Two Digits.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a tuple with a pandas DataFrame and the name (str)
            of the chosen column. Values must be integers or floats.
        decimals: number of decimal places to consider. Defaults to 2.
            If integers, set to 0. If set to 'infer', it will remove the zeros
            and consider up to the fifth decimal place to the right, but will
            lose performance.
        sign: tells which portion of the data to consider. pos: only the positive
            entries; neg: only negative entries; all: all entries but zeros.
            Defaults to all.
        confidence (int, float): confidence level to draw lower and upper limits when
            plotting and to limit the top deviations to show, as well as to
            calculate critical values for the tests' statistics. Defaults to 95.
        mantissas (bool): opts for also running the mantissas Test. Defaults to
            True
        sec_order: runs the Second Order tests, which are the Benford's tests
            performed on the differences between the ordered sample (a value minus
            the one before it, and so on). If the original series is Benford-
            compliant, this new sequence should also follow Benford. The Second
            Order can also be called separately, through the method sec_order().
        summation: creates the Summation DataFrames for the First, First Two, and
            First Three Digits. The summation tests can also be called separately,
            through the method summation().
        limit_N (int): sets a limit to N as the sample size for the calculation of
            the Z scores if the sample is too big. Defaults to None.
        verbose: gives some information about the data and the registries used
            and discarded for each test.

    Attributes:
        data: the raw data provided for the analysis
        chosen: the column of the DataFrame to be analysed or the data itself
        decimals: number of decimal places considered
        sign (str): which number sign(s) to include in the analysis
        confidence: current confidence level
        limit_N (int): sample size to use in computations
        verbose (bool): verbose or not
        base: the Base, pre-processed object
        tests (:obj:`list` of :obj:`str`): keeps track of the tests the
            instance has
    """

    def __init__(self, data, decimals=2, sign='all', confidence=95,
                 mantissas=True, sec_order=False, summation=False,
                 limit_N=None, verbose=True):
        self.data, self.chosen = input_data(data)
        self.decimals = decimals
        self.sign = sign
        self.confidence = _check_confidence_(confidence)
        self.limit_N = limit_N
        self.verbose = verbose
        self.base = Base(self.chosen, decimals, sign)
        self.tests = []

        # Create a DataFrame for each Test, using only the rows that are
        # usable for it (-1 flags a discarded row).
        for key, val in DIGS.items():
            test = Test(self.base.loc[self.base[val] != -1],
                        digs=key, confidence=self.confidence,
                        limit_N=self.limit_N)
            setattr(self, val, test)
            self.tests.append(val)
        # dict with the numbers of discarded entries for each test column
        self._discarded = {col: int((self.base[col] == -1).sum())
                           for col in DIGS.values()}

        if self.verbose:
            print('\n', ' Benford Object Instantiated '.center(50, '#'), '\n')
            print(f'Initial sample size: {len(self.chosen)}.\n')
            print(f'Test performed on {len(self.base)} registries.\n')
            print(
                f'Number of discarded entries for each test:\n{self._discarded}')

        if mantissas:
            self.mantissas()

        if sec_order:
            self.sec_order()

        if summation:
            self.summation()

    def _register_test(self, name):
        """Records a test name once, so re-running a test does not duplicate
        its entry in the tests list."""
        if name not in self.tests:
            self.tests.append(name)

    def update_confidence(self, new_conf, tests=None):
        """Sets (a) new confidence level(s) for the Benford object, so as to be
        used to produce critical values for the tests.

        Args:
            new_conf: new confidence level to draw lower and upper limits when
                plotting and to limit the top deviations to show, as well as to
                calculate critical values for the tests' statistics.
            tests (:obj:`list` of :obj:`str`): list of tests names (strings) to
                have their confidence updated. If only one, provide a one-element
                list, like ['F1D']. Defaults to None, in which case it will use
                the instance .tests list attribute.

        Raises:
            ValueError: if the test argument is not a `list` or `None`.
        """
        self.confidence = _check_confidence_(new_conf)
        if tests is None:
            tests = self.tests
        elif not isinstance(tests, list):
            raise ValueError('tests must be a list or None.')
        for test in tests:
            try:
                getattr(self, test).update_confidence(
                            self.confidence, check=False)
            except AttributeError as e:
                # Summation tests have no update_confidence method; any
                # other failure is reported as a wrong test name.
                if test not in ['F1D_Summ', 'F2D_Summ', 'F3D_Summ']:
                    print(e,
                        f"\n\n{test} not in Benford instance tests - "
                        "review test's name.")

    @property
    def all_confidences(self):
        """dict: a dictionary with a confidence level for each computed tests,
        when applicable."""
        con_dic = {}
        for key in self.tests:
            try:
                con_dic[key] = getattr(self, key).confidence
            except AttributeError:
                continue
        return con_dic

    def mantissas(self):
        """Adds a Mantissas object to the tests, with all its statistics and
        plotting capabilities.
        """
        self.Mantissas = Mantissas(self.base.seq.values,
                                   self.confidence, self.limit_N)
        self._register_test('Mantissas')
        if self.verbose:
            print('\nAdded Mantissas test.')

    def sec_order(self):
        """Runs the Second Order tests, which are the Benford's tests
        performed on the differences between the ordered sample (a value minus
        the one before it, and so on). If the original series is Benford-
        compliant, this new sequence should also follow Benford.
        """
        #: Base instance of the differences between the ordered sample
        self.base_sec = Base(subtract_sorted(self.chosen),
                             decimals=self.decimals, sign=self.sign)
        for key, val in DIGS.items():
            test = Test(self.base_sec.loc[self.base_sec[val] != -1],
                        digs=key, confidence=self.confidence,
                        limit_N=self.limit_N, sec_order=True)
            setattr(self, SEC_ORDER_DIGS[key], test)
            self._register_test(f'{val}_sec')
        # Computed once, after the loop: the counts depend only on base_sec.
        # Keys and columns are paired in the iteration order of both dicts.
        self._discarded_sec = {
            sec_name: int((self.base_sec[col] == -1).sum())
            for sec_name, col in zip(SEC_ORDER_DIGS.values(), DIGS.values())}
        if self.verbose:
            print(f'\nSecond order tests run in {len(self.base_sec)} '
                  'registries.\n\nNumber of discarded entries for second order'
                  f' tests:\n{self._discarded_sec}')

    def summation(self):
        """Creates Summation test DataFrames from Base object"""
        for test in ['F1D', 'F2D', 'F3D']:
            t = f'{test}_Summ'
            setattr(self, t, Summ(self.base, test))
            self._register_test(t)

        if self.verbose:
            print('\nAdded Summation DataFrames to F1D, F2D and F3D Tests.')


[docs]class Source(DataFrame):
    """Prepares the data for Analysis. pandas DataFrame subclass.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a pandas DataFrame column, with values being
            integers or floats.
        decimals: number of decimal places to consider. Defaluts to 2.
            If integers, set to 0. If set to -infer-, it will remove the zeros
            and consider up to the fifth decimal place to the right, but will
            loose performance.
        sign: tells which portion of the data to consider. pos: only the positive
            entries; neg: only negative entries; all: all entries but zeros.
            Defaults to all.
        sec_order: choice for the Second Order Test, which cumputes the
            differences between the ordered entries before running the Tests.
        verbose (bool): tells the number of registries that are being subjected to
            the analysis; defaults to True.

    Raises:
        ValueError: if the `sign` arg is not in ['all', 'pos', 'neg']
        TypeError: if not receiving `int` or `float` as input.
    """

    def __init__(self, data, decimals=2, sign='all', sec_order=False,
                 verbose=True, inform=None):

        if sign not in ['all', 'pos', 'neg']:
            raise ValueError("The -sign- argument must be "
                             "'all','pos' or 'neg'.")

        DataFrame.__init__(self, {'seq': data})

        if self.seq.dtype != 'float' and self.seq.dtype != 'int':
            raise TypeError('The sequence dtype was neither int nor float.\n'
                            'Convert it to whether int or float, and try again.')

        if sign == 'pos':
            self.seq = self.seq.loc[self.seq > 0]
        elif sign == 'neg':
            self.seq = self.seq.loc[self.seq < 0]
        else:
            self.seq = self.seq.loc[self.seq != 0]

        self.dropna(inplace=True)
        #: (bool): verbose or not
        self.verbose = _deprecate_inform_(verbose, inform)
        if self.verbose:
            print(f"\nInitialized sequence with {len(self)} registries.")

        if sec_order:
            self.seq = subtract_sorted(self.seq.copy())
            self.dropna(inplace=True)
            self.reset_index(inplace=True)
            if verbose:
                print('Second Order Test. Initial series reduced '
                      f'to {len(self.seq)} entries.')

        ab = self.seq.abs()

        if self.seq.dtype == 'int':
            self['ZN'] = ab
        else:
            if decimals == 'infer':
                # There is some numerical issue with Windows that required
                # implementing it differently (and slower)
                self['ZN'] = ab.astype(str)\
                               .str.replace('.', '', regex=False)\
                               .str.lstrip('0').str[:5]\
                               .astype(int)
            else:
                self['ZN'] = (ab * (10 ** decimals)).astype(int)

[docs]    def mantissas(self, report=True, show_plot=True, figsize=(15, 8),
                  save_plot=None, save_plot_kwargs=None):
        """Calculates the mantissas, their mean and variance, and compares them
        with the mean and variance of a Benford's sequence.

        Args:
            report: prints the mamtissas mean, variance, skewness and kurtosis
                for the sequence studied, along with reference values.
            show_plot: plots the ordered mantissas and a line with the expected
                inclination. Defaults to True.
            figsize: tuple that sets the figure dimensions.
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is infered by the file name extension. Only available when
                plot=True.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when plot=True and save_plot is a string with the
                figure file path/name.
        """
        self['Mant'] = get_mantissas(self.seq.abs())
        if report:
            p = self[['seq', 'Mant']]
            p = p.loc[p.seq > 0].sort_values('Mant')
            print(f"The Mantissas MEAN is {p.Mant.mean()}. Ref: 0.5.")
            print(f"The Mantissas VARIANCE is {p.Mant.var()}. Ref: 0.083333.")
            print(f"The Mantissas SKEWNESS is {p.Mant.skew()}. \tRef: 0.")
            print(f"The Mantissas KURTOSIS is {p.Mant.kurt()}. \tRef: -1.2.")

        if show_plot:
            plot_ordered_mantissas(self.Mant, figsize=figsize,
                                   save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)

[docs]    def first_digits(self, digs, confidence=None, high_Z='pos',
                     limit_N=None, MAD=False, MSE=False, chi_square=False,
                     KS=False, show_plot=True, save_plot=None, save_plot_kwargs=None,
                     simple=False, bhat_coeff = False, bhat_dist=False,
                     kl_diverg=False, ret_df=False):
        """Performs the Benford First Digits test with the series of
        numbers provided, and populates the mapping dict for future
        selection of the original series.

        Args:
            digs (int): number of first digits to consider. Must be 1 (first digit),
                2 (first two digits) or 3 (first three digits).
            verbose (bool): tells the number of registries that are being subjected to
                the analysis; defaults to True
            confidence (int, float): confidence level to draw lower and upper limits when
                plotting and to limit the top deviations to show, as well as to
                calculate critical values for the tests' statistics. Defaults to None.
            high_Z (int): chooses which Z scores to be used when displaying results,
                according to the confidence level chosen. Defaluts to 'pos',
                which will highlight only values higher than the expexted
                frequencies; 'all' will highlight both extremes (positive and
                negative); and an integer, which will use the first n entries,
                positive and negative, regardless of whether Z is higher than
                the confidence or not.
            limit_N (int): sets a limit to N as the sample size for the calculation of
                the Z scores if the sample is too big. Defaults to None.
            MAD (bool): calculates the Mean Absolute Difference between the
                found and the expected distributions; defaults to False.
            MSE (bool): calculates the Mean Square Error of the sample; defaults to
                False.
            bhat_coeff (bool): computes the Bhattacharyya Coefficient between
                the found and the expected (Benford) digits distribution; defaults
                to Fasle
            bhat_dist (bool): calculates the Bhattacharyya Distance between
                the found and the expected (Benford) digits distribution; defaults
                to Fasle
            kl_diverg (bool): calculates the Kulback-Laibler Divergence between
                the found and the expected (Benford) digits distribution;
                defaults to False
            show_plot (bool): draws the test plot. Defaults to True.
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is infered by the file name extension. Only available when
                plot=True.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when plot=True and save_plot is a string with the
                figure file path/name.
            ret_df: returns the test DataFrame. Defaults to False. True if run by
                the test function.

        Returns:
            DataFrame with the Expected and Found proportions, and the Z scores of
                the differences
        """
        # Check on the possible values for confidence levels
        confidence = _check_confidence_(confidence)
        # Check on possible digits
        _check_digs_(digs)

        temp = self.loc[self.ZN >= 10 ** (digs - 1)]
        temp[DIGS[digs]] = (temp.ZN // 10 ** ((log10(temp.ZN).astype(
                                                   int)) - (digs - 1))).astype(
                                                       int)
        n, m = 10 ** (digs - 1), 10 ** (digs)
        x = arange(n, m)

        if simple:
            self.verbose = False
            show_plot = False
            df = prepare(temp[DIGS[digs]], digs, limit_N=limit_N,
                         simple=True)
        else:
            N, df = prepare(temp[DIGS[digs]], digs, limit_N=limit_N,
                            simple=False)

        if self.verbose:
            print(f"\nTest performed on {len(temp)} registries.\n"
                  f"Discarded {len(self) - len(temp)} records < {10 ** (digs - 1)}"
                  " after preparation.")
            if confidence is not None:
                _inform_(df, high_Z=high_Z, conf=CONFS[confidence])

        # Mean absolute difference
        if MAD:
            self.MAD = df.AbsDif.mean()
            if self.verbose:
                _report_mad_(digs, self.MAD)

        # Mean Square Error
        if MSE:
            self.MSE = (df.AbsDif ** 2).mean()

        # Chi-square statistic
        if chi_square:
            self.chi_square = chi_sq(df, ddf=len(df) - 1,
                                     confidence=confidence,
                                     verbose=self.verbose)
        # KS test
        if KS:
            self.KS = kolmogorov_smirnov(df, confidence=confidence, N=len(temp),
                                         verbose=self.verbose)

        if bhat_coeff:
            self.bhat_coeff = _bhattacharyya_coefficient(
                                df.Found.values, df.Expected.values)

        if bhat_dist:
            self.bhat_dist = _bhattacharyya_distance_(
                                df.Found.values, df.Expected.values)
        
        if kl_diverg:
            self.kl_diverg = _kullback_leibler_divergence_(
                                df.Found.values, df.Expected.values)

        # Plotting the expected frequncies (line) against the found ones(bars)
        if show_plot:
            plot_digs(df, x=x, y_Exp=df.Expected, y_Found=df.Found, N=N,
                       figsize=(2 * (digs ** 2 + 5), 1.5 * (digs ** 2 + 5)),
                       conf_Z=CONFS[confidence], save_plot=save_plot,
                       save_plot_kwargs=save_plot_kwargs)
        if ret_df:
            return df

[docs]    def second_digit(self, confidence=None, high_Z='pos',
                     limit_N=None, MAD=False, MSE=False, chi_square=False,
                     KS=False, bhat_coeff=False, bhat_dist=False, kl_diverg=False,
                     show_plot=True, save_plot=None, save_plot_kwargs=None,
                     simple=False, ret_df=False):
        """Performs the Benford Second Digit test with the series of
        numbers provided.

        Args:
            verbose (bool): tells the number of registries that are being subjected to
                the analysis; defaults to True
            MAD (bool): calculates the Mean Absolute Difference between the
                found and the expected distributions; defaults to False.
            confidence (int, float): confidence level to draw lower and upper limits when
                plotting and to limit the top deviations to show, as well as to
                calculate critical values for the tests' statistics. Defaults to None.
            high_Z (int): chooses which Z scores to be used when displaying results,
                according to the confidence level chosen. Defaluts to 'pos',
                which will highlight only values higher than the expexted
                frequencies; 'all' will highlight both extremes (positive and
                negative); and an integer, which will use the first n entries,
                positive and negative, regardless of whether Z is higher than
                the confidence or not.
            limit_N (int): sets a limit to N as the sample size for the calculation of
                the Z scores if the sample is too big. Defaults to None.
            MSE (bool): calculates the Mean Square Error of the sample; defaults to
                False.
            bhat_coeff (bool): computes the Bhattacharyya Coefficient between
                the found and the expected (Benford) digits distribution; defaults
                to Fasle
            bhat_dist (bool): calculates the Bhattacharyya Distance between
                the found and the expected (Benford) digits distribution; defaults
                to Fasle
            kl_diverg (bool): calculates the Kulback-Laibler Divergence between
                the found and the expected (Benford) digits distribution;
                defaults to False
            show_plot (bool): draws the test plot.
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is infered by the file name extension. Only available when
                plot=True.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when plot=True and save_plot is a string with the
                figure file path/name.
            ret_df: returns the test DataFrame. Defaults to False. True if run by
                the test function.

        Returns:
            DataFrame with the Expected and Found proportions, and the Z scores of
                the differences
        """
        confidence = _check_confidence_(confidence)

        conf = CONFS[confidence]

        temp = self.loc[self.ZN >= 10, :]
        temp['SD'] = (temp.ZN // 10 ** ((log10(temp.ZN)).astype(
                      int) - 1)) % 10

        if simple:
            self.verbose = False
            show_plot = False
            df = prepare(temp['SD'], 22, limit_N=limit_N, simple=True)
        else:
            N, df = prepare(temp['SD'], 22, limit_N=limit_N, simple=False)

        if self.verbose:
            print(f"\nTest performed on {len(temp)} registries.\nDiscarded "
                  f"{len(self) - len(temp)} records < 10 after preparation.")
            if confidence is not None:
                _inform_(df, high_Z, conf)

        # Mean absolute difference
        if MAD:
            self.MAD = df.AbsDif.mean()
            if self.verbose:
                _report_mad_(22, self.MAD)
        # Mean Square Error
        if MSE:
            self.MSE = (df.AbsDif ** 2).mean()

        # Chi-square statistic
        if chi_square:
            self.chi_square = chi_sq(df, ddf=9, confidence=confidence,
                                     verbose=self.verbose)
        # KS test
        if KS:
            self.KS = kolmogorov_smirnov(df, confidence=confidence, N=len(temp),

                                         verbose=self.verbose)
        if bhat_coeff:
            self.bhat_coeff = _bhattacharyya_coefficient(
                                df.Found.values, df.Expected.values)

        if bhat_dist:
            self.bhat_dist = _bhattacharyya_distance_(
                                df.Found.values, df.Expected.values
                            )
        
        if kl_diverg:
            self.kl_diverg = _kullback_leibler_divergence_(
                                df.Found.values, df.Expected.values
                            )

        # Plotting the expected frequncies (line) against the found ones(bars)
        if show_plot:
            plot_digs(df, x=arange(0, 10), y_Exp=df.Expected,
                       y_Found=df.Found, N=N, figsize=(10, 6), conf_Z=conf,
                       save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)
        if ret_df:
            return df

[docs]    def last_two_digits(self, confidence=None, high_Z='pos',
                        limit_N=None, MAD=False, MSE=False, chi_square=False,
                        KS=False, bhat_coeff=False, bhat_dist=False, kl_diverg=False,
                        show_plot=True, save_plot=None, save_plot_kwargs=None,
                        simple=False, ret_df=False):
        """Performs the Benford Last Two Digits test with the series of
        numbers provided.

        Args:
            verbose (bool): tells the number of registries that are being subjected to
                the analysis; defaults to True
            MAD (bool): calculates the Mean Absolute Difference between the
                found and the expected distributions; defaults to False.
            confidence (int, float): confidence level to draw lower and upper limits when
                plotting and to limit the top deviations to show, as well as to
                calculate critical values for the tests' statistics. Defaults to None.
            high_Z (int): chooses which Z scores to be used when displaying results,
                according to the confidence level chosen. Defaluts to 'pos',
                which will highlight only values higher than the expexted
                frequencies; 'all' will highlight both extremes (positive and
                negative); and an integer, which will use the first n entries,
                positive and negative, regardless of whether Z is higher than
                the confidence or not.
            limit_N (int): sets a limit to N as the sample size for the calculation of
                the Z scores if the sample is too big. Defaults to None.
            MSE (bool): calculates the Mean Square Error of the sample; defaults to
                False.
            bhat_coeff (bool): computes the Bhattacharyya Coefficient between
                the found and the expected (Benford) digits distribution; defaults
                to Fasle
            bhat_dist (bool): calculates the Bhattacharyya Distance between
                the found and the expected (Benford) digits distribution; defaults
                to Fasle
            kl_diverg (bool): calculates the Kulback-Laibler Divergence between
                the found and the expected (Benford) digits distribution;
                defaults to False
            show_plot (bool): draws the test plot.
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is infered by the file name extension. Only available when
                plot=True.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when plot=True and save_plot is a string with the
                figure file path/name.
        
        Returns:
            DataFrame with the Expected and Found proportions, and the Z scores of
                the differences
        """
        confidence = _check_confidence_(confidence)
        conf = CONFS[confidence]

        temp = self.loc[self.ZN >= 1000]
        temp['L2D'] = temp.ZN % 100

        if simple:
            self.verbose = False
            show_plot = False
            df = prepare(temp['L2D'], -2, limit_N=limit_N, simple=True)
        else:
            N, df = prepare(temp['L2D'], -2, limit_N=limit_N, simple=False)

        if self.verbose:
            print(f"\nTest performed on {len(temp)} registries.\n\nDiscarded "
                  f"{len(self) - len(temp)} records < 1000 after preparation")
            if confidence is not None:
                _inform_(df, high_Z, conf)

        # Mean absolute difference
        if MAD:
            self.MAD = df.AbsDif.mean()
            if self.verbose:
                _report_mad_(-2, self.MAD)
        # Mean Square Error
        if MSE:
            self.MSE = (df.AbsDif ** 2).mean()

        # Chi-square statistic
        if chi_square:
            self.chi_square = chi_sq(df, ddf=99, confidence=confidence,
                                     verbose=self.verbose)
        # KS test
        if KS:
            self.KS = kolmogorov_smirnov(df, confidence=confidence, N=len(temp),
                                         verbose=self.verbose)

        if bhat_coeff:
            self.bhat_coeff = _bhattacharyya_coefficient(
                                df.Found.values, df.Expected.values)

        if bhat_dist:
            self.bhat_dist = _bhattacharyya_distance_(
                                df.Found.values, df.Expected.values)
        
        if kl_diverg:
            self.kl_diverg = _kullback_leibler_divergence_(
                                df.Found.values, df.Expected.values)

        # Plotting expected frequencies (line) versus found ones (bars)
        if show_plot:
            plot_digs(df, x=arange(0, 100), y_Exp=df.Expected,
                       y_Found=df.Found, N=N, figsize=(15, 5),
                       conf_Z=conf, text_x=True, save_plot=save_plot,
                       save_plot_kwargs=save_plot_kwargs)
        if ret_df:
            return df

[docs]    def summation(self, digs=2, top=20, show_plot=True, save_plot=None,
                  save_plot_kwargs=None, ret_df=False):
        """Performs the Summation test. In a Benford series, the sums of the
        entries begining with the same digits tends to be the same.

        Args:
            digs: tells the first digits to use. 1- first; 2- first two;
                3- first three. Defaults to 2.
            top: choses how many top values to show. Defaults to 20.
            show_plot: plots the results. Defaults to True.
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is infered by the file name extension. Only available when
                plot=True.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when plot=True and save_plot is a string with the
                figure file path/name.
        
        Returns:
            DataFrame with the Expected and Found proportions, and their
                absolute differences
        """
        _check_digs_(digs)

        if digs == 1:
            top = 9
        # Call the dict for F1D, F2D, F3D
        d = DIGS[digs]
        if d not in self.columns:
            self[d] = self.ZN.astype(str).str[:digs].astype(int)
        # Call the expected proportion according to digs
        li = 1. / (9 * (10 ** (digs - 1)))

        df = self.groupby(d).sum()
        # s.drop(0, inplace=True)
        df['Percent'] = df.ZN / df.ZN.sum()
        df.columns.values[1] = 'Summ'
        df = df[['Summ', 'Percent']]
        df['AbsDif'] = (df.Percent - li).abs()

        if self.verbose:
            # N = len(self)
            print(f"\nTest performed on {len(self)} registries.\n")
            print(f"The top {top} diferences are:\n")
            print(df[:top])

        if show_plot:
            plot_sum(df, figsize=(
                       2 * (digs ** 2 + 5), 1.5 * (digs ** 2 + 5)), li=li,
                       save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)

        if ret_df:
            return df

[docs]    def duplicates(self, top_Rep=20, inform=None):
        """Performs a duplicates test and maps the duplicates count in descending
        order.

        Args:
            verbose (bool): tells how many duplicated entries were found and prints the
                top numbers according to the top_Rep argument. Defaluts to True.
            top_Rep: int or None. Chooses how many duplicated entries will be
                shown withe the top repititions. Defaluts to 20. If None, returns
                al the ordered repetitions.

        Returns:
            DataFrame with the duplicated records and their occurrence counts,
                in descending order (if verbose is False; if True, prints to
                terminal).

        Raises:
            ValueError: if the `top_Rep` arg is not int or None.
        """
        if top_Rep is not None and not isinstance(top_Rep, int):
            raise ValueError('The top_Rep argument must be an int or None.')

        dup = self[['seq']][self.seq.duplicated(keep=False)]
        dup_count = dup.groupby(self.seq).count()

        dup_count.index.names = ['Entries']
        dup_count.rename(columns={'seq': 'Count'}, inplace=True)

        dup_count.sort_values('Count', ascending=False, inplace=True)

        # self.maps['dup'] = dup_count.index[:top_Rep].values  # array

        if self.verbose:
            print(f'\nFound {len(dup_count)} duplicated entries.\n'
                  f'The entries with the {top_Rep} highest repitition counts are:')
            print(dup_count.head(top_Rep))
        else:
            return dup_count

class Roll_mad(object):
    """Applies the MAD to sequential subsets of the Series, returning another
    Series.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a pandas DataFrame column, with values being
            integers or floats. A `Source` instance is used as it is.
        test: tells which test to use. 1: First Digits; 2: First Two Digits;
            3: First Three Digits; 22: Second Digit; and -2: Last Two Digits.
        window: size of the subset to be used.
        decimals: number of decimal places to consider. Defaults to 2.
            If integers, set to 0. If set to -infer-, it will remove the zeros
            and consider up to the fifth decimal place to the right, but will
            lose performance. Only used when `data` is not a `Source`.
        sign: tells which portion of the data to consider. pos: only the positive
            entries; neg: only negative entries; all: all entries but zeros.
            Defaults to all. Only used when `data` is not a `Source`.

    Attributes:
        test: the test, as returned by `_check_test_`.
        roll_series: the rolling MAD values, with the empty (NaN) windows
            removed.
    """

    def __init__(self, data, test, window, decimals=2, sign='all'):

        #: the test (F1D, SD, F2D...) used for the MAD calculation and critical values
        self.test = _check_test_(test)

        if not isinstance(data, Source):
            data = Source(data, sign=sign, decimals=decimals, verbose=False)

        Exp, ind = prep_to_roll(data, self.test)

        # Look the column up with the checked test, the same value handed to
        # prep_to_roll, so both always refer to the same test.
        self.roll_series = data[DIGS[self.test]].rolling(
                                window=window).apply(mad_to_roll,
                                    args=(Exp, ind), raw=False)
        self.roll_series.dropna(inplace=True)

    def show_plot(self, figsize=(15, 8), save_plot=None, save_plot_kwargs=None):
        """Shows the rolling MAD plot

        Args:
            figsize: the figure dimensions.
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is inferred by the file name extension.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when save_plot is a string with the figure file
                path/name.
        """
        plot_roll_mad(self, figsize=figsize,
                      save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)


class Roll_mse(object):
    """Rolling Mean Square Error of a digits test over a moving window of
    the Series, kept as another Series.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a pandas DataFrame column, with values being
            integers or floats. A `Source` instance is used as it is.
        test: tells which test to use. 1: First Digits; 2: First Two Digits;
            3: First Three Digits; 22: Second Digit; and -2: Last Two Digits.
        window: size of the subset to be used.
        decimals: number of decimal places to consider. Defaults to 2.
            If integers, set to 0. If set to -infer-, it will remove the zeros
            and consider up to the fifth decimal place to the right, but will
            lose performance. Only used when `data` is not a `Source`.
        sign: tells which portion of the data to consider. 'pos': only the positive
            entries; 'neg': only negative entries; 'all': all entries but zeros.
            Defaults to 'all'. Only used when `data` is not a `Source`.

    Attributes:
        roll_series: the rolling MSE values, with the empty (NaN) windows
            removed.
    """

    def __init__(self, data, test, window, decimals=2, sign='all'):

        test = _check_test_(test)

        # Wrap raw input in a (silent) Source; reuse a Source as given
        if isinstance(data, Source):
            source = data
        else:
            source = Source(data, sign=sign, decimals=decimals, verbose=False)

        expected, index = prep_to_roll(source, test)

        windows = source[DIGS[test]].rolling(window=window)
        rolled = windows.apply(mse_to_roll, args=(expected, index),
                               raw=False)
        self.roll_series = rolled.dropna()

    def show_plot(self, figsize=(15, 8), save_plot=None, save_plot_kwargs=None):
        """Draws the plot of the rolling MSE values.

        Args:
            figsize: the figure dimensions.
            save_plot (str): string with the path/name of the file in which the generated
                plot will be saved. Uses matplotlib.pyplot.savefig(). File format
                is inferred by the file name extension.
            save_plot_kwargs (dict): any of the kwargs accepted by
                matplotlib.pyplot.savefig()
                https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
                Only available when save_plot is a string with the figure file
                path/name.
        """
        plot_roll_mse(self.roll_series, figsize=figsize, save_plot=save_plot,
                      save_plot_kwargs=save_plot_kwargs)


def first_digits(data, digs, decimals=2, sign='all', verbose=True,
                 confidence=None, high_Z='pos', limit_N=None,
                 MAD=False, MSE=False, chi_square=False, KS=False,
                 show_plot=True, save_plot=None, save_plot_kwargs=None,
                 inform=None):
    """Runs the Benford First Digits test on the series of numbers provided.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a pandas DataFrame column, with values being
            integers or floats. A `Source` instance is used as it is, in
            which case `decimals`, `sign` and `verbose` are not applied.
        digs (int): number of first digits to consider. Must be 1 (first digit),
            2 (first two digits) or 3 (first three digits).
        decimals: number of decimal places to consider. Defaults to 2.
            If integers, set to 0. If set to -infer-, it will remove the zeros
            and consider up to the fifth decimal place to the right, but will
            lose performance.
        sign: tells which portion of the data to consider. 'pos': only the positive
            entries; 'neg': only negative entries; 'all': all entries but zeros.
            Defaults to 'all'.
        verbose (bool): tells the number of registries that are being subjected to
            the analysis. Defaults to True.
        confidence (int, float): confidence level to draw lower and upper limits when
            plotting and to limit the top deviations to show. Defaults to None.
        high_Z (str, int): chooses which Z scores to be used when displaying results,
            according to the confidence level chosen. Defaults to 'pos',
            which will highlight only values higher than the expected
            frequencies; 'all' will highlight both extremes (positive and
            negative); and an integer, which will use the first n entries,
            positive and negative, regardless of whether Z is higher than
            the confidence or not.
        limit_N (int): sets a limit to N as the sample size for the calculation of
            the Z scores if the sample is too big. Defaults to None.
        MAD (bool): calculates the Mean Absolute Difference between the
            found and the expected distributions; defaults to False.
        MSE (bool): calculates the Mean Square Error of the sample; defaults to
            False.
        chi_square: calculates the chi_square statistic of the sample and
            compares it with a critical value, according to the confidence
            level chosen and the series's degrees of freedom. Defaults to
            False. Requires confidence != None.
        KS: calculates the Kolmogorov-Smirnov test, comparing the cumulative
            distribution of the sample with the Benford's, according to the
            confidence level chosen. Defaults to False. Requires confidence
            != None.
        show_plot (bool): draws the test plot.
        save_plot (str): string with the path/name of the file in which the generated
            plot will be saved. Uses matplotlib.pyplot.savefig(). File format
            is inferred by the file name extension. Only used when
            show_plot=True.
        save_plot_kwargs (dict): any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only used when show_plot=True and save_plot is a string with the
            figure file path/name.
        inform: deprecated counterpart of `verbose`; both are passed to
            `_deprecate_inform_`, whose result is used as the verbosity.

    Returns:
        DataFrame with the Counts, Found and Expected columns. When the
            confidence is not None it also carries the Z_score column and is
            sorted by it, from the highest Z score down.
    """
    verbose = _deprecate_inform_(verbose, inform)

    if isinstance(data, Source):
        source = data
    else:
        source = Source(data, decimals=decimals, sign=sign, verbose=verbose)

    result = source.first_digits(
        digs, confidence=confidence, high_Z=high_Z, limit_N=limit_N,
        MAD=MAD, MSE=MSE, chi_square=chi_square, KS=KS,
        show_plot=show_plot, save_plot=save_plot,
        save_plot_kwargs=save_plot_kwargs, ret_df=True)

    # Without a confidence level there are no Z scores to report or rank by
    if confidence is None:
        return result[['Counts', 'Found', 'Expected']]

    scored = result[['Counts', 'Found', 'Expected', 'Z_score']]
    return scored.sort_values('Z_score', ascending=False)


def second_digit(data, decimals=2, sign='all', verbose=True,
                 confidence=None, high_Z='pos', limit_N=None,
                 MAD=False, MSE=False, chi_square=False, KS=False,
                 show_plot=True, save_plot=None, save_plot_kwargs=None,
                 inform=None):
    """Run the Benford Second Digit test on a sequence of numbers.

    Thin functional wrapper: wraps ``data`` in a ``Source`` (unless it
    already is one), delegates to ``Source.second_digit`` with
    ``ret_df=True`` and returns a column subset of the resulting frame.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a pandas DataFrame column, with values being
            integers or floats. A ``Source`` instance is used as-is, in
            which case ``decimals``, ``sign`` and ``verbose`` are not
            applied again.
        decimals: number of decimal places to consider. Defaults to 2.
            If integers, set to 0. If set to -infer-, it will remove the
            zeros and consider up to the fifth decimal place to the right,
            but will lose performance.
        sign: tells which portion of the data to consider. 'pos': only the
            positive entries; 'neg': only negative entries; 'all': all
            entries but zeros. Defaults to 'all'.
        verbose (bool): forwarded to ``Source``; reports the number of
            records being subjected to the analysis. Defaults to True.
        confidence (int, float): confidence level to draw lower and upper
            limits when plotting and to limit the top deviations to show.
            When not None, the returned frame also carries the Z scores
            and is sorted by them in descending order. Defaults to None.
        high_Z (str, int): chooses which Z scores to be used when
            displaying results, according to the confidence level chosen.
            Defaults to 'pos', which will highlight only values higher
            than the expected frequencies; 'all' will highlight both
            extremes (positive and negative); and an integer, which will
            use the first n entries, positive and negative, regardless of
            whether Z is higher than the confidence or not.
        limit_N (int): sets a limit to N as the sample size for the
            calculation of the Z scores if the sample is too big.
            Defaults to None.
        MAD (bool): calculates the Mean Absolute Difference between the
            found and the expected distributions; defaults to False.
        MSE (bool): calculates the Mean Square Error of the sample;
            defaults to False.
        chi_square (bool): calculates the chi_square statistic of the
            sample and compares it with a critical value, according to the
            confidence level chosen and the series's degrees of freedom.
            Defaults to False. Requires confidence != None.
        KS (bool): calculates the Kolmogorov-Smirnov test, comparing the
            cumulative distribution of the sample with the Benford's,
            according to the confidence level chosen. Defaults to False.
            Requires confidence != None.
        show_plot (bool): draws the test plot.
        save_plot (str): path/name of the file in which the generated plot
            will be saved. Uses matplotlib.pyplot.savefig(). File format
            is inferred from the file name extension. Only available when
            show_plot is True.
        save_plot_kwargs (dict): any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only available when show_plot is True and save_plot is a
            string with the figure file path/name.
        inform: handed, together with ``verbose``, to
            ``_deprecate_inform_``; the value that helper returns is the
            one actually used as ``verbose``.

    Returns:
        DataFrame with the 'Counts', 'Found' and 'Expected' columns, plus
            'Z_score' (sorted from highest to lowest) when ``confidence``
            is not None.
    """
    verbose = _deprecate_inform_(verbose, inform)

    # Reuse an already prepared Source; otherwise build one from raw data.
    source = data if isinstance(data, Source) else Source(
        data, sign=sign, decimals=decimals, verbose=verbose)

    result = source.second_digit(
        confidence=confidence, high_Z=high_Z, limit_N=limit_N,
        MAD=MAD, MSE=MSE, chi_square=chi_square, KS=KS,
        show_plot=show_plot, save_plot=save_plot,
        save_plot_kwargs=save_plot_kwargs, ret_df=True)

    # Without a confidence level only the plain proportions are returned.
    if confidence is None:
        return result[['Counts', 'Found', 'Expected']]

    with_scores = result[['Counts', 'Found', 'Expected', 'Z_score']]
    return with_scores.sort_values('Z_score', ascending=False)


def last_two_digits(data, decimals=2, sign='all', verbose=True,
                    confidence=None, high_Z='pos', limit_N=None,
                    MAD=False, MSE=False, chi_square=False, KS=False,
                    show_plot=True, save_plot=None, save_plot_kwargs=None,
                    inform=None):
    """Run the Last Two Digits test on a sequence of numbers.

    Thin functional wrapper: wraps ``data`` in a ``Source`` (unless it
    already is one), delegates to ``Source.last_two_digits`` with
    ``ret_df=True`` and returns a column subset of the resulting frame.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a pandas DataFrame column, with values being
            integers or floats. A ``Source`` instance is used as-is, in
            which case ``decimals``, ``sign`` and ``verbose`` are not
            applied again.
        decimals: number of decimal places to consider. Defaults to 2.
            If integers, set to 0. If set to -infer-, it will remove the
            zeros and consider up to the fifth decimal place to the right,
            but will lose performance.
        sign: tells which portion of the data to consider. 'pos': only the
            positive entries; 'neg': only negative entries; 'all': all
            entries but zeros. Defaults to 'all'.
        verbose (bool): forwarded to ``Source``; reports the number of
            records being subjected to the analysis. Defaults to True.
        confidence (int, float): confidence level to draw lower and upper
            limits when plotting and to limit the top deviations to show.
            When not None, the returned frame also carries the Z scores
            and is sorted by them in descending order. Defaults to None.
        high_Z (str, int): chooses which Z scores to be used when
            displaying results, according to the confidence level chosen.
            Defaults to 'pos', which will highlight only values higher
            than the expected frequencies; 'all' will highlight both
            extremes (positive and negative); and an integer, which will
            use the first n entries, positive and negative, regardless of
            whether Z is higher than the confidence or not.
        limit_N (int): sets a limit to N as the sample size for the
            calculation of the Z scores if the sample is too big.
            Defaults to None.
        MAD (bool): calculates the Mean Absolute Difference between the
            found and the expected distributions; defaults to False.
        MSE (bool): calculates the Mean Square Error of the sample;
            defaults to False.
        chi_square (bool): calculates the chi_square statistic of the
            sample and compares it with a critical value, according to the
            confidence level chosen and the series's degrees of freedom.
            Defaults to False. Requires confidence != None.
        KS (bool): calculates the Kolmogorov-Smirnov test, comparing the
            cumulative distribution of the sample with the Benford's,
            according to the confidence level chosen. Defaults to False.
            Requires confidence != None.
        show_plot (bool): draws the test plot.
        save_plot (str): path/name of the file in which the generated plot
            will be saved. Uses matplotlib.pyplot.savefig(). File format
            is inferred from the file name extension. Only available when
            show_plot is True.
        save_plot_kwargs (dict): any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only available when show_plot is True and save_plot is a
            string with the figure file path/name.
        inform: handed, together with ``verbose``, to
            ``_deprecate_inform_``; the value that helper returns is the
            one actually used as ``verbose``.

    Returns:
        DataFrame with the 'Counts', 'Found' and 'Expected' columns, plus
            'Z_score' (sorted from highest to lowest) when ``confidence``
            is not None.
    """
    verbose = _deprecate_inform_(verbose, inform)

    # Reuse an already prepared Source; otherwise build one from raw data.
    source = data if isinstance(data, Source) else Source(
        data, decimals=decimals, sign=sign, verbose=verbose)

    result = source.last_two_digits(
        confidence=confidence, high_Z=high_Z, limit_N=limit_N,
        MAD=MAD, MSE=MSE, chi_square=chi_square, KS=KS,
        show_plot=show_plot, save_plot=save_plot,
        save_plot_kwargs=save_plot_kwargs, ret_df=True)

    # Without a confidence level only the plain proportions are returned.
    if confidence is None:
        return result[['Counts', 'Found', 'Expected']]

    with_scores = result[['Counts', 'Found', 'Expected', 'Z_score']]
    return with_scores.sort_values('Z_score', ascending=False)


def mantissas(data, report=True, show_plot=True, arc_test=True,
              save_plot=None, save_plot_kwargs=None, inform=None):
    """Build a ``Mantissas`` object and run its optional outputs.

    Args:
        data: sequence to compute mantissas from: numpy 1D array, pandas
            Series or pandas DataFrame column.
        report: when truthy, calls ``Mantissas.report``, which per the
            original docs prints the mantissas mean, variance, skewness
            and kurtosis for the sequence studied, along with reference
            values. Defaults to True.
        show_plot: when truthy, calls ``Mantissas.show_plot``, which plots
            the ordered mantissas and a line with the expected
            inclination. Defaults to True.
        arc_test: when truthy, calls ``Mantissas.arc_test``, which draws
            the Arc Test plot. Defaults to True.
        save_plot (str): path/name of the file in which the generated plot
            will be saved. Uses matplotlib.pyplot.savefig(). File format
            is inferred from the file name extension. The same value is
            forwarded to every enabled step above.
        save_plot_kwargs (dict): any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Forwarded alongside ``save_plot`` to every enabled step.
        inform: handed, together with ``report``, to
            ``_deprecate_inform_``; the value that helper returns is the
            one actually used as ``report``.

    Returns:
        The ``Mantissas`` instance created from ``data``.
    """
    report = _deprecate_inform_(report, inform)

    mant = Mantissas(data)
    plot_opts = dict(save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)

    # Steps run in this fixed order; methods are looked up only when the
    # corresponding flag is set.
    steps = (
        (report, 'report'),
        (show_plot, 'show_plot'),
        (arc_test, 'arc_test'),
    )
    for enabled, method_name in steps:
        if enabled:
            getattr(mant, method_name)(**plot_opts)
    return mant


def summation(data, digs=2, decimals=2, sign='all', top=20, verbose=True,
              show_plot=True, save_plot=None, save_plot_kwargs=None, inform=None):
    """Run the Summation test. In a Benford series, the sums of the
    entries beginning with the same digits tend to be the same.
    Works only with the First Digits (1, 2 or 3) test.

    Args:
        data: sequence of numbers to be evaluated, or an already built
            ``Source`` instance (used as-is, in which case ``decimals``,
            ``sign`` and ``verbose`` are not applied again).
        digs: tells the first digits to use: 1- first; 2- first two;
            3- first three. Defaults to 2.
        decimals: number of decimal places to consider. Defaults to 2.
            If integers, set to 0. If set to -infer-, it will remove the
            zeros and consider up to the fifth decimal place to the right,
            but will lose performance.
        sign: tells which portion of the data to consider. 'pos': only the
            positive entries; 'neg': only negative entries; 'all': all
            entries but zeros. Defaults to 'all'.
        top: chooses how many top values to show. Defaults to 20.
        verbose (bool): forwarded to ``Source``; when truthy the returned
            frame is also sorted by 'AbsDif' in descending order.
            Defaults to True.
        show_plot: plots the results. Defaults to True.
        save_plot (str): path/name of the file in which the generated plot
            will be saved. Uses matplotlib.pyplot.savefig(). File format
            is inferred from the file name extension. Only available when
            show_plot is True.
        save_plot_kwargs (dict): any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only available when show_plot is True and save_plot is a
            string with the figure file path/name.
        inform: handed, together with ``verbose``, to
            ``_deprecate_inform_``; the value that helper returns is the
            one actually used as ``verbose``.

    Returns:
        DataFrame with the Summation test, sorted by 'AbsDif' in
            descending order if ``verbose`` is truthy, unsorted otherwise.
    """
    verbose = _deprecate_inform_(verbose, inform)

    # Reuse an already prepared Source; otherwise build one from raw data.
    source = data if isinstance(data, Source) else Source(
        data, sign=sign, decimals=decimals, verbose=verbose)

    result = source.summation(
        digs=digs, top=top, show_plot=show_plot, save_plot=save_plot,
        save_plot_kwargs=save_plot_kwargs, ret_df=True)

    if not verbose:
        return result
    return result.sort_values('AbsDif', ascending=False)


def mad(data, test, decimals=2, sign='all', verbose=False):
    """Calculate the Mean Absolute Deviation of the Series.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a pandas DataFrame column, with values being
            integers or floats.
        test: informs which base test to use for the MAD. After
            normalization by ``_check_test_``: 1, 2 or 3 run the First
            Digits test with that many digits; 22 runs the Second Digit
            test; any other accepted value runs the Last Two Digits test.
        decimals: number of decimal places to consider. Defaults to 2.
            If integers, set to 0. If set to -infer-, it will remove the
            zeros and consider up to the fifth decimal place to the right,
            but will lose performance.
        sign: tells which portion of the data to consider. 'pos': only the
            positive entries; 'neg': only negative entries; 'all': all
            entries but zeros. Defaults to 'all'.
        verbose (bool): forwarded to ``Source``. Defaults to False.

    Returns:
        float: the Mean Absolute Deviation of the Series, read from the
            ``MAD`` attribute of the ``Source`` after the test has run.
    """
    data = _check_num_array_(data)
    test = _check_test_(test)
    start = Source(data, sign=sign, decimals=decimals, verbose=verbose)
    # Only the MAD is needed here, so MSE is switched off in every branch
    # (the first-digits branch used to request it needlessly).
    if test in [1, 2, 3]:
        start.first_digits(digs=test, MAD=True, MSE=False, simple=True)
    elif test == 22:
        start.second_digit(MAD=True, MSE=False, simple=True)
    else:
        start.last_two_digits(MAD=True, MSE=False, simple=True)
    return start.MAD


def mse(data, test, decimals=2, sign='all', verbose=False):
    """Calculate the Mean Squared Error of the Series.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a pandas DataFrame column, with values being
            integers or floats.
        test: informs which base test to use for the MSE. After
            normalization by ``_check_test_``: 1, 2 or 3 run the First
            Digits test with that many digits; 22 runs the Second Digit
            test; any other accepted value runs the Last Two Digits test.
        decimals: number of decimal places to consider. Defaults to 2.
            If integers, set to 0. If set to -infer-, it will remove the
            zeros and consider up to the fifth decimal place to the right,
            but will lose performance.
        sign: tells which portion of the data to consider. 'pos': only the
            positive entries; 'neg': only negative entries; 'all': all
            entries but zeros. Defaults to 'all'.
        verbose (bool): forwarded to ``Source``. Defaults to False.

    Returns:
        float: the Mean Squared Error of the Series, read from the ``MSE``
            attribute of the ``Source`` after the test has run.
    """
    checked_data = _check_num_array_(data)
    test = _check_test_(test)
    source = Source(checked_data, sign=sign, decimals=decimals,
                    verbose=verbose)

    # Same switches for whichever digit test ends up being run.
    switches = dict(MAD=False, MSE=True, simple=True)
    if test == 22:
        source.second_digit(**switches)
    elif test in (1, 2, 3):
        source.first_digits(digs=test, **switches)
    else:
        source.last_two_digits(**switches)
    return source.MSE


def bhattacharyya_distance(data, test, decimals=2, sign="all", verbose=False):
    """Compute the Bhattacharyya Distance between the Found and the Expected
    (Benford) digits distributions, according to the test chosen
    (First, Second, First Two...)

    Args:
        data (ndarray, Series): sequence to be evaluated, with values being
            integers or floats.
        test (int, str): informs which base test to be used. After
            normalization by ``_check_test_``: 1, 2 or 3 run the First
            Digits test with that many digits; 22 runs the Second Digit
            test; any other accepted value runs the Last Two Digits test.
        decimals (int): number of decimal places to consider. Defaults to
            2 (the default was documented but missing from the signature;
            it now matches the sibling functions). If integers, set to 0.
            If set to -infer-, it will remove the zeros and consider up to
            the fifth decimal place to the right, but will lose
            performance.
        sign (str, optional): tells which portion of the data to consider.
            pos: only the positive entries; neg: only negative entries; all:
            all entries but zeros. Defaults to "all".
        verbose (bool): forwarded to ``Source``. Defaults to False.

    Returns:
        float: the Bhattacharyya Distance between the distributions, read
            from the ``bhat_dist`` attribute of the ``Source`` after the
            test has run.
    """
    data = _check_num_array_(data)
    test = _check_test_(test)
    start = Source(data, sign=sign, decimals=decimals, verbose=verbose)
    if test in [1, 2, 3]:
        start.first_digits(digs=test, MAD=False, bhat_dist=True, simple=True)
    elif test == 22:
        start.second_digit(MAD=False, bhat_dist=True, simple=True)
    else:
        start.last_two_digits(MAD=False, bhat_dist=True, simple=True)
    return start.bhat_dist


def kullback_leibler_divergence(data, test, decimals=2, sign="all",
                                verbose=False):
    """Compute the Kullback-Leibler Divergence between the Found and the
    Expected (Benford) digits distributions, according to the test chosen
    (First, Second, First Two...).

    Args:
        data (ndarray, Series): sequence to be evaluated, with values being
            integers or floats.
        test (int, str): informs which base test to be used. After
            normalization by ``_check_test_``: 1, 2 or 3 run the First
            Digits test with that many digits; 22 runs the Second Digit
            test; any other accepted value runs the Last Two Digits test.
        decimals (int): number of decimal places to consider. Defaults to
            2 (the default was documented but missing from the signature;
            it now matches the sibling functions). If integers, set to 0.
            If set to -infer-, it will remove the zeros and consider up to
            the fifth decimal place to the right, but will lose
            performance.
        sign (str, optional): tells which portion of the data to consider.
            pos: only the positive entries; neg: only negative entries; all:
            all entries but zeros. Defaults to "all".
        verbose (bool): forwarded to ``Source``. Defaults to False.

    Returns:
        float: the Kullback-Leibler Divergence between the distributions,
            read from the ``kl_diverg`` attribute of the ``Source`` after
            the test has run.
    """
    data = _check_num_array_(data)
    test = _check_test_(test)
    start = Source(data, sign=sign, decimals=decimals, verbose=verbose)
    if test in [1, 2, 3]:
        start.first_digits(digs=test, MAD=False, kl_diverg=True, simple=True)
    elif test == 22:
        start.second_digit(MAD=False, kl_diverg=True, simple=True)
    else:
        start.last_two_digits(MAD=False, kl_diverg=True, simple=True)
    return start.kl_diverg


def mad_summ(data, test, decimals=2, sign='all', verbose=False):
    """Calculate the Mean Absolute Deviation of the Summation Test.

    Records are grouped by their leading ``test`` digits, the ``ZN``
    values of each group are summed, and the share of every group in the
    grand total is compared with the uniform expected share.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a pandas DataFrame column, with values being
            integers or floats.
        test: informs which base test to use for the summation MAD, i.e.
            how many leading digits to group by. Validated by
            ``_check_digs_``.
        decimals: number of decimal places to consider. Defaults to 2.
            If integers, set to 0. If set to -infer-, it will remove the
            zeros and consider up to the fifth decimal place to the right,
            but will lose performance.
        sign: tells which portion of the data to consider. 'pos': only the
            positive entries; 'neg': only negative entries; 'all': all
            entries but zeros. Defaults to 'all'.
        verbose (bool): forwarded to ``Source``. Defaults to False.

    Returns:
        float: the Mean Absolute Deviation of the Summation Test
    """
    data = _check_num_array_(data)
    # NOTE(review): `_check_digs_` is not visible from here. If it only
    # validates (returning None) a plain rebinding would clobber `test`,
    # so the returned value is adopted only when there is one.
    checked = _check_digs_(test)
    if checked is not None:
        test = checked

    start = Source(data, sign=sign, decimals=decimals, verbose=verbose)

    # Keep only records that have at least `test` integer digits. Working
    # on the ZN Series alone avoids assigning a new column onto a slice of
    # the Source (chained-assignment hazard) and avoids summing columns
    # that are never used.
    zn = start.loc[start.ZN >= 10 ** (test - 1), 'ZN']

    # Leading `test` digits of every record: drop the trailing digits by
    # integer-dividing by the matching power of ten.
    leading = (zn // 10 ** (log10(zn).astype(int) - (test - 1))).astype(int)

    # Expected share of each group: uniform over the 9 * 10**(test - 1)
    # possible leading-digit combinations.
    li = 1. / (9 * (10 ** (test - 1)))

    sums = zn.groupby(leading).sum()
    return mean(abs(sums / sums.sum() - li))


def rolling_mad(data, test, window, decimals=2, sign='all',
                show_plot=False, save_plot=None, save_plot_kwargs=None):
    """Apply the MAD to sequential subsets of the records.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a pandas DataFrame column, with values being
            integers or floats.
        test: tells which test to use. 1: First Digits; 2: First Two Digits;
            3: First Three Digits; 22: Second Digit; and -2: Last Two Digits.
        window: size of the subset to be used.
        decimals: number of decimal places to consider. Defaults to 2.
            If integers, set to 0. If set to -infer-, it will remove the
            zeros and consider up to the fifth decimal place to the right,
            but will lose performance.
        sign: tells which portion of the data to consider. 'pos': only the
            positive entries; 'neg': only negative entries; 'all': all
            entries but zeros. Defaults to 'all'.
        show_plot (bool): draws the test plot. Defaults to False.
        save_plot (str): path/name of the file in which the generated plot
            will be saved. Uses matplotlib.pyplot.savefig(). File format
            is inferred from the file name extension. Only used when
            show_plot is True.
        save_plot_kwargs (dict): any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only used when show_plot is True and save_plot is a string
            with the figure file path/name.

    Returns:
        Series with sequentially computed MADs (the ``roll_series``
            attribute of the underlying ``Roll_mad`` object).
    """
    roller = Roll_mad(_check_num_array_(data), test, window, decimals, sign)
    if show_plot:
        plot_opts = dict(save_plot=save_plot,
                         save_plot_kwargs=save_plot_kwargs)
        roller.show_plot(**plot_opts)
    return roller.roll_series


def rolling_mse(data, test, window, decimals=2, sign='all',
                show_plot=False, save_plot=None, save_plot_kwargs=None):
    """Apply the MSE to sequential subsets of the records.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a pandas DataFrame column, with values being
            integers or floats.
        test: tells which test to use. 1: First Digits; 2: First Two Digits;
            3: First Three Digits; 22: Second Digit; and -2: Last Two Digits.
        window: size of the subset to be used.
        decimals: number of decimal places to consider. Defaults to 2.
            If integers, set to 0. If set to -infer-, it will remove the
            zeros and consider up to the fifth decimal place to the right,
            but will lose performance.
        sign: tells which portion of the data to consider. 'pos': only the
            positive entries; 'neg': only negative entries; 'all': all
            entries but zeros. Defaults to 'all'.
        show_plot (bool): draws the test plot. Defaults to False.
        save_plot (str): path/name of the file in which the generated plot
            will be saved. Uses matplotlib.pyplot.savefig(). File format
            is inferred from the file name extension. Only used when
            show_plot is True.
        save_plot_kwargs (dict): any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only used when show_plot is True and save_plot is a string
            with the figure file path/name.

    Returns:
        Series with sequentially computed MSEs (the ``roll_series``
            attribute of the underlying ``Roll_mse`` object).
    """
    roller = Roll_mse(_check_num_array_(data), test, window, decimals, sign)
    if show_plot:
        plot_opts = dict(save_plot=save_plot,
                         save_plot_kwargs=save_plot_kwargs)
        roller.show_plot(**plot_opts)
    return roller.roll_series


def duplicates(data, top_Rep=20, verbose=True, inform=None):
    """Perform a duplicates test and map the duplicates count in descending
    order.

    Args:
        data: sequence to take the duplicates from. pandas Series or
            numpy Ndarray (anything else is passed to the ``Series``
            constructor).
        top_Rep: chooses how many duplicated entries will be shown with
            the top repetitions when ``verbose`` is truthy. int or None.
            Defaults to 20. If None, all the ordered repetitions are
            shown. It only affects what is printed, not what is returned.
        verbose (bool): tells how many duplicated entries were found and
            prints the top numbers according to the top_Rep argument.
            Defaults to True.
        inform: handed, together with ``verbose``, to
            ``_deprecate_inform_``; the value that helper returns is the
            one actually used as ``verbose``.

    Returns:
        Series named 'Count', indexed by the duplicated records
            ('Entries'), holding how many times each one occurs, as
            produced by ``value_counts``.

    Raises:
        ValueError: if the `top_Rep` arg is not int or None, or if `data`
            cannot be converted to a pandas Series.
    """
    verbose = _deprecate_inform_(verbose, inform)

    if top_Rep is not None and not isinstance(top_Rep, int):
        raise ValueError('The top_Rep argument must be an int or None.')

    if not isinstance(data, Series):
        try:
            data = Series(data)
        except ValueError as err:
            # Previously this only printed a hint and carried on, which
            # then failed on `data.loc` with an unrelated error. Fail here
            # with the real reason instead.
            raise ValueError(
                'data must be a numpy Ndarray or a pandas Series.') from err

    # keep=False flags every occurrence of a repeated value, so the counts
    # below are total occurrences rather than "extra" occurrences.
    dup = data.loc[data.duplicated(keep=False)]
    dup_count = dup.value_counts()

    dup_count.index.names = ['Entries']
    dup_count.name = 'Count'

    if verbose:
        if top_Rep is None:
            # head(None) shows everything; avoid printing "the None highest".
            shown = ('All the entries, in descending order of repetition '
                     'count, are:')
        else:
            shown = (f'The entries with the {top_Rep} highest repitition '
                     'counts are:')
        print(f'\nFound {len(dup_count)} duplicated entries.\n{shown}')
        print(dup_count.head(top_Rep))

    return dup_count


def second_order(data, test, decimals=2, sign='all', verbose=True, MAD=False,
                 confidence=None, high_Z='pos', limit_N=None, MSE=False,
                 show_plot=True, save_plot=None, save_plot_kwargs=None, inform=None):
    """Perform the chosen test after subtracting the ordered sequence by
    itself. Hence Second Order.

    A ``Source`` is built with ``sec_order=True`` and the selected digit
    test is run on it. This function does not accept ``chi_square`` or
    ``KS`` arguments and forwards neither to the test.

    Args:
        data: sequence of numbers to be evaluated. Must be a numpy 1D array,
            a pandas Series or a pandas DataFrame column, with values being
            integers or floats.
        test: the test to be performed - 1 or 'F1D': First Digit; 2 or 'F2D':
            First Two Digits; 3 or 'F3D': First three Digits; 22 or 'SD':
            Second Digits; -2 or 'L2D': Last Two Digits.
        decimals: number of decimal places to consider. Defaults to 2.
            If integers, set to 0. If set to -infer-, it will remove the
            zeros and consider up to the fifth decimal place to the right,
            but will lose performance.
        sign: tells which portion of the data to consider. 'pos': only the
            positive entries; 'neg': only negative entries; 'all': all
            entries but zeros. Defaults to 'all'.
        verbose (bool): forwarded to ``Source``; reports the number of
            records being subjected to the analysis. Defaults to True.
        MAD (bool): calculates the Mean Absolute Difference between the
            found and the expected distributions; defaults to False.
        confidence (int, float): confidence level to draw lower and upper
            limits when plotting and to limit the top deviations to show.
            Defaults to None.
        high_Z (str, int): chooses which Z scores to be used when
            displaying results, according to the confidence level chosen.
            Defaults to 'pos', which will highlight only values higher
            than the expected frequencies; 'all' will highlight both
            extremes (positive and negative); and an integer, which will
            use the first n entries, positive and negative, regardless of
            whether Z is higher than the confidence or not.
        limit_N (int): sets a limit to N as the sample size for the
            calculation of the Z scores if the sample is too big.
            Defaults to None.
        MSE (bool): calculates the Mean Square Error of the sample;
            defaults to False.
        show_plot (bool): draws the test plot.
        save_plot (str): path/name of the file in which the generated plot
            will be saved. Uses matplotlib.pyplot.savefig(). File format
            is inferred from the file name extension. Only available when
            show_plot is True.
        save_plot_kwargs (dict): any of the kwargs accepted by
            matplotlib.pyplot.savefig()
            https://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html
            Only available when show_plot is True and save_plot is a
            string with the figure file path/name.
        inform: handed, together with ``verbose``, to
            ``_deprecate_inform_``; the value that helper returns is the
            one actually used as ``verbose``.

    Returns:
        The ``Source`` object built from the Second Order pre-processed
            data, after the chosen test has been run on it. The value
            returned by the test method itself is discarded.
    """
    test = _check_test_(test)

    verbose = _deprecate_inform_(verbose, inform)

    source = Source(data, decimals=decimals, sign=sign,
                    sec_order=True, verbose=verbose)

    # Options common to the three digit tests.
    shared = dict(MAD=MAD, confidence=confidence, high_Z=high_Z,
                  limit_N=limit_N, MSE=MSE, show_plot=show_plot,
                  save_plot=save_plot, save_plot_kwargs=save_plot_kwargs)

    if test == 22:
        source.second_digit(**shared)
    elif test in (1, 2, 3):
        source.first_digits(digs=test, **shared)
    else:
        source.last_two_digits(**shared)
    return source