Source code for fancytools.stats.findBestDistribution

import numpy as np
import scipy.stats as st
import warnings
#idea taken from http://stackoverflow.com/questions/21623717/fit-data-to-all-possible-distributions-and-return-the-best-fit


[docs]def findBestDistribution(data, disttype='continuous'):
    '''
    find best statistical distribution,see:
    http://docs.scipy.org/doc/scipy-0.14.0/reference/stats.html

    currently only continuous distributions embedded

    returns scipy.stats[DIST], fitparams
    '''

    if disttype != 'continuous':
        raise NotImplemented()

    distributions = [st.alpha, st.anglit, st.arcsine,
                     st.beta, st.betaprime, st.bradford,
                     st.burr, st.cauchy,st.chi,
                     st.chi2, st.cosine, st.dgamma,
                     st.dweibull, st.erlang, st.expon,
                     st.exponweib,st.exponpow, st.f,
                     st.fatiguelife, 
                    st.fisk, st.foldcauchy , st.foldnorm ,
                    st.frechet_r  ,st.frechet_l, st.genlogistic, st.genpareto,
                    st.genexpon, st.genextreme, st.gausshyper,
                    st.gamma,st.gengamma, st.genhalflogistic,
                    st.gilbrat,st.gompertz, st.gumbel_r,
                    st.gumbel_l ,st.halfcauchy ,st.halflogistic,
                    st.halfnorm , st.hypsecant, st.invgamma,
                    st.invgauss,   st.invweibull,  st.johnsonsb,
                    st.johnsonsu  ,  st.ksone,   st.kstwobign,
                    st.laplace,   st.logistic,    st.loggamma,
                    st.loglaplace  , st.lognorm , st.lomax,
                    st.maxwell, st.mielke, st.nakagami,
                    st.ncx2, st.ncf, st.nct,
                    st.norm, st.pareto, st.pearson3,
                    st.powerlaw , st.powerlognorm, 
                    st.powernorm, st.rdist, st.reciprocal, 
                    st.rayleigh,st.rice , st.recipinvgauss, 
                    st.semicircular, st.t, st.triang,
                    st.truncexpon, st.truncnorm, st.tukeylambda,
                    st.uniform, st.vonmises,  st.wald,
                    st.weibull_min, st.weibull_max, st.wrapcauchy]

    mles = []
    pars_l = []
    valid_dist = []

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')

        for distribution in distributions:
            pars = distribution.fit(data)
            #maximum-likelihood estimation:
            mle = distribution.nnlf(pars, data)
            if np.isfinite(mle):
                valid_dist.append(distribution)
                mles.append(abs(mle))
                pars_l.append(pars)

    best_fit = sorted(zip(valid_dist, mles, pars_l), key=lambda d: d[1])[0]
    print 'Best fit reached using {}, MLE value: {}'.format(best_fit[0].name, best_fit[1])
    print pars_l
    return best_fit[0], best_fit[2]



if __name__ == '__main__':
    import sys
    import pylab as plt
    # generate a random sample using a given distribution:
    n = 1000
    nn = 100
    # choose a distribution:
    data = st.norm.rvs(size=n)
    # find best dist:
    dist, param = findBestDistribution(data)

    if 'no_window' not in sys.argv:
        plt.hist(data, normed=True, histtype='stepfilled', alpha=0.2)
        x = np.linspace(data.min(), data.max(), nn)
        plt.plot(x, dist.pdf(x, *param))
        plt.show()