# AMBHAS — stats.py (documentation listing of this file)
00001 # -*- coding: utf-8 -*-
00002 """
00003 Created on Thu Dec 29 15:24:08 2011
00004 
00005 @author: Sat Kumar Tomer
00006 @website: www.ambhas.com
00007 @email: satkumartomer@gmail.com
00008 """
00009 
00010 from __future__ import division
00011 import numpy as np
00012 import statistics as st
00013 from scipy.interpolate import interp1d
00014 from scipy.stats import norm, chi2
00015 from scipy.stats import scoreatpercentile
00016 
def bias_correction(oc, mc, mp):
    """
    Quantile-mapping bias correction.

    Input:
        oc: observed current
        mc: modeled current
        mp: modeled prediction

    Output:
        mp_adjusted: adjusted modeled prediction
    """
    # work with one-dimensional views of each series
    obs_cur = oc.flatten()
    mod_cur = mc.flatten()
    mod_pred = mp.flatten()

    # Instead of directly inverting the CDF, linear interpolation using
    # interp1d is used to invert it: map cumulative probability -> value.
    cdf_vals, obs_vals = st.cpdf(obs_cur, n=1000)
    inv_obs_cdf = interp1d(cdf_vals, obs_vals)

    # non-exceedance probabilities of the predictions under the
    # modeled-current distribution, pushed through the observed inverse CDF
    pred_probs = st.cpdf(mod_cur, mod_pred)
    mp_adjusted = inv_obs_cdf(pred_probs)

    return mp_adjusted
00045 
00046 
def mk_test(x, alpha = 0.05):
    """
    this perform the MK (Mann-Kendall) test to check if there is any trend present in 
    data or not
    
    Input:
        x:   a vector of data
        alpha: significance level
    
    Output:
        trend: tells the trend (increasing, decreasing or no trend)
        h: True (if trend is present) or False (if trend is absence)
        p: p value of the significance test
        z: normalized test statistics 
        
    Examples
    --------
      >>> x = np.random.rand(100)
      >>> trend,h,p,z = mk_test(x,0.05) 
    """
    n = len(x)
    
    # calculate S: sum of signs of all later-minus-earlier pairwise differences
    s = 0
    for k in range(n-1):          # range (not xrange) so this runs on Python 3 too
        for j in range(k+1, n):
            s += np.sign(x[j] - x[k])
    
    # calculate the unique data
    unique_x = np.unique(x)
    g = len(unique_x)
    
    # calculate the var(s)
    if n == g: # there is no tie
        var_s = (n*(n-1)*(2*n+5))/18
    else: # there are some ties in data
        tp = np.zeros(unique_x.shape)
        for i in range(len(unique_x)):
            tp[i] = sum(unique_x[i] == x)
        # BUG FIX: the tie-correction term is SUBTRACTED in the Mann-Kendall
        # variance formula: Var(S) = [n(n-1)(2n+5) - sum t(t-1)(2t+5)] / 18
        var_s = (n*(n-1)*(2*n+5) - np.sum(tp*(tp-1)*(2*tp+5)))/18
    
    # standardized statistic with continuity correction; the s == 0 case
    # avoids a division by zero when var_s is 0 (all values tied)
    if s > 0:
        z = (s - 1)/np.sqrt(var_s)
    elif s < 0:
        z = (s + 1)/np.sqrt(var_s)
    else:
        z = 0
    
    # calculate the p_value
    p = 2*(1-norm.cdf(abs(z))) # two tail test
    h = abs(z) > norm.ppf(1-alpha/2) 
    
    if (z<0) and h:
        trend = 'decreasing'
    elif (z>0) and h:
        trend = 'increasing'
    else:
        trend = 'no trend'
        
    return trend, h, p, z
00107 
def independant(x,y, alpha = 0.05):
    """
    this program calculates check if the joint cdf == multiplication of marginal
    distribution or not 
    using the chi-squared test 
        
    Input:
        x:   a vector of data
        y:   a vector of data
        alpha: significance level
    
    Output:
        ind: True (if independant) False (if dependant)
        p: p value of the significance test
        
    Examples
    --------
      >>> x = np.random.rand(100)
      >>> y = np.random.rand(100)
      >>> ind,p = independant(x,y,0.05)  
    """
    
    # calculate the 2D histogram 
    H, xedges, yedges = np.histogram2d(x, y, bins=5)
    
    # expected counts under independence: outer product of the row and
    # column marginals divided by the grand total
    expected_values = np.outer(H.sum(axis=1), H.sum(axis=0))/H.sum()
    
    # calculate the chi-squared statistics
    err_chi2 = ((H-expected_values)**2/expected_values).sum()
    
    # degree of freedom
    dof = (H.shape[0]-1)*(H.shape[1]-1)
    
    # calculate the p_value
    # BUG FIX: the chi-squared test of independence is one-tailed (upper
    # tail); the previous 2*(1-sf(stat)) = 2*cdf(stat) approached 2 for
    # strongly dependent data and so always reported independence
    rv = chi2(dof)
    p = rv.sf(err_chi2)
    
    # test 
    ind = p >= alpha        
        
    return ind, p
00153 
00154 
class SpatOutlier():
    """
    this class identify the outliers from the given spatial data of point values
    using a robust (median / inter-quartile range) index per time step
    """
    
    def __init__(self, rain):
        """
        Input:
            rain:   rain at different spatial locations and time
            time ==> is defined in the first dimension
            space ==> is defined in the second dimension
        """
        # check for the number of dimension
        if rain.ndim > 2:
            raise ValueError('The dimension of the input should be less than or equal to 2 (two)')
        elif rain.ndim == 1:
            rain.shape = (1,-1)
        self.rain = rain
            
    def _identify_outlier(self, threshold=2.0):
        """
        Flag values whose robust index |value - median| / IQR (computed
        per time step, i.e. per row) is >= threshold.
        
        Input:
            threshold: threshold above which the data will be termed as outlier
        """
        rain = self.rain
        # per-row quartiles as column vectors; np.percentile with linear
        # interpolation matches the deprecated scipy scoreatpercentile,
        # and broadcasting replaces the transpose-and-tile construction
        q_25 = np.percentile(rain, 25, axis=1)[:, None]
        q_50 = np.percentile(rain, 50, axis=1)[:, None]
        q_75 = np.percentile(rain, 75, axis=1)[:, None]
        
        # robust standardized distance from the per-row median
        index = np.abs(rain - q_50)/(q_75 - q_25)
        self.index = index
        
        self.outliers = index >= threshold
    
    def fill_with_nan(self, threshold=2.0):
        """
        this method fills the outliers with the nan
        
        Input:
            threshold: outlier-index threshold (default 2.0, as before)
        Output:
            rain_filled:    rain filled with nan where outliers were present
        """
        self._identify_outlier(threshold)
        
        # BUG FIX: work on a float copy instead of writing NaN into the
        # stored array (and hence the caller's array) in place
        rain_filled = np.array(self.rain, dtype=float)
        rain_filled[self.outliers] = np.nan
        return rain_filled
00205 
if __name__ == "__main__":
    # demo: quantile-mapping bias correction on synthetic normal data
    oc = np.random.randn(100)
    mc = 2+np.random.randn(100)
    mp = 2+np.random.randn(1000)
    
    print("mean of observed current is %f"%oc.mean())
    print("mean of modeled current is %f"%mc.mean())
    print("mean of modeled prediction is %f"%mp.mean())
     
    mp_adjusted = bias_correction(oc, mc, mp)
    print("mean of adjusted modeled prediction is %f"%mp_adjusted.mean())
    
    # check the SpatOutlier class
    x = np.random.randn(5,20)
    x[4,4] = 2.9
    foo = SpatOutlier(x)
    x1 = foo.fill_with_nan()
    # BUG FIX: Python 2 print statement was a syntax error under Python 3
    # and inconsistent with the print() calls above
    print(x1[4, 4])
00225     
# All Classes · Namespaces · Files · Functions · Variables