1
2 """module for performing statistical calculations.
3
4 (c) 2007-2012 Matt Hilton
5
6 (c) 2013 Matt Hilton & Steven Boada
7
8 U{http://astlib.sourceforge.net}
9
10 This module (as you may notice) provides very few statistical routines. It does, however, provide
11 biweight (robust) estimators of location and scale, as described in Beers et al. 1990 (AJ, 100,
12 32), in addition to a robust least squares fitting routine that uses the biweight transform.
13
14 Some routines may fail if they are passed lists with few items and encounter a `divide by zero'
15 error. Where this occurs, the function will return None. An error message will be printed to the
16 console when this happens if astStats.REPORT_ERRORS=True (the default). Testing if an
17 astStats function returns None can be used to handle errors in scripts.
18
19 For extensive statistics modules, the Python bindings for GNU R (U{http://rpy.sourceforge.net}), or
20 SciPy (U{http://www.scipy.org}) are suggested.
21
22 """
23
24 import math
25 import numpy
26 import sys
27
28 REPORT_ERRORS=True
29
30
32 """Calculates the mean average of a list of numbers.
33
34 @type dataList: list
35 @param dataList: input data, must be a one dimensional list
36 @rtype: float
37 @return: mean average
38
39 """
40 sum=0
41 for item in dataList:
42 sum=sum+float(item)
43 if len(dataList)>0:
44 mean=sum/float(len(dataList))
45 else:
46 mean=0
47 return mean
48
49
51 """Calculates the weighted mean average of a two dimensional list (value, weight) of
52 numbers.
53
54 @type dataList: list
55 @param dataList: input data, must be a two dimensional list in format [value, weight]
56 @rtype: float
57 @return: weighted mean average
58
59 """
60 sum=0
61 weightSum=0
62 for item in dataList:
63 sum=sum+float(item[0]*item[1])
64 weightSum=weightSum+item[1]
65 if len(dataList)>0:
66 mean=sum/weightSum
67 else:
68 mean=0
69 return mean
70
71
73 """Calculates the (sample) standard deviation of a list of numbers.
74
75 @type dataList: list
76 @param dataList: input data, must be a one dimensional list
77 @rtype: float
78 @return: standard deviation
79
80 """
81 listMean=mean(dataList)
82 sum=0
83 for item in dataList:
84 sum=sum+(float(item-listMean)*float(item-listMean))
85 if len(dataList)>0:
86 stdev=math.sqrt(sum/(float(len(dataList))-1))
87 else:
88 stdev=0
89 return stdev
90
91
93 """Calculates the root mean square of a list of numbers.
94
95 @type dataList: list
96 @param dataList: input data, must be a one dimensional list
97 @rtype: float
98 @return: root mean square
99
100 """
101 dataListSq=[]
102 for item in dataList:
103 dataListSq.append(item*item)
104 listMeanSq=mean(dataListSq)
105 rms=math.sqrt(listMeanSq)
106
107 return rms
108
109
111 """Calculates the weighted (sample) standard deviation of a list of numbers.
112
113 @type dataList: list
114 @param dataList: input data, must be a two dimensional list in format [value, weight]
115 @rtype: float
116 @return: weighted standard deviation
117
118 @note: Returns None if an error occurs.
119
120 """
121 listMean=weightedMean(dataList)
122 sum=0
123 wSum=0
124 wNonZero=0
125 for item in dataList:
126 if item[1]>0.0:
127 sum=sum+float((item[0]-listMean)/item[1])*float((item[0]-listMean)/item[1])
128 wSum=wSum+float(1.0/item[1])*float(1.0/item[1])
129
130 if len(dataList)>1:
131 nFactor=float(len(dataList))/float(len(dataList)-1)
132 stdev=math.sqrt(nFactor*(sum/wSum))
133 else:
134 if REPORT_ERRORS==True:
135 print("""ERROR: astStats.weightedStdev() : dataList contains < 2 items.""")
136 stdev=None
137 return stdev
138
139
168
169
171 """Returns an estimate of the mode of a set of values by mode=(3*median)-(2*mean).
172
173 @type dataList: list
174 @param dataList: input data, must be a one dimensional list
175 @rtype: float
176 @return: estimate of mode average
177
178 """
179 mode=(3*median(dataList))-(2*mean(dataList))
180
181 return mode
182
183
185 """Calculates the Median Absolute Deviation of a list of numbers.
186
187 @type dataList: list
188 @param dataList: input data, must be a one dimensional list
189 @rtype: float
190 @return: median absolute deviation
191
192 """
193 listMedian=median(dataList)
194
195
196 diffModuli=[]
197 for item in dataList:
198 diffModuli.append(math.fabs(item-listMedian))
199 diffModuli.sort()
200
201 midValue=float(len(diffModuli)/2.0)
202 fractPart=math.modf(midValue)[0]
203
204 if fractPart==0.5:
205 midValue=math.ceil(midValue)
206
207
208 if midValue<len(diffModuli)-1:
209 MAD=diffModuli[int(midValue)]
210
211 if fractPart!=0.5:
212 prevItem=diffModuli[int(midValue)-1]
213 MAD=(MAD+prevItem)/2.0
214
215 else:
216 MAD=diffModuli[0]
217
218 return MAD
219
220
222 """Calculates the biweight location estimator (like a robust average) of a list of
223 numbers.
224
225 @type dataList: list
226 @param dataList: input data, must be a one dimensional list
227 @type tuningConstant: float
228 @param tuningConstant: 6.0 is recommended.
229 @rtype: float
230 @return: biweight location
231
232 @note: Returns None if an error occurs.
233
234 """
235 C=tuningConstant
236 listMedian=median(dataList)
237 listMAD=MAD(dataList)
238 if listMAD!=0:
239 uValues=[]
240 for item in dataList:
241 uValues.append((item-listMedian)/(C*listMAD))
242
243 top=0
244 bottom=0
245 for i in range(len(uValues)):
246 if math.fabs(uValues[i])<=1.0:
247 top=top+((dataList[i]-listMedian) \
248 *(1.0-(uValues[i]*uValues[i])) \
249 *(1.0-(uValues[i]*uValues[i])))
250
251 bottom=bottom+((1.0-(uValues[i]*uValues[i])) \
252 *(1.0-(uValues[i]*uValues[i])))
253
254 CBI=listMedian+(top/bottom)
255
256 else:
257 if REPORT_ERRORS==True:
258 print("""ERROR: astStats: biweightLocation() : MAD() returned 0.""")
259 return None
260
261 return CBI
262
263
265 """Calculates the biweight scale estimator (like a robust standard deviation) of a list
266 of numbers.
267
268 @type dataList: list
269 @param dataList: input data, must be a one dimensional list
270 @type tuningConstant: float
271 @param tuningConstant: 9.0 is recommended.
272 @rtype: float
273 @return: biweight scale
274
275 @note: Returns None if an error occurs.
276
277 """
278 C=tuningConstant
279
280
281 listMedian=median(dataList)
282 listMAD=MAD(dataList)
283 diffModuli=[]
284 for item in dataList:
285 diffModuli.append(math.fabs(item-listMedian))
286 uValues=[]
287 for item in dataList:
288 try:
289 uValues.append((item-listMedian)/(C*listMAD))
290 except ZeroDivisionError:
291 if REPORT_ERRORS==True:
292 print("""ERROR: astStats.biweightScale() : divide by zero error.""")
293 return None
294
295 top=0
296 bottom=0
297 valCount=0
298
299 for i in range(len(uValues)):
300
301 if math.fabs(uValues[i])<=1.0:
302 u2Term=1.0-(uValues[i]*uValues[i])
303 u4Term=math.pow(u2Term, 4)
304 top=top+((diffModuli[i]*diffModuli[i])*u4Term)
305 bottom=bottom+(u2Term*(1.0-(5.0*(uValues[i]*uValues[i]))))
306 valCount=valCount+1
307
308 top=math.sqrt(top)
309 bottom=math.fabs(bottom)
310
311 SBI=math.pow(float(valCount), 0.5)*(top/bottom)
312 return SBI
313
314
316 """Iteratively calculates biweight location and scale, using sigma clipping, for a list
317 of values. The calculation is performed on the first column of a multi-dimensional
318 list; other columns are ignored.
319
320 @type dataList: list
321 @param dataList: input data
322 @type tuningConstant: float
323 @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is recommended for
324 scale estimates
325 @type sigmaCut: float
326 @param sigmaCut: sigma clipping to apply
327 @rtype: dictionary
328 @return: estimate of biweight location, scale, and list of non-clipped data, in the format
329 {'biweightLocation', 'biweightScale', 'dataList'}
330
331 @note: Returns None if an error occurs.
332
333 """
334
335 iterations=0
336 clippedValues=[]
337 for row in dataList:
338 if type(row)==list:
339 clippedValues.append(row[0])
340 else:
341 clippedValues.append(row)
342
343 while iterations<11 and len(clippedValues)>5:
344
345 cbi=biweightLocation(clippedValues, tuningConstant)
346 sbi=biweightScale(clippedValues, tuningConstant)
347
348
349
350
351 if cbi==None or sbi==None:
352
353 if REPORT_ERRORS==True:
354 print("""ERROR: astStats : biweightClipped() :
355 divide by zero error.""")
356
357 return None
358
359 else:
360
361 clippedValues=[]
362 clippedData=[]
363 for row in dataList:
364 if type(row)==list:
365 if row[0]>cbi-(sigmaCut*sbi) \
366 and row[0]<cbi+(sigmaCut*sbi):
367 clippedValues.append(row[0])
368 clippedData.append(row)
369 else:
370 if row>cbi-(sigmaCut*sbi) \
371 and row<cbi+(sigmaCut*sbi):
372 clippedValues.append(row)
373 clippedData.append(row)
374
375 iterations=iterations+1
376
377 return { 'biweightLocation':cbi ,
378 'biweightScale':sbi,
379 'dataList':clippedData}
380
381
410
411
413 """Performs an ordinary least squares fit on a two dimensional list of numbers.
414 Minimum number of data points is 5.
415
416 @type dataList: list
417 @param dataList: input data, must be a two dimensional list in format [x, y]
418 @rtype: dictionary
419 @return: slope and intercept on y-axis, with associated errors, in the format
420 {'slope', 'intercept', 'slopeError', 'interceptError'}
421
422 @note: Returns None if an error occurs.
423
424 """
425 sumX=0
426 sumY=0
427 sumXY=0
428 sumXX=0
429 n=float(len(dataList))
430 if n > 2:
431 for item in dataList:
432 sumX=sumX+item[0]
433 sumY=sumY+item[1]
434 sumXY=sumXY+(item[0]*item[1])
435 sumXX=sumXX+(item[0]*item[0])
436 m=((n*sumXY)-(sumX*sumY))/((n*sumXX)-(sumX*sumX))
437 c=((sumXX*sumY)-(sumX*sumXY))/((n*sumXX)-(sumX*sumX))
438
439 sumRes=0
440 for item in dataList:
441
442 sumRes=sumRes+((item[1]-(m*item[0])-c) \
443 *(item[1]-(m*item[0])-c))
444
445 sigma=math.sqrt((1.0/(n-2))*sumRes)
446
447 try:
448 mSigma=(sigma*math.sqrt(n))/math.sqrt((n*sumXX)-(sumX*sumX))
449 except:
450 mSigma=numpy.nan
451 try:
452 cSigma=(sigma*math.sqrt(sumXX))/math.sqrt((n*sumXX)-(sumX*sumX))
453 except:
454 cSigma=numpy.nan
455 else:
456 if REPORT_ERRORS==True:
457 print("""ERROR: astStats.OLSFit() : dataList contains < 3 items.""")
458
459 return None
460
461 return {'slope':m,
462 'intercept':c,
463 'slopeError':mSigma,
464 'interceptError':cSigma}
465
466
468 """Calculates the clipped mean and stdev of a list of numbers.
469
470 @type dataList: list
471 @param dataList: input data, one dimensional list of numbers
472 @type sigmaCut: float
473 @param sigmaCut: clipping in Gaussian sigma to apply
474 @type maxIterations: int
475 @param maxIterations: maximum number of iterations
476 @rtype: dictionary
477 @return: format {'clippedMean', 'clippedStdev', 'numPoints'}
478
479 """
480
481 listCopy=[]
482 for d in dataList:
483 listCopy.append(d)
484 listCopy=numpy.array(listCopy)
485
486 iterations=0
487 while iterations < maxIterations and len(listCopy) > 4:
488
489 m=listCopy.mean()
490 s=listCopy.std()
491
492 listCopy=listCopy[numpy.less(abs(listCopy), abs(m+sigmaCut*s))]
493
494 iterations=iterations+1
495
496 return {'clippedMean': m, 'clippedStdev': s, 'numPoints': listCopy.shape[0]}
497
498
500 """Performs a weighted least squares fit on a list of numbers with sigma clipping. Minimum number of data
501 points is 5.
502
503 @type dataList: list
504 @param dataList: input data, must be a three dimensional list in format [x, y, y weight]
505 @rtype: dictionary
506 @return: slope and intercept on y-axis, with associated errors, in the format
507 {'slope', 'intercept', 'slopeError', 'interceptError'}
508
509 @note: Returns None if an error occurs.
510
511 """
512
513 iterations=0
514 clippedValues=[]
515 for row in dataList:
516 clippedValues.append(row)
517
518 while iterations<11 and len(clippedValues)>4:
519
520 fitResults=weightedLSFit(clippedValues, "errors")
521
522 if fitResults['slope'] == None:
523
524 if REPORT_ERRORS==True:
525 print("""ERROR: astStats : clippedWeightedLSFit() :
526 divide by zero error.""")
527
528 return None
529
530 else:
531
532 clippedValues=[]
533 for row in dataList:
534
535
536 fit=fitResults['slope']*row[0]+fitResults['intercept']
537 res=row[1]-fit
538 if abs(res)/row[2] < sigmaCut:
539 clippedValues.append(row)
540
541 iterations=iterations+1
542
543
544 fitResults['numDataPoints']=len(clippedValues)
545
546 return fitResults
547
548
550 """Performs a weighted least squares fit on a three dimensional list of numbers [x, y, y error].
551
552 @type dataList: list
553 @param dataList: input data, must be a three dimensional list in format [x, y, y error]
554 @type weightType: string
555 @param weightType: if "errors", weights are calculated assuming the input data is in the
556 format [x, y, error on y]; if "weights", the weights are assumed to be already calculated and
557 stored in a fourth column [x, y, error on y, weight] (as used by e.g. L{astStats.biweightLSFit})
558 @rtype: dictionary
559 @return: slope and intercept on y-axis, with associated errors, in the format
560 {'slope', 'intercept', 'slopeError', 'interceptError'}
561
562 @note: Returns None if an error occurs.
563
564 """
565 if weightType == "weights":
566 sumW=0
567 sumWX=0
568 sumWY=0
569 sumWXY=0
570 sumWXX=0
571 n=float(len(dataList))
572 if n > 4:
573 for item in dataList:
574 W=item[3]
575 sumWX=sumWX+(W*item[0])
576 sumWY=sumWY+(W*item[1])
577 sumWXY=sumWXY+(W*item[0]*item[1])
578 sumWXX=sumWXX+(W*item[0]*item[0])
579 sumW=sumW+W
580
581
582 try:
583 m=((sumW*sumWXY)-(sumWX*sumWY)) \
584 /((sumW*sumWXX)-(sumWX*sumWX))
585 except ZeroDivisionError:
586 if REPORT_ERRORS == True:
587 print("ERROR: astStats.weightedLSFit() : divide by zero error.")
588 return None
589
590 try:
591 c=((sumWXX*sumWY)-(sumWX*sumWXY)) \
592 /((sumW*sumWXX)-(sumWX*sumWX))
593 except ZeroDivisionError:
594 if REPORT_ERRORS == True:
595 print("ERROR: astStats.weightedLSFit() : divide by zero error.")
596 return None
597
598 sumRes=0
599 for item in dataList:
600
601 sumRes=sumRes+((item[1]-(m*item[0])-c) \
602 *(item[1]-(m*item[0])-c))
603
604 sigma=math.sqrt((1.0/(n-2))*sumRes)
605
606
607
608 if (n*sumWXX)-(sumWX*sumWX)>0.0:
609
610 mSigma=(sigma*math.sqrt(n)) \
611 /math.sqrt((n*sumWXX)-(sumWX*sumWX))
612
613 cSigma=(sigma*math.sqrt(sumWXX)) \
614 /math.sqrt((n*sumWXX)-(sumWX*sumWX))
615
616 else:
617
618 if REPORT_ERRORS==True:
619 print("""ERROR: astStats.weightedLSFit()
620 : divide by zero error.""")
621 return None
622
623 else:
624 if REPORT_ERRORS==True:
625 print("""ERROR: astStats.weightedLSFit() :
626 dataList contains < 5 items.""")
627 return None
628
629 elif weightType == "errors":
630 sumX=0
631 sumY=0
632 sumXY=0
633 sumXX=0
634 sumSigma=0
635 n=float(len(dataList))
636 for item in dataList:
637 sumX=sumX+(item[0]/(item[2]*item[2]))
638 sumY=sumY+(item[1]/(item[2]*item[2]))
639 sumXY=sumXY+((item[0]*item[1])/(item[2]*item[2]))
640 sumXX=sumXX+((item[0]*item[0])/(item[2]*item[2]))
641 sumSigma=sumSigma+(1.0/(item[2]*item[2]))
642 delta=(sumSigma*sumXX)-(sumX*sumX)
643 m=((sumSigma*sumXY)-(sumX*sumY))/delta
644 c=((sumXX*sumY)-(sumX*sumXY))/delta
645 mSigma=math.sqrt(sumSigma/delta)
646 cSigma=math.sqrt(sumXX/delta)
647
648 return {'slope':m,
649 'intercept':c,
650 'slopeError':mSigma,
651 'interceptError':cSigma}
652
653
655 """Performs a weighted least squares fit, where the weights used are the biweight
656 transforms of the residuals to the previous best fit .i.e. the procedure is iterative,
657 and converges very quickly (iterations is set to 10 by default). Minimum number of data
658 points is 10.
659
660 This seems to give slightly different results to the equivalent R routine, so use at your
661 own risk!
662
663 @type dataList: list
664 @param dataList: input data, must be a three dimensional list in format [x, y, y weight]
665 @type tuningConstant: float
666 @param tuningConstant: 6.0 is recommended for location estimates, 9.0 is recommended for
667 scale estimates
668 @type sigmaCut: float
669 @param sigmaCut: sigma clipping to apply (set to None if not required)
670 @rtype: dictionary
671 @return: slope and intercept on y-axis, with associated errors, in the format
672 {'slope', 'intercept', 'slopeError', 'interceptError'}
673
674 @note: Returns None if an error occurs.
675
676 """
677
678 dataCopy=[]
679 for row in dataList:
680 dataCopy.append(row)
681
682
683 results=OLSFit(dataCopy)
684 origLen=len(dataCopy)
685 for k in range(10):
686 m=results['slope']
687 c=results['intercept']
688 res=[]
689 for item in dataCopy:
690 res.append((m*item[0]+c)-item[1])
691
692 if len(res)>5:
693
694
695 if sigmaCut != None:
696 absRes=[]
697 for item in res:
698 absRes.append(abs(item))
699 sigma=stdev(absRes)
700 count=0
701 for item in absRes:
702 if item>(sigmaCut*sigma) \
703 and len(dataCopy)>2:
704 del dataCopy[count]
705 del res[count]
706
707
708
709
710 count=count-1
711
712 count=count+1
713
714
715 weights=biweightTransform(res, tuningConstant)
716
717
718
719 wData=[]
720 for i in range(len(dataCopy)):
721 wData.append([dataCopy[i][0], dataCopy[i][1], dataCopy[i][2], weights[i][1]])
722
723 results=weightedLSFit(wData, "weights")
724
725 return results
726
727
729 """Bins the input data cumulatively.
730
731 @param data: input data, must be a one dimensional list
732 @type binMin: float
733 @param binMin: minimum value from which to bin data
734 @type binMax: float
735 @param binMax: maximum value from which to bin data
736 @type binTotal: int
737 @param binTotal: number of bins
738 @rtype: list
739 @return: binned data, in format [bin centre, frequency]
740
741 """
742
743 binStep=float(binMax-binMin)/binTotal
744 bins=[]
745 totalItems=len(data)
746 for i in range(binTotal):
747 bins.append(0)
748 for item in data:
749 if item>(binMin+(i*binStep)):
750 bins[i]=bins[i]+1.0/totalItems
751
752
753 coords=[]
754 for i in range(binTotal):
755 coords.append([binMin+(float(i+0.5)*binStep), bins[i]])
756
757 return coords
758
759
760 -def binner(data, binMin, binMax, binTotal):
761 """Bins the input data..
762
763 @param data: input data, must be a one dimensional list
764 @type binMin: float
765 @param binMin: minimum value from which to bin data
766 @type binMax: float
767 @param binMax: maximum value from which to bin data
768 @type binTotal: int
769 @param binTotal: number of bins
770 @rtype: list
771 @return: binned data, in format [bin centre, frequency]
772
773 """
774
775 binStep=float(binMax-binMin)/binTotal
776 bins=[]
777 for i in range(binTotal):
778 bins.append(0)
779 for item in data:
780 if item>(binMin+(i*binStep)) \
781 and item<=(binMin+((i+1)*binStep)):
782 bins[i]=bins[i]+1
783
784
785 coords=[]
786 for i in range(binTotal):
787 coords.append([binMin+(float(i+0.5)*binStep), bins[i]])
788
789 return coords
790
791
793 """Bins the input data, recorded frequency is sum of weights in bin.
794
795 @param data: input data, must be a one dimensional list
796 @type binMin: float
797 @param binMin: minimum value from which to bin data
798 @type binMax: float
799 @param binMax: maximum value from which to bin data
800 @type binTotal: int
801 @param binTotal: number of bins
802 @rtype: list
803 @return: binned data, in format [bin centre, frequency]
804
805 """
806
807 binStep=float(binMax-binMin)/binTotal
808 bins=[]
809 for i in range(binTotal):
810 bins.append(0.0)
811 for item, weight in zip(data, weights):
812 if item>(binMin+(i*binStep)) \
813 and item<=(binMin+((i+1)*binStep)):
814 bins[i]=bins[i]+weight
815
816
817 coords=[]
818 for i in range(binTotal):
819 coords.append([binMin+(float(i+0.5)*binStep), bins[i]])
820
821 return coords
822
823
824