在Python中实现SAS的PROC MEANS功能

在SAS中,使用PROC MEANS过程配合WEIGHT和OUTPUT等语句可以很方便地计算不同组合下的N, SUMWGT, SUM, MEAN, MIN, MAX, STD等统计量。Python是免费开源的编程语言,配合pandas库可以完成很多的统计计算和科学数据分析工作。

那么,python中有没有类似的代码来完成SAS中的PROC MEANS任务呢?参考Stack Overflow上的一篇问答帖子,我测试并重新整理成了以下完整的代码以及用法示例。读者可以很方便地在FUNC_GENS部分加入更多自己的函数。

#!/usr/bin/env python3
# coding: utf-8

import pandas as pd

def proc_means(data, var_names_in, var_names_out, var_functions, 
               var_name_weight = None, var_names_group = None):
    """ An implementation of SAS PROC MEANS in python
    
        See: Creating a partial SAS PROC SUMMARY replacement in Python/Pandas
        http://stackoverflow.com/questions/29926940/creating-a-partial-sas-proc-summary-replacement-in-python-pandas
    
        Arguments:
        data:            A pandas dataframe.
        var_names_in:    A list of variable names in the dataframe to be used for calculation.
        var_names_out:   A list of variable names to be returned after the calculation.
        var_functions:   A list of functions, as shown in FUNC_GENS.
        var_name_weight: The weight factor.
        var_names_group: A list of variable names to be used for groupby.
    """
    FUNC_GENS = {
        "mean"  : lambda var_name_in, var_name_weight: lambda x: x[var_name_in].mean(),
        "max"   : lambda var_name_in, var_name_weight: lambda x: x[var_name_in].max(),
        "min"   : lambda var_name_in, var_name_weight: lambda x: x[var_name_in].min(),
        "sumwgt": lambda var_name_in, var_name_weight: lambda x: x[var_name_weight].sum(),
        "count" : lambda var_name_in, var_name_weight: lambda x: x[var_name_in].count() ,
        "sum":    lambda var_name_in, var_name_weight: lambda x: (x[var_name_in] * x[var_name_weight]).sum(),
        "wmean" : lambda var_name_in, var_name_weight: lambda x: (x[var_name_in] * x[var_name_weight]).sum() / x[var_name_weight].sum(),
        "std"   : lambda var_name_in, var_name_weight: lambda x: x[var_name_in].std(),
    }
    result = pd.DataFrame()
    if var_names_group is None:
        grouped = data.groupby(lambda x: True)
    else:
        grouped = data.groupby(var_names_group)
    for var_name_in, var_name_out, var_function in \
            zip(var_names_in,var_names_out,var_functions):
        func_gen = FUNC_GENS[var_function]
        func = func_gen(var_name_in, var_name_weight)
        result[var_name_out] = grouped.apply(func)
    return result

# Usage example:

df = pd.DataFrame(
    {'A' : ['foo', 'bar', 'foo', 'bar',
            'foo', 'bar', 'foo', 'foo'],
     'B' : ['one', 'one', 'two', 'three',
            'two', 'two', 'one', 'three'],
     'C' : [1,2,3,4,5,6,7,8],
     'D' : [100, 200, 300, 400, 500, 600, 700, 800]
    })
print(df)
'''
     A      B  C    D
0  foo    one  1  100
1  bar    one  2  200
2  foo    two  3  300
3  bar  three  4  400
4  foo    two  5  500
5  bar    two  6  600
6  foo    one  7  700
7  foo  three  8  800
'''

result = proc_means(
           data = df,
           var_names_in = ['D', 'D', 'D', 'D', 'D', 'D', 'D', 'D'],
           var_names_out = ['D_mean', 'D_max', 'D_min', 'D_sum', 'D_count', 'D_sumwgt', 'D_wmean', 'D_std'],
           var_functions=['mean','max', 'min', 'sum', 'count', 'sumwgt', 'wmean', 'std'] ,
           var_name_weight="C",
           var_names_group = ['A', 'B'])
print(result)
'''
           D_mean  D_max  D_min  D_sum  D_count  D_sumwgt  D_wmean       D_std
A   B
bar one       200    200    200    400        1         2      200         NaN
    three     400    400    400   1600        1         4      400         NaN
    two       600    600    600   3600        1         6      600         NaN
foo one       400    700    100   5000        2         8      625  424.264069
    three     800    800    800   6400        1         8      800         NaN
    two       400    500    300   3400        2         8      425  141.421356
'''

发表评论

电子邮件地址不会被公开。 必填项已用*标注