"""Demo script: pandas aggregation and grouping (groupby), plus a short
scikit-learn confusion-matrix example.

Recovered from a scraped tutorial page (the original was collapsed onto a
few unreadable lines); reformatted, comments translated to English, and
three bugs fixed:

* ``top`` used ``DataFrame.sort_index(by=...)`` (removed API) — replaced
  with ``sort_values``.
* ``top(tip, n=6)`` referenced an undefined name — should be ``tips``.
* ``grouped['tip_pct', 'total_bill']`` (removed tuple-key GroupBy
  indexing) — replaced with list indexing ``grouped[['tip_pct', ...]]``.

matplotlib / sklearn imports and the CSV read are deferred into ``main``
so the module can be imported (e.g. for testing the helpers) without
those packages or ``data/tips.csv`` being present.
"""
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Fix the RNG seed so every run produces the same "random" demo data.
np.random.seed(12345)


def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    The result is sorted ascending by ``column`` (largest row last).
    """
    # BUG FIX: original called df.sort_index(by=column), an API that was
    # removed from pandas; sort_values is the correct replacement.
    return df.sort_values(by=column)[-n:]


def demean(arr):
    """Subtract the mean from a Series/array (for use with groupby.transform)."""
    return arr - arr.mean()


def get_stats(group):
    """Summary statistics for one group.

    Combined with ``.apply(get_stats).unstack()`` this yields one row of
    min/max/count/mean per group.
    """
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}


def main():
    """Run all demos. Requires matplotlib, scikit-learn and data/tips.csv."""
    # Heavy third-party imports kept local to the demo entry point.
    import matplotlib.pyplot as plt
    from sklearn.metrics import confusion_matrix

    plt.rc('figure', figsize=(10, 6))

    # ---- confusion matrix ------------------------------------------------
    # Each row of the matrix is the true class, each column the predicted
    # class; off-diagonal entries show which classes were confused.
    y_true = [2, 1, 0, 1, 2, 0]
    y_pred = [2, 0, 0, 1, 2, 1]
    print(confusion_matrix(y_true, y_pred))

    y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
    y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"]
    print(confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"]))

    # ---- groupby basics --------------------------------------------------
    df = pd.DataFrame({'key1': list('aaaab'),
                       'key2': ['one', 'two', 'one', 'two', 'one'],
                       'data1': np.random.randn(5),
                       'data2': np.random.randn(5)})
    print(df)
    print(df.groupby('key1'))             # a lazy DataFrameGroupBy object
    print(df.groupby('key1').agg('sum'))  # agg reduces each group to one row
    # NOTE: df['data1'] is a Series; df[['data1']] is a one-column DataFrame.

    # ---- column-wise multi-function aggregation --------------------------
    df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                    'key2': ['one', 'two', 'one', 'two', 'one'],
                    'data1': np.random.randn(5),
                    'data2': np.random.randn(5)})

    tips = pd.read_csv('data/tips.csv')
    tips['tip_pct'] = tips['tip'] / tips['total_bill']
    tips[:6]

    grouped = tips.groupby(['sex', 'smoker'])
    grouped_pct = grouped['tip_pct']
    # agg vs apply: agg is for reductions (1-d array -> scalar); apply is
    # more general and also covers transformations such as sorting/top-n.
    grouped_pct.agg('mean')
    # grouped_pct.agg(['mean', 'std', peak_to_peak])
    grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])  # (name, func) pairs

    functions = ['count', 'mean', 'max']
    # BUG FIX: grouped['tip_pct', 'total_bill'] (tuple key) is not valid in
    # modern pandas; a list of column names selects multiple columns.
    result = grouped[['tip_pct', 'total_bill']].agg(functions)
    print(result)
    print(result['tip_pct'])

    print('-------------- ftuples --------------')
    ftuples = [('average', 'mean'), ('deviation', np.var)]
    print(ftuples)

    print('-------------- grouped_1 --------------')
    grouped_1 = grouped[['tip_pct', 'total_bill']].agg(ftuples)
    print(grouped_1)

    print('-------------- grouped_2 --------------')
    grouped_2 = grouped.agg({'tip': np.max, 'size': 'sum'})
    print(grouped_2)

    print('-------------- grouped_3 --------------')
    grouped_3 = grouped.agg({'tip_pct': ['min', 'max', 'mean', 'std'],
                             'size': 'sum'})
    print(grouped_3)

    # ---- group-level operations and transforms ---------------------------
    print('-------------- df --------------')
    print(df)

    k1_means = df.groupby('key1').mean().add_prefix('mean_')
    print('-------------- k1_means --------------')
    print(k1_means)

    _merge_11 = pd.merge(df, k1_means, left_on='key1', right_index=True)
    print('-------------- _merge_11 --------------')
    print(_merge_11)

    people = DataFrame(np.random.randn(5, 5),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
    print('-------------- people --------------')
    print(people)

    key = ['one', 'two', 'one', 'two', 'one']
    people.groupby(key).mean()
    people.groupby(key).transform(np.mean)  # broadcasts group means back
    print('-------------- people-groupby --------------')
    print(people)  # unchanged: transform returns a NEW frame

    demeaned = people.groupby(key).transform(demean)
    print('-------------- demeaned --------------')
    print(demeaned)

    # Group means of demeaned data are ~0 up to float rounding error.
    demeaned_2 = demeaned.groupby(key).mean()
    print('-------------- demeaned_2 --------------')
    print(demeaned_2)

    # ---- apply -----------------------------------------------------------
    # BUG FIX: original called top(tip, n=6) — 'tip' was never defined.
    top(tips, n=6)
    tips.groupby('smoker').apply(top)
    tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

    result = tips.groupby('smoker')['tip_pct'].describe()
    print('-------------- result --------------')
    print(result)

    # NOTE(review): with modern pandas, describe() on a grouped Series
    # already returns a DataFrame; unstack('smoker') matches the book's
    # older pandas behaviour — verify against the installed version.
    result_unstack = result.unstack('smoker')
    print('-------------- result_unstack --------------')
    print(result_unstack)

    tips.groupby('smoker', group_keys=False).apply(top)  # suppress group keys

    # ---- quantile and bucket analysis ------------------------------------
    frame = DataFrame({'data1': np.random.randn(1000),
                       'data2': np.random.randn(1000)})
    factor = pd.cut(frame.data1, 4)  # 4 equal-width buckets
    factor[:10]

    grouped = frame.data2.groupby(factor)
    grouped.apply(get_stats).unstack()

    grouping = pd.qcut(frame.data1, 10, labels=False)  # deciles
    grouped = frame.data2.groupby(grouping)
    grouped.apply(get_stats).unstack()

    # ---- fill missing values with group-specific values ------------------
    s = Series(np.random.randn(6))
    s[::2] = np.nan
    print(s)
    s.fillna(s.mean())

    states = ['Ohio', 'New York', 'Vermont', 'Florida',
              'Oregon', 'Nevada', 'California', 'Idaho']
    group_key = ['East'] * 4 + ['West'] * 4
    data = Series(np.random.randn(8), index=states)
    data[['Vermont', 'Nevada', 'Idaho']] = np.nan
    print(data)

    data.groupby(group_key).mean()
    fill_mean = lambda g: g.fillna(g.mean())       # per-group mean fill
    data.groupby(group_key).apply(fill_mean)

    fill_values = {'East': 0.5, 'West': -1}
    fill_func = lambda g: g.fillna(fill_values[g.name])  # fixed fill per group
    data.groupby(group_key).apply(fill_func)
    print(data)  # unchanged: apply returned new objects


if __name__ == '__main__':
    main()
pandas aggregation and grouping operations with groupby (part 2)
Guess you like
Origin http://43.154.161.224:23101/article/api/json?id=325394865&siteId=291194637
Recommended
Ranking