See also:   NumPy  | Matplotlib  | All Cheat Sheets
 

Pandas > All categories


Name Code Output
Add new columns
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
print ("dataframe added new columns")
dataframe['C1p10']=dataframe['C1'] + 10
dataframe['C101']=101
print (dataframe)
dataframe
   C1  C2  C3
0  14  81  99
1  33  89  95
2  60  27  21
3  19  85  44
4  20  62  57
dataframe added new columns
   C1  C2  C3  C1p10  C101
0  14  81  99     24   101
1  33  89  95     43   101
2  60  27  21     70   101
3  19  85  44     29   101
4  20  62  57     30   101
Append dataframes
dataframe1 = pandas.DataFrame(numpy.random.randint(0,10,size=(3, 2)), columns=['C1','C2'])
dataframe2 = pandas.DataFrame(numpy.random.randint(0,10,size=(3, 2)), columns=['C1','C2'])
print ("dataframe1")
print (dataframe1)
print ("dataframe2")
print (dataframe2)
dataframe3 = dataframe1.append(dataframe2,ignore_index = True)
print ("appended dataframes")
print (dataframe3)
dataframe1
   C1  C2
0   4   5
1   9   1
2   3   6
dataframe2
   C1  C2
0   3   8
1   8   8
2   5   6
appended dataframes
   C1  C2
0   4   5
1   9   1
2   3   6
3   3   8
4   8   8
5   5   6
Apply aggregate functions
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
print ("aggregates")
aggregates = dataframe.agg(['sum', 'max','mean'])
print (aggregates)
dataframe
   C1  C2  C3
0   4   3   0
1   5   0   9
2   8   9   2
3   6   7   5
4   5   9   2
aggregates
        C1    C2    C3
sum   28.0  28.0  18.0
max    8.0   9.0   9.0
mean   5.6   5.6   3.6
Apply function along axis
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
# sum for columns
sum_columns = dataframe[['C1','C2']].apply(sum,axis=0)
print ("sum for columns")
print (sum_columns)
# sum for rows
sum_rows = dataframe[['C1','C2']].apply(sum,axis=1)
print ("sum for rows")
print (sum_rows)
dataframe
   C1  C2
0  56   2
1  94  89
2   0  56
3  62  80
4  80  98
sum for columns
C1    292
C2    325
dtype: int64
sum for rows
0     58
1    183
2     56
3    142
4    178
dtype: int64
Apply function to dataframe (map/lambda)
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
# cross-tabulation of two factors (default is a frequency table) 
dataframe['C1'] = dataframe['C1'].map(lambda x: x-100)
print ("modified dataframe")
print (dataframe)
dataframe
   C1  C2
0  27  35
1  46  38
2  21  98
3  34  30
4  76   8
modified dataframe
   C1  C2
0 -73  35
1 -54  38
2 -79  98
3 -66  30
4 -24   8
Apply function to each element
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
function_result = dataframe.applymap(lambda x: x*10)
print ("apply result")
print (function_result)
dataframe
   C1  C2
0  54  11
1  31  59
2  81  76
3  42  46
4   8  24
apply result
    C1   C2
0  540  110
1  310  590
2  810  760
3  420  460
4   80  240
Calculate correlation
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
print ("dataframe correlation")
print (dataframe.corr())
dataframe
   C1  C2  C3
0  82  71  74
1  62  12   8
2  97  86  26
3  26  98  48
4   5  97  29
dataframe correlation
          C1        C2        C3
C1  1.000000 -0.339904  0.107885
C2 -0.339904  1.000000  0.456666
C3  0.107885  0.456666  1.000000
Calculate covariance
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
print ("dataframe covariance")
print (dataframe.cov())
dataframe
   C1  C2  C3
0  59  45  78
1  24  43  84
2  65  68  56
3  72  21  54
4   4  14  44
dataframe covariance
        C1      C2     C3
C1  861.70  279.55   41.3
C2  279.55  459.70  152.2
C3   41.30  152.20  289.2
Calculate cross-tabluation of factors
dataframe = pandas.DataFrame(numpy.random.randint(0,4,size=(5, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
# cross-tabulation of two factors (default is a frequency table) 
aggregate = pandas.crosstab(dataframe.C1, dataframe.C2)
print ("aggregate data")
print (aggregate)
dataframe
   C1  C2
0   0   3
1   2   2
2   3   2
3   2   1
4   2   1
aggregate data
C2  1  2  3
C1         
0   0  0  1
2   2  1  0
3   0  1  0
Change dataframe to numpy array
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
print ("numpy array")
print (dataframe.values)
dataframe
   C1  C2  C3
0   3   0   0
1   9   8   0
2   4   3   5
3   4   5   9
4   5   6   7
numpy array
[[3 0 0]
 [9 8 0]
 [4 3 5]
 [4 5 9]
 [5 6 7]]
Check duplicates
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
# alternatively duplicates = dataframe.C1.duplicated()
duplicates = dataframe.duplicated('C1')
print ("duplicates")
print (duplicates)
dataframe
   C1  C2
0   9   1
1   2   2
2   6   1
3   9   7
4   0   7
duplicates
0    False
1    False
2    False
3     True
4    False
dtype: bool
Concatenate dataframes
dataframe1 = pandas.DataFrame(numpy.random.randint(0,100,size=(3, 2)), columns=['C1','C2'])
dataframe2 = pandas.DataFrame(numpy.random.randint(0,100,size=(3, 2)), columns=['C1','C2'])
print ("dataframe1")
print (dataframe1)
print ("dataframe2")
print (dataframe2)
dataframe3 = pandas.concat([dataframe1,dataframe2])
print ("concatenated dataframes")
print (dataframe3)
dataframe1
   C1  C2
0  37  43
1   8  19
2  69  23
dataframe2
   C1  C2
0   1  20
1  63   8
2  82  41
concatenated dataframes
   C1  C2
0  37  43
1   8  19
2  69  23
0   1  20
1  63   8
2  82  41
Create dataframe from dictionary
# creating dataframe from dictionary allows to specify different data source/type for each column

dataframe = pandas.DataFrame({
    'C1': pandas.date_range('20170101', periods=4),
    'C2' : [10,20,30,40],
    'C3': pandas.Categorical(['A','B','C','D']),
    'C4': 1})
print ("dataframe")
print (dataframe)
dataframe
          C1  C2 C3  C4
0 2017-01-01  10  A   1
1 2017-01-02  20  B   1
2 2017-01-03  30  C   1
3 2017-01-04  40  D   1
Create dataframe from numpy array
array = numpy.array([(1,2,3), (4,5,6),(7,8,9)])
dataframe = pandas.DataFrame(array,columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
dataframe
   C1  C2  C3
0   1   2   3
1   4   5   6
2   7   8   9
Create dataframe from random numpy array
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 4)), columns=['C1','C2','C3','C4'])
print ("dataframe")
print (dataframe)
dataframe
   C1  C2  C3  C4
0  27  32  73  54
1  63  21  46  34
2  68  89  31  36
3  16  47  49  64
4  28  20  82  84
Create multi-index dataframe
# muti index contains multiple numpy arrays

multi_index = [numpy.array(['Alpha','Beta','Gamma','Alpha','Beta']), numpy.array([1,2,3,4,5])]
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'],index = multi_index)
print ("dataframe")
print (dataframe)
print ("rows Alpha")
print (dataframe.loc['Alpha'])
print ("row Alpha 4")
print (dataframe.loc['Alpha',4])
dataframe
         C1  C2  C3
Alpha 1   7   7   6
Beta  2   9   3   1
Gamma 3   7   4   8
Alpha 4   7   3   2
Beta  5   4   1   3
rows Alpha
   C1  C2  C3
1   7   7   6
4   7   3   2
row Alpha 4
C1    7
C2    3
C3    2
Name: (Alpha, 4), dtype: int64
Create series from array
# series is one dimentional array with axis

series = pandas.Series([1,5,10,15,20])
print ("series")
print (series)
series
0     1
1     5
2    10
3    15
4    20
dtype: int64
Create series from dictionary
data = {'a' : 0., 'b' : 1., 'c' : 2.}
series = pandas.Series({'a' : 1, 'b' : 2, 'c' : 3})
print ("series")
print (series)
series
a    1
b    2
c    3
dtype: int64
Create series with date/time index
months_index = pandas.date_range('1/1/2015', periods=12, freq='M')
series = pandas.Series(numpy.random.randint(0,100,size=(12,)),index = months_index)
print ("series")
print (series)
series
2015-01-31    45
2015-02-28    59
2015-03-31    66
2015-04-30    50
2015-05-31    82
2015-06-30     2
2015-07-31    18
2015-08-31    41
2015-09-30    63
2015-10-31    37
2015-11-30    89
2015-12-31    31
Freq: M, dtype: int64
Dataframe lookup labels
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(3, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
print ("Info axis")
print (dataframe.lookup([1,2],['C1','C1']))
dataframe
   C1  C2
0  62  41
1  67  49
2  12  96
Info axis
[67 12]
Dataframe transform using lambda function
dataframe = pandas.DataFrame(numpy.random.randint(0,5,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
new_dataframe = dataframe.transform(lambda x: x*100)
print ("transformed data")
print (new_dataframe)
dataframe
   C1  C2  C3
0   3   4   4
1   4   3   3
2   2   3   2
3   3   0   3
4   4   3   2
transformed data
    C1   C2   C3
0  300  400  400
1  400  300  300
2  200  300  200
3  300    0  300
4  400  300  200
Delete (drop) single column
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
# second parameter is axis, for columns is 1
dataframe = dataframe.drop('C1', 1)
print ("dataframe with removed column 1")
print (dataframe)
dataframe
   C1  C2  C3
0  73  97  36
1  97  79   8
2  22  10  80
3   0  92  69
4  54   8  75
dataframe with removed column 1
   C2  C3
0  97  36
1  79   8
2  10  80
3  92  69
4   8  75
Describe dataframe
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 4)), columns=['C1','C2','C3','C4'])
print ("dataframe")
print (dataframe)
print ("describe dataframe")
print (dataframe.describe())
dataframe
   C1  C2  C3  C4
0  28  88  51  26
1   4  81  68  63
2  53  86  90  36
3  61  99  27  95
4  39  97  59  13
describe dataframe
              C1         C2         C3         C4
count   5.000000   5.000000   5.000000   5.000000
mean   37.000000  90.200000  59.000000  46.600000
std    22.394196   7.596052  23.075962  32.700153
min     4.000000  81.000000  27.000000  13.000000
25%    28.000000  86.000000  51.000000  26.000000
50%    39.000000  88.000000  59.000000  36.000000
75%    53.000000  97.000000  68.000000  63.000000
max    61.000000  99.000000  90.000000  95.000000
Display dataframe histogram
# requires: import matplotlib
dataframe = pandas.DataFrame(numpy.random.randint(0,5,size=(10, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
print ("dataframe histogram")
print (dataframe.hist())
dataframe
   C1  C2
0   4   4
1   3   3
2   4   4
3   2   0
4   3   0
5   1   2
6   4   1
7   4   1
8   0   3
9   4   4
dataframe histogram
[[<matplotlib.axes._subplots.AxesSubplot object at 0x117585f98>
  <matplotlib.axes._subplots.AxesSubplot object at 0x11762ada0>]]
Display dataframe histogram
Display dataframe plot
# requires: import matplotlib
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
print ("show plot")
dataframe.plot()
matplotlib.pyplot.show()
dataframe
   C1  C2
0   3   8
1   2   2
2   6   9
3   9   7
4   8   2
show plot
Display dataframe plot
Drop columns with empty values
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 3)), columns=['C1','C2','C3'])
dataframe['C2'] = numpy.nan
print ("dataframe")
print (dataframe)
# option with inplace will change the same dataframe
dataframe2 = dataframe.dropna(axis=1)
print ("dataframe drop empty column")
print (dataframe2)
dataframe
   C1  C2  C3
0  86 NaN   8
1  40 NaN  39
2  35 NaN  31
3  52 NaN  10
4  73 NaN  41
dataframe drop empty column
   C1  C3
0  86   8
1  40  39
2  35  31
3  52  10
4  73  41
Drop duplicates
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
dataframe2 = dataframe.drop_duplicates('C1', keep='first')
print ("dataframe without duplicates")
print (dataframe2)
dataframe
   C1  C2
0   0   6
1   0   9
2   3   3
3   3   0
4   4   6
dataframe without duplicates
   C1  C2
0   0   6
2   3   3
4   4   6
Drop rows with empty values
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 3)), columns=['C1','C2','C3'])
dataframe.loc[3] = numpy.nan
print ("dataframe")
print (dataframe)
# option with inplace will change the same dataframe
dataframe2 = dataframe.dropna(axis=0)
print ("dataframe drop empty row")
print (dataframe2)
dataframe
     C1    C2    C3
0  81.0  65.0  92.0
1  75.0  15.0  46.0
2  44.0  96.0   7.0
3   NaN   NaN   NaN
4  34.0  95.0  76.0
dataframe drop empty row
     C1    C2    C3
0  81.0  65.0  92.0
1  75.0  15.0  46.0
2  44.0  96.0   7.0
4  34.0  95.0  76.0
Fill missing/empty values
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 3)), columns=['C1','C2','C3'])
dataframe['C2'] = numpy.nan
dataframe.loc[3] = numpy.nan
print ("dataframe")
print (dataframe)
# option with inplace will change the same dataframe
dataframe2 = dataframe.fillna(77)
print ("dataframe with filled empty values")
print (dataframe2)
dataframe
     C1  C2    C3
0  11.0 NaN  41.0
1  14.0 NaN   9.0
2  79.0 NaN   5.0
3   NaN NaN   NaN
4   5.0 NaN  70.0
dataframe with filled empty values
     C1    C2    C3
0  11.0  77.0  41.0
1  14.0  77.0   9.0
2  79.0  77.0   5.0
3  77.0  77.0  77.0
4   5.0  77.0  70.0
Filter rows or columns
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
dataframe2 = dataframe.filter(items=[1,3],axis=0)
print ("dataframe selected items")
print (dataframe2)
dataframe3 = dataframe.filter(items=['C1','C3'],axis=1)
print ("dataframe selected items")
print (dataframe3)
dataframe
   C1  C2  C3
0   8   2   8
1   5   7   7
2   9   5   0
3   6   8   1
4   0   1   9
dataframe selected items
   C1  C2  C3
1   5   7   7
3   6   8   1
dataframe selected items
   C1  C3
0   8   8
1   5   7
2   9   0
3   6   1
4   0   9
Get info axis
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(3, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
print ("Info axis")
print (dataframe.keys())
dataframe
   C1  C2
0  15  99
1  40   2
2   5  79
Info axis
Index(['C1', 'C2'], dtype='object')
Group by column using sum
dataframe = pandas.DataFrame(numpy.random.randint(0,5,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
aggregate = dataframe.groupby('C1').sum()
print ("aggregated data")
print (aggregate)
dataframe
   C1  C2  C3
0   2   4   2
1   2   1   1
2   4   1   4
3   3   1   1
4   3   2   0
aggregated data
    C2  C3
C1        
2    5   3
3    3   1
4    1   4
Import pandas
import pandas

# Pandas by convention is usually imported as pd  (import pandas as pd). 
# Here and in other cheats we use full name to avoid confusion

                  
              
Inner/outer dataframes merge
dataframe1 = pandas.DataFrame(numpy.random.randint(0,4,size=(3, 2)), columns=['C1','C2'])
dataframe2 = pandas.DataFrame(numpy.random.randint(0,4,size=(3, 2)), columns=['C1','C2'])
print ("dataframe1")
print (dataframe1)
print ("dataframe2")
print (dataframe2)
dataframe3 = pandas.merge(dataframe1, dataframe2, on='C1', how='inner')
print ("inner merged dataframes")
print (dataframe3)
dataframe4 = pandas.merge(dataframe1, dataframe2, on='C1', how='outer')
print ("outer merged dataframes")
print (dataframe4)
dataframe1
   C1  C2
0   3   3
1   0   1
2   1   3
dataframe2
   C1  C2
0   3   2
1   3   0
2   3   3
inner merged dataframes
   C1  C2_x  C2_y
0   3     3     2
1   3     3     0
2   3     3     3
outer merged dataframes
   C1  C2_x  C2_y
0   3     3   2.0
1   3     3   0.0
2   3     3   3.0
3   0     1   NaN
4   1     3   NaN
Iterate over column name / series pairs
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(3, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
print (10*'-')
print ("Iteration \n")
for col_idx,data in dataframe.iteritems():
    print ("column:",col_idx)
    print ("column data:")
    print (data,"\n")
dataframe
   C1  C2
0  58  37
1  31   0
2  53  60
----------
Iteration 

column: C1
column data:
0    58
1    31
2    53
Name: C1, dtype: int64 

column: C2
column data:
0    37
1     0
2    60
Name: C2, dtype: int64
Iterate over columns
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(3, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
print (10*'-')
print ("Iteration \n")
for col_idx,data in dataframe.iteritems():
    print ("column:",col_idx)
    print ("column data:")
    print (data,"\n")
dataframe
   C1  C2
0  63  20
1  25  47
2  60  65
----------
Iteration 

column: C1
column data:
0    63
1    25
2    60
Name: C1, dtype: int64 

column: C2
column data:
0    20
1    47
2    65
Name: C2, dtype: int64
Iterate over rows producing data series
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(3, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
print (10*'-')
print ("Iteration \n")
for col_idx,data in dataframe.iterrows():
    print ("row:",col_idx)
    print ("row data:")
    print (data,"\n")
dataframe
   C1  C2
0  47  15
1  68  59
2   8  96
----------
Iteration 

row: 0
row data:
C1    47
C2    15
Name: 0, dtype: int64 

row: 1
row data:
C1    68
C2    59
Name: 1, dtype: int64 

row: 2
row data:
C1     8
C2    96
Name: 2, dtype: int64
Iterate over rows producing tuples
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(3, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
print (10*'-')
print ("Iteration \n")
for data in dataframe.itertuples():
    print ("row data:")
    print (data,"\n")
dataframe
   C1  C2
0  39  89
1  54  76
2  93  34
----------
Iteration 

row data:
Pandas(Index=0, C1=39, C2=89) 

row data:
Pandas(Index=1, C1=54, C2=76) 

row data:
Pandas(Index=2, C1=93, C2=34)
Join dataframes
dataframe1 = pandas.DataFrame(numpy.random.randint(0,4,size=(3, 2)), columns=['C1','C2'])
dataframe2 = pandas.DataFrame(numpy.random.randint(0,4,size=(3, 2)), columns=['C3','C4'])
print ("dataframe1")
print (dataframe1)
print ("dataframe2")
print (dataframe2)
dataframe3 = dataframe1.join(dataframe2,how='right')
print ("joined dataframes")
print (dataframe3)
dataframe1
   C1  C2
0   3   1
1   1   1
2   2   0
dataframe2
   C3  C4
0   0   3
1   2   3
2   0   3
joined dataframes
   C1  C2  C3  C4
0   3   1   0   3
1   1   1   2   3
2   2   0   0   3
Left/right dataframes merge
dataframe1 = pandas.DataFrame(numpy.random.randint(0,4,size=(3, 2)), columns=['C1','C2'])
dataframe2 = pandas.DataFrame(numpy.random.randint(0,4,size=(3, 2)), columns=['C1','C2'])
print ("dataframe1")
print (dataframe1)
print ("dataframe2")
print (dataframe2)
dataframe3 = pandas.merge(dataframe1, dataframe2, on='C1', how='left')
print ("left merged dataframes")
print (dataframe3)
dataframe4 = pandas.merge(dataframe1, dataframe2, on='C1', how='right')
print ("right merged dataframes")
print (dataframe4)
dataframe1
   C1  C2
0   0   3
1   0   0
2   1   3
dataframe2
   C1  C2
0   2   3
1   0   3
2   3   1
left merged dataframes
   C1  C2_x  C2_y
0   0     3   3.0
1   0     0   3.0
2   1     3   NaN
right merged dataframes
   C1  C2_x  C2_y
0   0   3.0     3
1   0   0.0     3
2   2   NaN     3
3   3   NaN     1
Melt columns
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(3, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
dataframe2 = dataframe.melt()
print ("melted dataframe")
print (dataframe2)
dataframe
   C1  C2
0  83  62
1  18  20
2  78  94
melted dataframe
  variable  value
0       C1     83
1       C1     18
2       C1     78
3       C2     62
4       C2     20
5       C2     94
Number of dataframe dimensions
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
print ("number of dataframe dimensions")
print (dataframe.ndim)
dataframe
   C1  C2  C3
0   6   5   9
1   2   7   2
2   0   4   2
3   1   9   6
4   8   8   4
number of dataframe dimensions
2
Number of elements (size)
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
print ("dataframe number of elements")
print (dataframe.size)
dataframe
   C1  C2  C3
0   2   8   2
1   9   7   2
2   9   4   3
3   5   5   7
4   0   1   0
dataframe size
15
Pivot table
dataframe = pandas.DataFrame(numpy.random.randint(0,4,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
pivot = pandas.pivot_table(dataframe, values='C3', index=['C1'], columns=['C2'], aggfunc=sum)
print ("pivot")
print (pivot)
dataframe
   C1  C2  C3
0   1   0   3
1   1   3   0
2   1   0   2
3   2   0   2
4   1   2   2
pivot
C2    0    2    3
C1               
1   5.0  2.0  0.0
2   2.0  NaN  NaN
Print dataframe info
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
print ("dataframe info")
print (dataframe.info())
dataframe
   C1  C2  C3
0   3   5   5
1   0   8   2
2   8   1   1
3   9   5   4
4   4   7   0
dataframe info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
C1    5 non-null int64
C2    5 non-null int64
C3    5 non-null int64
dtypes: int64(3)
memory usage: 200.0 bytes
None
Print dataframe shape
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 4)), columns=['C1','C2','C3','C4'])
print ("dataframe")
print (dataframe)
shape = dataframe.shape
print ("dataframe shape")
print (shape)
print ("shape axis 0")
print (shape[0])
print ("shape axis 1")
print (shape[1])
dataframe
   C1  C2  C3  C4
0  20   7  74  62
1  40  70  16  12
2  48  34  42  98
3  66  86  41  74
4  79  33   7  56
dataframe shape
(5, 4)
shape axis 0
5
shape axis 1
4
Print pandas version
print ("Pandas version",pandas.__version__)
Pandas version 0.20.2
Reindex
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
dataframe2 = dataframe.reindex([3,4,1,2,3])
print ("dataframe renamed")
print (dataframe2)
dataframe
   C1  C2  C3
0   4   8   9
1   4   2   1
2   4   9   8
3   3   2   3
4   8   7   5
dataframe renamed
   C1  C2  C3
3   3   2   3
4   8   7   5
1   4   2   1
2   4   9   8
3   3   2   3
Remove (drop) row
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
# second parameter is axis, for rows is 0
dataframe = dataframe.drop(2, 0)
print ("dataframe with removed row 2")
print (dataframe)
dataframe
   C1  C2  C3
0  14  32  53
1  41  96  84
2  11  90  85
3  52  11  13
4  94  72  33
dataframe with removed row 2
   C1  C2  C3
0  14  32  53
1  41  96  84
3  52  11  13
4  94  72  33
Rename column
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
# option without inplace will return new dataframe
dataframe.rename(columns={'C1':'C100'},inplace=True)
print ("dataframe renamed column")
print (dataframe)
dataframe
   C1  C2
0  65  42
1  37  12
2  84   3
3  42  54
4  55  45
dataframe renamed column
   C100  C2
0    65  42
1    37  12
2    84   3
3    42  54
4    55  45
Replace values
dataframe = pandas.DataFrame(numpy.random.randint(0,5,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
dataframe2 = dataframe.replace(1,-100)
print ("aggregated data")
print (dataframe2)
dataframe
   C1  C2  C3
0   3   1   3
1   1   3   4
2   4   1   0
3   4   0   3
4   4   0   2
aggregated data
    C1   C2  C3
0    3 -100   3
1 -100    3   4
2    4 -100   0
3    4    0   3
4    4    0   2
Reset index
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
dataframe = dataframe.drop(2, 0)
dataframe = dataframe.drop(3, 0)
print ("dataframe")
print (dataframe)
print ("dataframe reindex")
print (dataframe.reset_index(drop=True))
dataframe
   C1  C2  C3
0   7   8   4
1   9   6   1
4   7   9   5
dataframe reindex
   C1  C2  C3
0   7   8   4
1   9   6   1
2   7   9   5
Save & load data from CSV file
# create dataframe
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 4)), columns=['C1','C2','C3','C4'])
print ("dataframe")
print (dataframe)
# save dataframe to csv
dataframe.to_csv('dataframe.csv')
# list directory
for entry in os.scandir('.'):
    print(entry.name)
# load dataframe from csv
loaded_dataframe = pandas.read_csv('dataframe.csv',index_col=0)
print ("loaded dataframe")
print (loaded_dataframe)
dataframe
   C1  C2  C3  C4
0  98  82  43  95
1  61  76  19  42
2   3  59  27  40
3  88  13  36  45
4  29  12  43  24
dataframe.csv
loaded dataframe
   C1  C2  C3  C4
0  98  82  43  95
1  61  76  19  42
2   3  59  27  40
3  88  13  36  45
4  29  12  43  24
Save & load data from Excel file
# requires instaled Python openpyxl module
# create dataframe
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 4)), columns=['C1','C2','C3','C4'])
print ("dataframe")
print (dataframe)
# save dataframe to Excel
dataframe.to_excel('dataframe.xlsx',sheet_name='sheet01')
# list directory
for entry in os.scandir('.'):
    print(entry.name)
# load dataframe from csv
loaded_dataframe = pandas.read_excel('dataframe.xlsx',sheet_name='sheet01')
print ("loaded dataframe")
print (loaded_dataframe)
dataframe
   C1  C2  C3  C4
0  81  13  27  17
1  58   0  44  94
2  19   0  49  87
3  12  56  93  53
4  47  92  10  11
dataframe.xlsx
loaded dataframe
   C1  C2  C3  C4
0  81  13  27  17
1  58   0  44  94
2  19   0  49  87
3  12  56  93  53
4  47  92  10  11
Select column unique values
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 4)), columns=['C1','C2','C3','C4'])
print ("dataframe")
print (dataframe)
print ("dataframe column unique values")
print (dataframe['C1'].unique())
dataframe
   C1  C2  C3  C4
0  97  77  28  12
1  61  58  86  64
2   0  12   0  62
3   0  66  88  34
4  16   6  51  60
dataframe column unique values
[97 61  0 16]
Select columns
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 4)), columns=['C1','C2','C3','C4'])
print ("dataframe")
print (dataframe)
print ("dataframe selected columns")
print (dataframe[['C1','C2']])
dataframe
   C1  C2  C3  C4
0  47  34  32  96
1  30  25  73  18
2  61  17  27   9
3  74  23  15  73
4  79  49  42  44
dataframe selected columns
   C1  C2
0  47  34
1  30  25
2  61  17
3  74  23
4  79  49
Select columns using condition with all/any
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(3, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
dataframe2 = dataframe.loc[:,(dataframe>50).any()]
print ("dataframe selected columns")
print (dataframe2)
dataframe3 = dataframe.loc[:,(dataframe>50).all()]
print ("dataframe selected columns")
print (dataframe3)
dataframe
   C1  C2  C3
0  80  41  76
1  38   6  57
2  45  14  84
dataframe selected columns
   C1  C3
0  80  76
1  38  57
2  45  84
dataframe selected columns
   C3
0  76
1  57
2  84
Select columns with null/not null values
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(3, 3)), columns=['C1','C2','C3'])
dataframe['C2'][1] = numpy.NaN
dataframe['C3'] = numpy.NaN
print ("dataframe")
print (dataframe)
dataframe2 = dataframe.loc[:,dataframe.isnull().any()]
print ("dataframe selected columns")
print (dataframe2)
dataframe3 = dataframe.loc[:,dataframe.notnull().all()]
print ("dataframe selected columns")
print (dataframe3)
dataframe
   C1    C2  C3
0  91   7.0 NaN
1  77   NaN NaN
2  54  62.0 NaN
dataframe selected columns
     C2  C3
0   7.0 NaN
1   NaN NaN
2  62.0 NaN
dataframe selected columns
   C1    C2
0  91   7.0
1  77   NaN
2  54  62.0
Select data using condition in list
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
dataframe2 = dataframe.loc[dataframe.C1.isin([1,2,3])]
print ("dataframe selected columns")
print (dataframe2)
dataframe
   C1  C2  C3
0   8   0   4
1   2   0   5
2   5   4   5
3   6   4   1
4   3   5   9
dataframe selected columns
   C1  C2  C3
1   2   0   5
4   3   5   9
Select data using multiple conditions
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 4)), columns=['C1','C2','C3','C4'])
print ("dataframe")
print (dataframe)
print ("dataframe conditional selection")
print (dataframe[(dataframe['C1']>50) & ((dataframe['C2']<25)| (dataframe['C2']>75))])
dataframe
   C1  C2  C3  C4
0  72  23  21   1
1  59  67  77  59
2  47  49  34   6
3  28   8  66  97
4  91   4  44  37
dataframe conditional selection
   C1  C2  C3  C4
0  72  23  21   1
4  91   4  44  37
Select data using query
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
dataframe2 = dataframe.query('C1 > C2')
print ("dataframe selected items")
print (dataframe2)
dataframe
   C1  C2  C3
0   7   3   2
1   1   6   8
2   9   7   3
3   3   2   7
4   4   6   0
dataframe selected items
   C1  C2  C3
0   7   3   2
2   9   7   3
3   3   2   7
Select data using single condition
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 4)), columns=['C1','C2','C3','C4'])
print ("dataframe")
print (dataframe)
print ("dataframe conditional selection")
print (dataframe[dataframe['C1']>50])
dataframe
   C1  C2  C3  C4
0  86  13   3  83
1  63  58   1  24
2  80  87  52  40
3  34  35  37  43
4  24  30  83  11
dataframe conditional selection
   C1  C2  C3  C4
0  86  13   3  83
1  63  58   1  24
2  80  87  52  40
Select items using where condition
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
dataframe2 = dataframe.where(dataframe>5)
print ("dataframe selected items")
print (dataframe2)
dataframe
   C1  C2  C3
0   9   6   4
1   7   7   0
2   1   7   7
3   4   1   5
4   6   2   6
dataframe selected items
    C1   C2   C3
0  9.0  6.0  NaN
1  7.0  7.0  NaN
2  NaN  7.0  7.0
3  NaN  NaN  NaN
4  6.0  NaN  6.0
Select row
df_index = [101,202,303,404,505]
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 3)), columns=['C1','C2','C3'],index = df_index)
print ("dataframe")
print (dataframe)
print ("row 3 with label 303")
print (dataframe.loc[303])
dataframe
     C1  C2  C3
101  88  84  54
202  15  99  10
303   6  57  57
404  23  51  51
505  92  12  36
row 3 with label 303
C1     6
C2    57
C3    57
Name: 303, dtype: int64
Select row by index number
df_index = [101,202,303,404,505]
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 3)), columns=['C1','C2','C3'],index = df_index)
print ("dataframe")
print (dataframe)
print ("row 3")
print (dataframe.iloc[2])
dataframe
     C1  C2  C3
101  26  73  73
202  14  68  61
303   6   1   1
404  40  83   1
505  49  84  31
row 2
C1    6
C2    1
C3    1
Name: 303, dtype: int64
Select rows using condition
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
# single contition
dataframe2 = dataframe.loc[dataframe['C1'] > 50]
print ("dataframe selected rows")
print (dataframe2)
# multiple conditions
dataframe3 = dataframe.loc[(dataframe['C1'] > 20) & (dataframe['C1'] < 50)]
print ("dataframe selected rows")
print (dataframe3)
dataframe
   C1  C2  C3
0  24  55  55
1  84  97  32
2  65  15  83
3  11  31  95
4  41  96  70
dataframe selected rows
   C1  C2  C3
1  84  97  32
2  65  15  83
dataframe selected rows
   C1  C2  C3
0  24  55  55
4  41  96  70
Select rows using function
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
# select elements with even index number
dataframe2 = dataframe.select(lambda x: x%2==0)
print ("dataframe selected items")
print (dataframe2)
dataframe
   C1  C2  C3
0   5   6   0
1   1   2   0
2   9   1   7
3   6   2   7
4   0   7   9
dataframe selected items
   C1  C2  C3
4   0   7   9
Set index
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
print ("dataframe index")
print (dataframe.index.values)
dataframe.set_index('C2',inplace = True)
print ("dataframe new index")
print (dataframe.index.values)
dataframe
   C1  C2  C3
0   8   6   9
1   1   2   0
2   2   4   5
3   0   1   7
4   3   2   2
dataframe index
[0 1 2 3 4]
dataframe new index
[6 2 4 1 2]
Sort dataframe along axis
dataframe = pandas.DataFrame(numpy.random.randint(0,5,size=(5, 4)), columns=['C1','C2','C3','C4'])
print ("dataframe")
print (dataframe)
sorted_dataframe = dataframe.sort_values(by=['C1','C2'],axis=0)
print ("sorted dataframe")
print (sorted_dataframe)
dataframe
   C1  C2  C3  C4
0   2   1   2   2
1   4   0   3   2
2   3   2   4   4
3   1   1   0   2
4   1   2   2   4
sorted dataframe
   C1  C2  C3  C4
3   1   1   0   2
4   1   2   2   4
0   2   1   2   2
2   3   2   4   4
1   4   0   3   2
Stack & unstack columns
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(3, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
dataframe2 = dataframe.stack()
print ("stacked dataframe")
print (dataframe2)
dataframe3 = dataframe2.unstack()
print ("unstacked dataframe")
print (dataframe3)
dataframe
   C1  C2
0  51  54
1  44  72
2  66  35
stacked dataframe
0  C1    51
   C2    54
1  C1    44
   C2    72
2  C1    66
   C2    35
dtype: int64
unstacked dataframe
   C1  C2
0  51  54
1  44  72
2  66  35
Transpose dataframe
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
print ("transposed dataframe")
print (dataframe.T)
dataframe
   C1  C2  C3
0   5   0   9
1   0   2   7
2   9   6   8
3   2   1   1
4   6   5   5
transposed dataframe
    0  1  2  3  4
C1  5  0  9  2  6
C2  0  2  6  1  5
C3  9  7  8  1  5
View columns types
dataframe = pandas.DataFrame({
    'C1': pandas.date_range('20170101', periods=4),
    'C2' : [10,20,30,40],
    'C3': pandas.Categorical(['A','B','C','D']),
    'C4': 1})
print ("dataframe column types")
print (dataframe.dtypes)
dataframe column types
C1    datetime64[ns]
C2             int64
C3          category
C4             int64
dtype: object
View dataframe columns
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 4)), columns=['C1','C2','C3','C4'])
print ("dataframe")
print (dataframe)
print ("dataframe columns")
print (dataframe.columns)
dataframe
   C1  C2  C3  C4
0  54  50  37  74
1  46  76   8  96
2  43  92  25  69
3  55  64  58   5
4  58  79   6  39
dataframe columns
Index(['C1', 'C3', 'C3', 'C4'], dtype='object')
View dataframe head
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 4)), columns=['C1','C2','C3','C4'])
print ("dataframe")
print (dataframe)
print ("dataframe head")
print (dataframe.head(2))
dataframe
   C1  C2  C3  C4
0  59  94   5  79
1  67  43  21  88
2  74  68  75  79
3  62  40  75  59
4  99  76  18  84
dataframe head
   C1  C3  C3  C4
0  59  94   5  79
1  67  43  21  88
View dataframe tail
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 4)), columns=['C1','C2','C3','C4'])
print ("dataframe")
print (dataframe)
print ("dataframe head")
print (dataframe.tail(2))
dataframe
   C1  C2  C3  C4
0  70  85   7  80
1  15  82  58  57
2  84  55  88   9
3  60   6   6  67
4  55  74  64  10
dataframe tail
   C1  C3  C3  C4
3  60   6   6  67
4  55  74  64  10