Pandas > Modify & transform data

Name
Code
Output
# build a 5x3 frame of random integers taken from [0, 100)
random_values = numpy.random.randint(low=0, high=100, size=(5, 3))
dataframe = pandas.DataFrame(random_values, columns=['C1', 'C2', 'C3'])
print("dataframe")
print(dataframe)
print("dataframe added new columns")
# derived column: element-wise C1 + 10
shifted_c1 = dataframe['C1'] + 10
dataframe['C1p10'] = shifted_c1
# constant column: the scalar is repeated on every row
dataframe['C101'] = 101
print(dataframe)
dataframe
   C1  C2  C3
0  14  81  99
1  33  89  95
2  60  27  21
3  19  85  44
4  20  62  57
dataframe added new columns
   C1  C2  C3  C1p10  C101
0  14  81  99     24   101
1  33  89  95     43   101
2  60  27  21     70   101
3  19  85  44     29   101
4  20  62  57     30   101
# two 3x2 frames of random integers from [0, 10) with the same column names
dataframe1 = pandas.DataFrame(numpy.random.randint(0,10,size=(3, 2)), columns=['C1','C2'])
dataframe2 = pandas.DataFrame(numpy.random.randint(0,10,size=(3, 2)), columns=['C1','C2'])
print ("dataframe1")
print (dataframe1)
print ("dataframe2")
print (dataframe2)
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pandas.concat with ignore_index=True stacks the rows of both frames and
# renumbers the resulting index 0..n-1, which is what append did here
dataframe3 = pandas.concat([dataframe1, dataframe2], ignore_index=True)
print ("appended dataframes")
print (dataframe3)
dataframe1
   C1  C2
0   4   5
1   9   1
2   3   6
dataframe2
   C1  C2
0   3   8
1   8   8
2   5   6
appended dataframes
   C1  C2
0   4   5
1   9   1
2   3   6
3   3   8
4   8   8
5   5   6
# 5x2 frame of random integers taken from [0, 100)
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 2)), columns=['C1','C2'])
print ("dataframe")
print (dataframe)
# Series.map applies the given function to every element of column C1 (here: subtract 100)
dataframe['C1'] = dataframe['C1'].map(lambda x: x-100)
print ("modified dataframe")
print (dataframe)
dataframe
   C1  C2
0  27  35
1  46  38
2  21  98
3  34  30
4  76   8
modified dataframe
   C1  C2
0 -73  35
1 -54  38
2 -79  98
3 -66  30
4 -24   8
# 5x3 frame of random integers taken from [0, 10)
random_digits = numpy.random.randint(low=0, high=10, size=(5, 3))
dataframe = pandas.DataFrame(random_digits, columns=['C1', 'C2', 'C3'])
print("dataframe")
print(dataframe)
# .values returns the frame's data as a numpy array (row and column labels are not part of it)
raw_array = dataframe.values
print("numpy array")
print(raw_array)
dataframe
   C1  C2  C3
0   3   0   0
1   9   8   0
2   4   3   5
3   4   5   9
4   5   6   7
numpy array
[[3 0 0]
 [9 8 0]
 [4 3 5]
 [4 5 9]
 [5 6 7]]
# two 3x2 frames of random integers from [0, 100) with identical column names
first_values = numpy.random.randint(low=0, high=100, size=(3, 2))
dataframe1 = pandas.DataFrame(first_values, columns=['C1', 'C2'])
second_values = numpy.random.randint(low=0, high=100, size=(3, 2))
dataframe2 = pandas.DataFrame(second_values, columns=['C1', 'C2'])
print("dataframe1")
print(dataframe1)
print("dataframe2")
print(dataframe2)
# concat stacks the frames row-wise; each frame keeps its own index labels,
# so the labels 0..2 appear twice in the result
frames_to_stack = [dataframe1, dataframe2]
dataframe3 = pandas.concat(frames_to_stack)
print("concatenated dataframes")
print(dataframe3)
dataframe1
   C1  C2
0  37  43
1   8  19
2  69  23
dataframe2
   C1  C2
0   1  20
1  63   8
2  82  41
concatenated dataframes
   C1  C2
0  37  43
1   8  19
2  69  23
0   1  20
1  63   8
2  82  41
def _times_hundred(x):
    # multiply whatever transform hands over by 100
    return x * 100


# 5x3 frame of random integers taken from [0, 5)
small_values = numpy.random.randint(low=0, high=5, size=(5, 3))
dataframe = pandas.DataFrame(small_values, columns=['C1', 'C2', 'C3'])
print("dataframe")
print(dataframe)
# transform returns a new frame of the same shape; the original is left untouched
new_dataframe = dataframe.transform(_times_hundred)
print("transformed data")
print(new_dataframe)
dataframe
   C1  C2  C3
0   3   4   4
1   4   3   3
2   2   3   2
3   3   0   3
4   4   3   2
transformed data
    C1   C2   C3
0  300  400  400
1  400  300  300
2  200  300  200
3  300    0  300
4  400  300  200
# 5x3 frame of random integers taken from [0, 100)
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
# columns='C1' removes the column labelled C1 (same as labels='C1', axis=1);
# passing the axis positionally, drop('C1', 1), is no longer accepted since pandas 2.0
dataframe = dataframe.drop(columns='C1')
print ("dataframe with removed column 1")
print (dataframe)
dataframe
   C1  C2  C3
0  73  97  36
1  97  79   8
2  22  10  80
3   0  92  69
4  54   8  75
dataframe with removed column 1
   C2  C3
0  97  36
1  79   8
2  10  80
3  92  69
4   8  75
# two 3x2 frames of random integers from [0, 4); both have a C1 and a C2 column
left_values = numpy.random.randint(low=0, high=4, size=(3, 2))
dataframe1 = pandas.DataFrame(left_values, columns=['C1', 'C2'])
right_values = numpy.random.randint(low=0, high=4, size=(3, 2))
dataframe2 = pandas.DataFrame(right_values, columns=['C1', 'C2'])
print("dataframe1")
print(dataframe1)
print("dataframe2")
print(dataframe2)
# inner: only C1 values found in both frames survive;
# the clashing C2 columns get the default _x / _y suffixes
dataframe3 = pandas.merge(left=dataframe1, right=dataframe2, how='inner', on='C1')
print("inner merged dataframes")
print(dataframe3)
# outer: every C1 value from either frame is kept, missing cells become NaN
dataframe4 = pandas.merge(left=dataframe1, right=dataframe2, how='outer', on='C1')
print("outer merged dataframes")
print(dataframe4)
dataframe1
   C1  C2
0   3   3
1   0   1
2   1   3
dataframe2
   C1  C2
0   3   2
1   3   0
2   3   3
inner merged dataframes
   C1  C2_x  C2_y
0   3     3     2
1   3     3     0
2   3     3     3
outer merged dataframes
   C1  C2_x  C2_y
0   3     3   2.0
1   3     3   0.0
2   3     3   3.0
3   0     1   NaN
4   1     3   NaN
# two 3x2 frames of random integers from [0, 4) with different column names
left_values = numpy.random.randint(low=0, high=4, size=(3, 2))
dataframe1 = pandas.DataFrame(left_values, columns=['C1', 'C2'])
right_values = numpy.random.randint(low=0, high=4, size=(3, 2))
dataframe2 = pandas.DataFrame(right_values, columns=['C3', 'C4'])
print("dataframe1")
print(dataframe1)
print("dataframe2")
print(dataframe2)
# join aligns the frames on their index labels; how='right' keeps the labels of dataframe2
dataframe3 = dataframe1.join(other=dataframe2, how='right')
print("joined dataframes")
print(dataframe3)
dataframe1
   C1  C2
0   3   1
1   1   1
2   2   0
dataframe2
   C3  C4
0   0   3
1   2   3
2   0   3
joined dataframes
   C1  C2  C3  C4
0   3   1   0   3
1   1   1   2   3
2   2   0   0   3
# two 3x2 frames of random integers from [0, 4); both have a C1 and a C2 column
left_values = numpy.random.randint(low=0, high=4, size=(3, 2))
dataframe1 = pandas.DataFrame(left_values, columns=['C1', 'C2'])
right_values = numpy.random.randint(low=0, high=4, size=(3, 2))
dataframe2 = pandas.DataFrame(right_values, columns=['C1', 'C2'])
print("dataframe1")
print(dataframe1)
print("dataframe2")
print(dataframe2)
# left: every row of dataframe1 is kept, unmatched C2_y cells become NaN
dataframe3 = pandas.merge(left=dataframe1, right=dataframe2, how='left', on='C1')
print("left merged dataframes")
print(dataframe3)
# right: every row of dataframe2 is kept, unmatched C2_x cells become NaN
dataframe4 = pandas.merge(left=dataframe1, right=dataframe2, how='right', on='C1')
print("right merged dataframes")
print(dataframe4)
dataframe1
   C1  C2
0   0   3
1   0   0
2   1   3
dataframe2
   C1  C2
0   2   3
1   0   3
2   3   1
left merged dataframes
   C1  C2_x  C2_y
0   0     3   3.0
1   0     0   3.0
2   1     3   NaN
right merged dataframes
   C1  C2_x  C2_y
0   0   3.0     3
1   0   0.0     3
2   2   NaN     3
3   3   NaN     1
# 3x2 frame of random integers taken from [0, 100)
wide_values = numpy.random.randint(low=0, high=100, size=(3, 2))
dataframe = pandas.DataFrame(wide_values, columns=['C1', 'C2'])
print("dataframe")
print(dataframe)
# melt turns the wide frame into long format: one row per cell, with the
# former column name in 'variable' and the cell content in 'value'
dataframe2 = pandas.melt(dataframe)
print("melted dataframe")
print(dataframe2)
dataframe
   C1  C2
0  83  62
1  18  20
2  78  94
melted dataframe
  variable  value
0       C1     83
1       C1     18
2       C1     78
3       C2     62
4       C2     20
5       C2     94
# 5x3 frame of random integers taken from [0, 10)
random_digits = numpy.random.randint(low=0, high=10, size=(5, 3))
dataframe = pandas.DataFrame(random_digits, columns=['C1', 'C2', 'C3'])
print("dataframe")
print(dataframe)
# reindex picks rows by index label in the requested order; a label may be
# listed more than once (3 is), and labels left out (0 here) are dropped
requested_labels = [3, 4, 1, 2, 3]
dataframe2 = dataframe.reindex(index=requested_labels)
print("dataframe renamed")
print(dataframe2)
dataframe
   C1  C2  C3
0   4   8   9
1   4   2   1
2   4   9   8
3   3   2   3
4   8   7   5
dataframe renamed
   C1  C2  C3
3   3   2   3
4   8   7   5
1   4   2   1
2   4   9   8
3   3   2   3
# 5x3 frame of random integers taken from [0, 100)
dataframe = pandas.DataFrame(numpy.random.randint(0,100,size=(5, 3)), columns=['C1','C2','C3'])
print ("dataframe")
print (dataframe)
# index=2 removes the row labelled 2 (same as labels=2, axis=0);
# passing the axis positionally, drop(2, 0), is no longer accepted since pandas 2.0
dataframe = dataframe.drop(index=2)
print ("dataframe with removed row 2")
print (dataframe)
dataframe
   C1  C2  C3
0  14  32  53
1  41  96  84
2  11  90  85
3  52  11  13
4  94  72  33
dataframe with removed row 2
   C1  C2  C3
0  14  32  53
1  41  96  84
3  52  11  13
4  94  72  33
# 5x2 frame of random integers taken from [0, 100)
random_values = numpy.random.randint(low=0, high=100, size=(5, 2))
dataframe = pandas.DataFrame(random_values, columns=['C1', 'C2'])
print("dataframe")
print(dataframe)
# old name -> new name; columns not mentioned keep their name
new_column_names = {'C1': 'C100'}
# inplace=True changes this frame directly; without it rename returns a new frame
dataframe.rename(columns=new_column_names, inplace=True)
print("dataframe renamed column")
print(dataframe)
dataframe
   C1  C2
0  65  42
1  37  12
2  84   3
3  42  54
4  55  45
dataframe renamed column
   C100  C2
0    65  42
1    37  12
2    84   3
3    42  54
4    55  45
# 5x3 frame of random integers taken from [0, 5)
small_values = numpy.random.randint(low=0, high=5, size=(5, 3))
dataframe = pandas.DataFrame(small_values, columns=['C1', 'C2', 'C3'])
print("dataframe")
print(dataframe)
# replace returns a new frame in which every cell equal to 1 holds -100 instead
dataframe2 = dataframe.replace(to_replace=1, value=-100)
print("aggregated data")
print(dataframe2)
dataframe
   C1  C2  C3
0   3   1   3
1   1   3   4
2   4   1   0
3   4   0   3
4   4   0   2
aggregated data
    C1   C2  C3
0    3 -100   3
1 -100    3   4
2    4 -100   0
3    4    0   3
4    4    0   2
# 5x3 frame of random integers taken from [0, 10)
dataframe = pandas.DataFrame(numpy.random.randint(0,10,size=(5, 3)), columns=['C1','C2','C3'])
# remove the rows labelled 2 and 3 so the index has a gap (0, 1, 4);
# the index= keyword replaces the positional axis argument, drop(2, 0),
# which is no longer accepted since pandas 2.0
dataframe = dataframe.drop(index=[2, 3])
print ("dataframe")
print (dataframe)
print ("dataframe reindex")
# reset_index(drop=True) renumbers the rows 0..n-1 and discards the old labels
print (dataframe.reset_index(drop=True))
dataframe
   C1  C2  C3
0   7   8   4
1   9   6   1
4   7   9   5
dataframe reindex
   C1  C2  C3
0   7   8   4
1   9   6   1
2   7   9   5
# 5x3 frame of random integers taken from [0, 10)
random_digits = numpy.random.randint(low=0, high=10, size=(5, 3))
dataframe = pandas.DataFrame(random_digits, columns=['C1', 'C2', 'C3'])
print("dataframe")
print(dataframe)
print("dataframe index")
original_labels = dataframe.index.values
print(original_labels)
# use the values of column C2 as row labels; inplace=True changes this frame directly
dataframe.set_index(keys='C2', inplace=True)
print("dataframe new index")
replaced_labels = dataframe.index.values
print(replaced_labels)
dataframe
   C1  C2  C3
0   8   6   9
1   1   2   0
2   2   4   5
3   0   1   7
4   3   2   2
dataframe index
[0 1 2 3 4]
dataframe new index
[6 2 4 1 2]
# 5x4 frame of random integers taken from [0, 5)
small_values = numpy.random.randint(low=0, high=5, size=(5, 4))
dataframe = pandas.DataFrame(small_values, columns=['C1', 'C2', 'C3', 'C4'])
print("dataframe")
print(dataframe)
# rows are ordered by C1 first; C2 decides between rows with equal C1
sort_columns = ['C1', 'C2']
sorted_dataframe = dataframe.sort_values(by=sort_columns, axis=0)
print("sorted dataframe")
print(sorted_dataframe)
dataframe
   C1  C2  C3  C4
0   2   1   2   2
1   4   0   3   2
2   3   2   4   4
3   1   1   0   2
4   1   2   2   4
sorted dataframe
   C1  C2  C3  C4
3   1   1   0   2
4   1   2   2   4
0   2   1   2   2
2   3   2   4   4
1   4   0   3   2
# 3x2 frame of random integers taken from [0, 100)
random_values = numpy.random.randint(low=0, high=100, size=(3, 2))
dataframe = pandas.DataFrame(random_values, columns=['C1', 'C2'])
print("dataframe")
print(dataframe)
# stack moves the column labels into an inner index level,
# giving one value per (row label, column label) pair
dataframe2 = dataframe.stack()
print("stacked dataframe")
print(dataframe2)
# unstack moves that inner level back into columns
dataframe3 = dataframe2.unstack()
print("unstacked dataframe")
print(dataframe3)
dataframe
   C1  C2
0  51  54
1  44  72
2  66  35
stacked dataframe
0  C1    51
   C2    54
1  C1    44
   C2    72
2  C1    66
   C2    35
dtype: int64
unstacked dataframe
   C1  C2
0  51  54
1  44  72
2  66  35
# 5x3 frame of random integers taken from [0, 10)
random_digits = numpy.random.randint(low=0, high=10, size=(5, 3))
dataframe = pandas.DataFrame(random_digits, columns=['C1', 'C2', 'C3'])
print("dataframe")
print(dataframe)
# transposing swaps rows and columns: column names become row labels and vice versa
flipped = dataframe.transpose()
print("transposed dataframe")
print(flipped)
dataframe
   C1  C2  C3
0   5   0   9
1   0   2   7
2   9   6   8
3   2   1   1
4   6   5   5
transposed dataframe
    0  1  2  3  4
C1  5  0  9  2  6
C2  0  2  6  1  5
C3  9  7  8  1  5