30.Pandas dataframes

Create a dataframe using random numbers, index and columns

select the values of a particular column
Select the values of multiple columns
Check the type of columns
Create a new column (sum of two columns)
Remove the column using axis -- drop
remove the row using axis -- drop
inplace parameter
shape of dataframe
Select the values of a row -- loc and iloc
Select a particular value from the dataframe -- [row, column]
select the values of particular rows from the specific columns

Conditional selection -- boolean values and real values
conditional selection on dataframe
conditional selection on particular column
select values of a particular column after conditional selection
Select the data based on two conditions
pandas operators (&, | )
reset_index()
set_index()

import numpy as np
import pandas as pd
from numpy.random import randn
randn(1)
array([1.05063801])
df = pd.DataFrame(data = randn(5,4), index = ['A','B','C','D','E'], columns=['W','X','Y','Z'])
print(df)
          W         X         Y         Z
A -1.560803  0.230437 -0.202898  0.327163
B -1.081210  0.208696  0.649539 -1.350474
C -1.633450  0.262663 -1.367750  0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E  1.235248 -0.135664 -0.405546  0.155646

print(df['Y'])
A   -0.202898
B    0.649539
C   -1.367750
D   -1.367424
E   -0.405546
Name: Y, dtype: float64

print(df[['Y','Z']])
          Y         Z
A -0.202898  0.327163
B  0.649539 -1.350474
C -1.367750  0.284982
D -1.367424 -0.899680
E -0.405546  0.155646

type(df['Y'])
pandas.core.series.Series
type(df)
pandas.core.frame.DataFrame
df['S'] = df['W'] + df['X']
print(df)
          W         X         Y         Z         S
A -1.560803  0.230437 -0.202898  0.327163 -1.330365
B -1.081210  0.208696  0.649539 -1.350474 -0.872514
C -1.633450  0.262663 -1.367750  0.284982 -1.370787
D -1.960716 -0.097805 -1.367424 -0.899680 -2.058521
E  1.235248 -0.135664 -0.405546  0.155646  1.099585

df['T'] = [1,2,3,4,5]
print(df)
          W         X         Y         Z         S  T
A -1.560803  0.230437 -0.202898  0.327163 -1.330365  1
B -1.081210  0.208696  0.649539 -1.350474 -0.872514  2
C -1.633450  0.262663 -1.367750  0.284982 -1.370787  3
D -1.960716 -0.097805 -1.367424 -0.899680 -2.058521  4
E  1.235248 -0.135664 -0.405546  0.155646  1.099585  5

del df['T']
print(df)
          W         X         Y         Z         S
A -1.560803  0.230437 -0.202898  0.327163 -1.330365
B -1.081210  0.208696  0.649539 -1.350474 -0.872514
C -1.633450  0.262663 -1.367750  0.284982 -1.370787
D -1.960716 -0.097805 -1.367424 -0.899680 -2.058521
E  1.235248 -0.135664 -0.405546  0.155646  1.099585

df.drop('S', axis= 1)
W X Y Z
A -1.560803 0.230437 -0.202898 0.327163
B -1.081210 0.208696 0.649539 -1.350474
C -1.633450 0.262663 -1.367750 0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E 1.235248 -0.135664 -0.405546 0.155646
print(df)
          W         X         Y         Z         S
A -1.560803  0.230437 -0.202898  0.327163 -1.330365
B -1.081210  0.208696  0.649539 -1.350474 -0.872514
C -1.633450  0.262663 -1.367750  0.284982 -1.370787
D -1.960716 -0.097805 -1.367424 -0.899680 -2.058521
E  1.235248 -0.135664 -0.405546  0.155646  1.099585

df.drop('S', axis= 1, inplace=True)
print(df)
          W         X         Y         Z
A -1.560803  0.230437 -0.202898  0.327163
B -1.081210  0.208696  0.649539 -1.350474
C -1.633450  0.262663 -1.367750  0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E  1.235248 -0.135664 -0.405546  0.155646

print(df.loc['E'])
W    1.235248
X   -0.135664
Y   -0.405546
Z    0.155646
Name: E, dtype: float64

print(df.iloc[4])
W    1.235248
X   -0.135664
Y   -0.405546
Z    0.155646
Name: E, dtype: float64

df.loc['F'] = [1,2,3,4]
print(df)
          W         X         Y         Z
A -1.560803  0.230437 -0.202898  0.327163
B -1.081210  0.208696  0.649539 -1.350474
C -1.633450  0.262663 -1.367750  0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E  1.235248 -0.135664 -0.405546  0.155646
F  1.000000  2.000000  3.000000  4.000000

df1 = pd.DataFrame([[5,6,7,8],[9,10,11,12]],columns=['W','X','Y','Z'])
df1
W X Y Z
0 5 6 7 8
1 9 10 11 12
df2 = df.append(df1, ignore_index=True)
df
W X Y Z
A -1.560803 0.230437 -0.202898 0.327163
B -1.081210 0.208696 0.649539 -1.350474
C -1.633450 0.262663 -1.367750 0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E 1.235248 -0.135664 -0.405546 0.155646
F 1.000000 2.000000 3.000000 4.000000
df2
W X Y Z
0 -1.560803 0.230437 -0.202898 0.327163
1 -1.081210 0.208696 0.649539 -1.350474
2 -1.633450 0.262663 -1.367750 0.284982
3 -1.960716 -0.097805 -1.367424 -0.899680
4 1.235248 -0.135664 -0.405546 0.155646
5 1.000000 2.000000 3.000000 4.000000
6 5.000000 6.000000 7.000000 8.000000
7 9.000000 10.000000 11.000000 12.000000
df.shape
(6, 4)
df.drop('F', axis=0, inplace= True)
print(df)
          W         X         Y         Z
A -1.560803  0.230437 -0.202898  0.327163
B -1.081210  0.208696  0.649539 -1.350474
C -1.633450  0.262663 -1.367750  0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E  1.235248 -0.135664 -0.405546  0.155646

print(df.loc['A','Y'])
-0.20289846684106513

print(df['Y'].loc['A'])
-0.20289846684106513

print(df.loc['A']['Y'])
-0.20289846684106513

print(df.loc[['A','B'],['Y','Z']])
          Y         Z
A -0.202898  0.327163
B  0.649539 -1.350474

print(df)
          W         X         Y         Z
A -1.560803  0.230437 -0.202898  0.327163
B -1.081210  0.208696  0.649539 -1.350474
C -1.633450  0.262663 -1.367750  0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E  1.235248 -0.135664 -0.405546  0.155646

bool_df = df > 0
bool_df
W X Y Z
A False True False True
B False True True False
C False True False True
D False False False False
E True False False True
df[bool_df]
W X Y Z
A NaN 0.230437 NaN 0.327163
B NaN 0.208696 0.649539 NaN
C NaN 0.262663 NaN 0.284982
D NaN NaN NaN NaN
E 1.235248 NaN NaN 0.155646
df[df>0]
W X Y Z
A NaN 0.230437 NaN 0.327163
B NaN 0.208696 0.649539 NaN
C NaN 0.262663 NaN 0.284982
D NaN NaN NaN NaN
E 1.235248 NaN NaN 0.155646
df['Y'] > 0
A    False
B     True
C    False
D    False
E    False
Name: Y, dtype: bool
df[df['Y']>0]
W X Y Z
B -1.08121 0.208696 0.649539 -1.350474
df['Y'][df['Y']>0]
B    0.649539
Name: Y, dtype: float64
result = df[df['X']>0]
result
W X Y Z
A -1.560803 0.230437 -0.202898 0.327163
B -1.081210 0.208696 0.649539 -1.350474
C -1.633450 0.262663 -1.367750 0.284982
result[result['Z'] > 0 ]
W X Y Z
A -1.560803 0.230437 -0.202898 0.327163
C -1.633450 0.262663 -1.367750 0.284982
result[result['Z'] > 0][['X','Y']]
X Y
A 0.230437 -0.202898
C 0.262663 -1.367750
(df['X'] > 0) & (df['Z']>0)
A     True
B    False
C     True
D    False
E    False
dtype: bool
print(df)
          W         X         Y         Z
A -1.560803  0.230437 -0.202898  0.327163
B -1.081210  0.208696  0.649539 -1.350474
C -1.633450  0.262663 -1.367750  0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E  1.235248 -0.135664 -0.405546  0.155646

df[(df['X'] > 0) & (df['Z']>0)]
W X Y Z
A -1.560803 0.230437 -0.202898 0.327163
C -1.633450 0.262663 -1.367750 0.284982
df[(df['X'] > 0) & (df['Z']>0)][['X','Z']]
X Z
A 0.230437 0.327163
C 0.262663 0.284982
df[['X','Z']][(df['X'] > 0) & (df['Z']>0)]
X Z
A 0.230437 0.327163
C 0.262663 0.284982
df
W X Y Z
A -1.560803 0.230437 -0.202898 0.327163
B -1.081210 0.208696 0.649539 -1.350474
C -1.633450 0.262663 -1.367750 0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E 1.235248 -0.135664 -0.405546 0.155646
df.reset_index()
index W X Y Z
0 A -1.560803 0.230437 -0.202898 0.327163
1 B -1.081210 0.208696 0.649539 -1.350474
2 C -1.633450 0.262663 -1.367750 0.284982
3 D -1.960716 -0.097805 -1.367424 -0.899680
4 E 1.235248 -0.135664 -0.405546 0.155646
df['States'] = ['IL', 'CA', 'TX', 'OH', 'FL']
df
W X Y Z States
A -1.560803 0.230437 -0.202898 0.327163 IL
B -1.081210 0.208696 0.649539 -1.350474 CA
C -1.633450 0.262663 -1.367750 0.284982 TX
D -1.960716 -0.097805 -1.367424 -0.899680 OH
E 1.235248 -0.135664 -0.405546 0.155646 FL
df2 = df.set_index('States')
print(df2)
               W         X         Y         Z
States                                        
IL     -1.560803  0.230437 -0.202898  0.327163
CA     -1.081210  0.208696  0.649539 -1.350474
TX     -1.633450  0.262663 -1.367750  0.284982
OH     -1.960716 -0.097805 -1.367424 -0.899680
FL      1.235248 -0.135664 -0.405546  0.155646

df2.loc['FL']
W    1.235248
X   -0.135664
Y   -0.405546
Z    0.155646
Name: FL, dtype: float64