Pandas - DataFrames

Create a dataframe using random numbers, index and columns

select the values of a particular column
Select the values of multiple columns
Check the type of columns
Create a new column (sum of two columns)
Remove the column using axis -- drop
remove the row using axis -- drop
inplace parameter
shape of dataframe
Select the values of a row -- loc and iloc
Select a particular value from the dataframe -- [row, column]
select the values of particular rows from the specific columns

Conditional selection -- boolean values and real values
conditional selection on dataframe
conditional selection on particular column
select values of a particular column after conditional selection
Select the data based on two conditions
pandas operators (&, | )
reset_index()
set_index()

import numpy as np
import pandas as pd

from numpy.random import randn

randn(1)

array([1.05063801])

df = pd.DataFrame(data = randn(5,4), index = ['A','B','C','D','E'], columns=['W','X','Y','Z'])

print(df)

          W         X         Y         Z
A -1.560803  0.230437 -0.202898  0.327163
B -1.081210  0.208696  0.649539 -1.350474
C -1.633450  0.262663 -1.367750  0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E  1.235248 -0.135664 -0.405546  0.155646

print(df['Y'])

A   -0.202898
B    0.649539
C   -1.367750
D   -1.367424
E   -0.405546
Name: Y, dtype: float64

print(df[['Y','Z']])

          Y         Z
A -0.202898  0.327163
B  0.649539 -1.350474
C -1.367750  0.284982
D -1.367424 -0.899680
E -0.405546  0.155646

type(df['Y'])

pandas.core.series.Series

type(df)

pandas.core.frame.DataFrame

df['S'] = df['W'] + df['X']

print(df)

          W         X         Y         Z         S
A -1.560803  0.230437 -0.202898  0.327163 -1.330365
B -1.081210  0.208696  0.649539 -1.350474 -0.872514
C -1.633450  0.262663 -1.367750  0.284982 -1.370787
D -1.960716 -0.097805 -1.367424 -0.899680 -2.058521
E  1.235248 -0.135664 -0.405546  0.155646  1.099585

df['T'] = [1,2,3,4,5]

print(df)

          W         X         Y         Z         S  T
A -1.560803  0.230437 -0.202898  0.327163 -1.330365  1
B -1.081210  0.208696  0.649539 -1.350474 -0.872514  2
C -1.633450  0.262663 -1.367750  0.284982 -1.370787  3
D -1.960716 -0.097805 -1.367424 -0.899680 -2.058521  4
E  1.235248 -0.135664 -0.405546  0.155646  1.099585  5

del df['T']

print(df)

          W         X         Y         Z         S
A -1.560803  0.230437 -0.202898  0.327163 -1.330365
B -1.081210  0.208696  0.649539 -1.350474 -0.872514
C -1.633450  0.262663 -1.367750  0.284982 -1.370787
D -1.960716 -0.097805 -1.367424 -0.899680 -2.058521
E  1.235248 -0.135664 -0.405546  0.155646  1.099585

df.drop('S', axis= 1)

	W	X	Y	Z
A	-1.560803	0.230437	-0.202898	0.327163
B	-1.081210	0.208696	0.649539	-1.350474
C	-1.633450	0.262663	-1.367750	0.284982
D	-1.960716	-0.097805	-1.367424	-0.899680
E	1.235248	-0.135664	-0.405546	0.155646

print(df)

          W         X         Y         Z         S
A -1.560803  0.230437 -0.202898  0.327163 -1.330365
B -1.081210  0.208696  0.649539 -1.350474 -0.872514
C -1.633450  0.262663 -1.367750  0.284982 -1.370787
D -1.960716 -0.097805 -1.367424 -0.899680 -2.058521
E  1.235248 -0.135664 -0.405546  0.155646  1.099585

df.drop('S', axis= 1, inplace=True)

print(df)

          W         X         Y         Z
A -1.560803  0.230437 -0.202898  0.327163
B -1.081210  0.208696  0.649539 -1.350474
C -1.633450  0.262663 -1.367750  0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E  1.235248 -0.135664 -0.405546  0.155646

print(df.loc['E'])

W    1.235248
X   -0.135664
Y   -0.405546
Z    0.155646
Name: E, dtype: float64

print(df.iloc[4])

W    1.235248
X   -0.135664
Y   -0.405546
Z    0.155646
Name: E, dtype: float64

df.loc['F'] = [1,2,3,4]

print(df)

          W         X         Y         Z
A -1.560803  0.230437 -0.202898  0.327163
B -1.081210  0.208696  0.649539 -1.350474
C -1.633450  0.262663 -1.367750  0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E  1.235248 -0.135664 -0.405546  0.155646
F  1.000000  2.000000  3.000000  4.000000

df1 = pd.DataFrame([[5,6,7,8],[9,10,11,12]],columns=['W','X','Y','Z'])

df1

	W	X	Y	Z
0	5	6	7	8
1	9	10	11	12

df2 = df.append(df1, ignore_index=True)

df

	W	X	Y	Z
A	-1.560803	0.230437	-0.202898	0.327163
B	-1.081210	0.208696	0.649539	-1.350474
C	-1.633450	0.262663	-1.367750	0.284982
D	-1.960716	-0.097805	-1.367424	-0.899680
E	1.235248	-0.135664	-0.405546	0.155646
F	1.000000	2.000000	3.000000	4.000000

df2

	W	X	Y	Z
0	-1.560803	0.230437	-0.202898	0.327163
1	-1.081210	0.208696	0.649539	-1.350474
2	-1.633450	0.262663	-1.367750	0.284982
3	-1.960716	-0.097805	-1.367424	-0.899680
4	1.235248	-0.135664	-0.405546	0.155646
5	1.000000	2.000000	3.000000	4.000000
6	5.000000	6.000000	7.000000	8.000000
7	9.000000	10.000000	11.000000	12.000000

df.shape

(6, 4)

df.drop('F', axis=0, inplace= True)

print(df)

          W         X         Y         Z
A -1.560803  0.230437 -0.202898  0.327163
B -1.081210  0.208696  0.649539 -1.350474
C -1.633450  0.262663 -1.367750  0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E  1.235248 -0.135664 -0.405546  0.155646

print(df.loc['A','Y'])

-0.20289846684106513

print(df['Y'].loc['A'])

-0.20289846684106513

print(df.loc['A']['Y'])

-0.20289846684106513

print(df.loc[['A','B'],['Y','Z']])

          Y         Z
A -0.202898  0.327163
B  0.649539 -1.350474

print(df)

          W         X         Y         Z
A -1.560803  0.230437 -0.202898  0.327163
B -1.081210  0.208696  0.649539 -1.350474
C -1.633450  0.262663 -1.367750  0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E  1.235248 -0.135664 -0.405546  0.155646

bool_df = df > 0

bool_df

	W	X	Y	Z
A	False	True	False	True
B	False	True	True	False
C	False	True	False	True
D	False	False	False	False
E	True	False	False	True

df[bool_df]

	W	X	Y	Z
A	NaN	0.230437	NaN	0.327163
B	NaN	0.208696	0.649539	NaN
C	NaN	0.262663	NaN	0.284982
D	NaN	NaN	NaN	NaN
E	1.235248	NaN	NaN	0.155646

df[df>0]

	W	X	Y	Z
A	NaN	0.230437	NaN	0.327163
B	NaN	0.208696	0.649539	NaN
C	NaN	0.262663	NaN	0.284982
D	NaN	NaN	NaN	NaN
E	1.235248	NaN	NaN	0.155646

df['Y'] > 0

A    False
B     True
C    False
D    False
E    False
Name: Y, dtype: bool

df[df['Y']>0]

	W	X	Y	Z
B	-1.08121	0.208696	0.649539	-1.350474

df['Y'][df['Y']>0]

B    0.649539
Name: Y, dtype: float64

result = df[df['X']>0]

result

	W	X	Y	Z
A	-1.560803	0.230437	-0.202898	0.327163
B	-1.081210	0.208696	0.649539	-1.350474
C	-1.633450	0.262663	-1.367750	0.284982

result[result['Z'] > 0 ]

	W	X	Y	Z
A	-1.560803	0.230437	-0.202898	0.327163
C	-1.633450	0.262663	-1.367750	0.284982

result[result['Z'] > 0][['X','Y']]

	X	Y
A	0.230437	-0.202898
C	0.262663	-1.367750

(df['X'] > 0) & (df['Z']>0)

A     True
B    False
C     True
D    False
E    False
dtype: bool

print(df)

          W         X         Y         Z
A -1.560803  0.230437 -0.202898  0.327163
B -1.081210  0.208696  0.649539 -1.350474
C -1.633450  0.262663 -1.367750  0.284982
D -1.960716 -0.097805 -1.367424 -0.899680
E  1.235248 -0.135664 -0.405546  0.155646

df[(df['X'] > 0) & (df['Z']>0)]

	W	X	Y	Z
A	-1.560803	0.230437	-0.202898	0.327163
C	-1.633450	0.262663	-1.367750	0.284982

df[(df['X'] > 0) & (df['Z']>0)][['X','Z']]

	X	Z
A	0.230437	0.327163
C	0.262663	0.284982

df[['X','Z']][(df['X'] > 0) & (df['Z']>0)]

	X	Z
A	0.230437	0.327163
C	0.262663	0.284982

df

	W	X	Y	Z
A	-1.560803	0.230437	-0.202898	0.327163
B	-1.081210	0.208696	0.649539	-1.350474
C	-1.633450	0.262663	-1.367750	0.284982
D	-1.960716	-0.097805	-1.367424	-0.899680
E	1.235248	-0.135664	-0.405546	0.155646

df.reset_index()

	index	W	X	Y	Z
0	A	-1.560803	0.230437	-0.202898	0.327163
1	B	-1.081210	0.208696	0.649539	-1.350474
2	C	-1.633450	0.262663	-1.367750	0.284982
3	D	-1.960716	-0.097805	-1.367424	-0.899680
4	E	1.235248	-0.135664	-0.405546	0.155646

df['States'] = ['IL', 'CA', 'TX', 'OH', 'FL']

df

	W	X	Y	Z	States
A	-1.560803	0.230437	-0.202898	0.327163	IL
B	-1.081210	0.208696	0.649539	-1.350474	CA
C	-1.633450	0.262663	-1.367750	0.284982	TX
D	-1.960716	-0.097805	-1.367424	-0.899680	OH
E	1.235248	-0.135664	-0.405546	0.155646	FL

df2 = df.set_index('States')

print(df2)

               W         X         Y         Z
States                                        
IL     -1.560803  0.230437 -0.202898  0.327163
CA     -1.081210  0.208696  0.649539 -1.350474
TX     -1.633450  0.262663 -1.367750  0.284982
OH     -1.960716 -0.097805 -1.367424 -0.899680
FL      1.235248 -0.135664 -0.405546  0.155646

df2.loc['FL']

W    1.235248
X   -0.135664
Y   -0.405546
Z    0.155646
Name: FL, dtype: float64