0%

pandas 入门

第1章 Series()

1.1Series的创建

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import numpy as np
import pandas as pd
##numpy
a=np.arange(16)
se=pd.Series(a,index=pd.date_range('20191012',periods=16))
print(se)
print(se['20191024'])
## python dict
dic={'name':'chendong','age':22,'gender':'男'}
se1=pd.Series(dic)
print(se1)
## python list
lis=['hello','world',1,2,3]
se2=pd.Series(lis)
print(se2)
# 标量值
se3=pd.Series(3,name='hw')
print(se3)

1.2 Series的索引

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import pandas as pd
import numpy as np
a=np.arange(16)
se=pd.Series(a,index=pd.date_range('20191012',periods=16))
print(se)
print('---------------')
print(se[0]) #0
print(se[1]) #1
print(se[2]) #2
print('----------------')
print(se[0:2]) ## 包括index
print('----------------')
print(se.values) #<class 'numpy.ndarray'>
print('----------------')
print(type(se.values))
b=se.values.reshape(4,4)
print(b)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import pandas as pd
import numpy as np
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
print(s)
print(type(s))
print('-----------')
print(s.values)
print('-----------')
print(type(s.values)) ## <class 'numpy.ndarray'>
##将s的类型<class 'pandas.core.series.Series'>转化为<class 'numpy.ndarray'>
a=s.to_numpy()
print(a)
print(type(a))
print(s.index)
#索引和切片
print(s[0])
print(s[:3])
#多个索引
print(s[[4,3,1]])
#布尔选择
print(s[s>s.median()])

print(s.array)
print(type(s.array)) #<class 'pandas.core.arrays.numpy_.PandasArray'>
arr=s.array
for i in range(len(arr)):
print(arr[i])

第2章 DataFrame的增删查改

2.1.DataFrame的创建

2.1.1 由python的列表、字典以及numpy创建

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import numpy as np
import pandas as pd

## 由python list创建
lis=[[1,2,3,4],[5,6,7,8],[9,10,11,12],[13,14,15,16]]
df=pd.DataFrame(lis)

## 由python 字典创建
dic={'A':[1,2,3],'B':[4,5,6],'C':[7,8,9]}
df1=pd.Dataframe(dic,index=list('abc))

## 由numpy创建
arr=np.arange(20).reshape(5,4)
df2=pd.DataFrame(arr)
df3=pd.DataFrame(np.random.rand(6,4),index=dates,columns=['col1','col2','col3','col4'])

2.1.2 由Series创建

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import numpy as np
import pandas as pd
s1=pd.Series([1,3,4,5,6,8],index=list('ABCDEF'),dtype='int32')
print(s1)
s2=pd.Series([1,3,4,np.nan,6,8],index=list('ABCDEF'))
print(s2)
df=pd.DataFrame([s1,s2])
print(df.T)

s3=pd.Series(np.array([1,2,3,4,5]),index=pd.date_range('20191011',periods=5))
s4=pd.Series(np.array([7,8,9,10,11]),index=pd.date_range('20191011',periods=5))
print(s3)
print(s4)
df2=pd.DataFrame([s3,s4])
print(df2.T)

小结:由字典和Series来创建DataFrame,他们的key都变为DataFrame的列属性

2.1.3 从excel表格和csv文件读取创建

1
2
3
4
5
6
7
8
9
10
#从excel表格中读取多张表格,然后从多张表格中筛选需要的数据
import pandas as pd
xsls=pd.read_excel(r'C:\Users\chend\Documents\Tencent Files\2570808391\FileRecv\学生导师双选结果.xlsx',header=0,sheet_name=None)#
table=list(xsls.values())
for i in range(len(table)):
print(type(table[i]))
df=table[i]
print(df.loc[df['拟定导师']=='李伟'])

#读取的excel文件的数据类型为 <class 'collections.OrderedDict'>(即xsls为OrderedDict类型,其中的多张表格类型为DataFrame,为了方便处理,我们先要将xsls转换为list类型
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#从CSV文件中读数据,直接用python的open来读文件,用pd.read_csv()来读取可能会出现编码错误。
import pandas as pd
with open(r'C:\Users\chend\Documents\Tencent Files\2570808391\FileRecv\爱奇艺视频数据.csv') as f:
txt=f.readlines()
print(type(txt)) #class list
pd.set_option('display.max_colwidth',500)
df=pd.DataFrame(txt)
print(df)

#利用df.to_css()写入csv文件和pd.read_csv()读取csv文件
data=[[1,2,3],[4,5,6],[7,8,9]]
df=pd.DataFrame(data,index=list('ABC'),columns=list('abc'))
df.to_csv('data.cvs')
df=pd.read_csv('data.cvs')
print(df.values,type(df.values))
crr=df.values
print(crr)
drr=crr[:,1:]
print(drr)
df1=pd.DataFrame(drr,index=list('ABC'),columns=list('abc'))
print(df1)

2.2 DataFrame的选择

2.2.1 按标签选择

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import pandas as pd
import numpy as np
dates=pd.date_range('20191012',periods=6)
df=pd.DataFrame(np.random.rand(6,4),index=dates,columns=['col1','col2','col3','col4'])
print(df)
#按标签选择
print('按标签选择')
#loc[,] 参数可以为单个标签,多个标签,切片,布尔值

#通过行索引选择行
print(df.loc['20191012'])

#通过列索引选择列
print(df.loc[:,['col1','col2']])

#通过多个标签来选择
print(df.loc[pd.date_range('20191012',periods=2)])
print('-------------------')
print(df.loc[pd.date_range('20191012',periods=2),['col3','col4']])

#通过切片来选择
print(df.loc['20191012':'20191014',['col3','col4']])

#通过布尔列表选择
print(df.loc[[False,True]])

#通过布尔值选择
print(df.loc[df['col4']>0.5000]) #这种选择很重要

#通过lambda函数选择
print(df.loc[lambda df: df['col4']>0.500])

#通过布尔值和多索引来选择
print(df.loc[df['col4']>0.40,['col3','col4']])

df1 = pd.DataFrame({'a': [0, 1, 2, 3, 4, 5, 6, 7],
'b': [5, 5, 2, 2, 5, 5, 2, 2],
'c': [0, 0, 0, 0, 0, 0, 0, 0]})
mask = (df1['a'] < 7) & (df1['b'] == 2)
print(df1.loc[mask,'c'].iloc[0:3])

pandas.DataFrame.loc[]功能强大,本质就是先对df的行依次遍历,返回True或者False,留下True,最后再选择要显示的列

2.2.2 按位置选择

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import pandas as pd
import numpy as np
dates=pd.date_range('20191012',periods=6)
df=pd.DataFrame(np.random.rand(6,4),index=dates,columns=['col1','col2','col3','col4'])
print(df)
print('按位置选择')

#单个位置
print(df.iloc[0])
print(df.iloc[1,1])

#切片
print(df.iloc[0:2])
print(df.iloc[0:2,0:2])

#多个位置列表
print(df.iloc[[0,1,2]])
print(df.iloc[[0,2,2],[0,1,2]])

#按布尔值选择
print(df.iloc[[True,False,True,False,True],[1,2]])

#通过lambda函数选择
print(df.iloc[lambda x: x.index=='20191014'])

注意:pandas.DataFrame.iloc[]不支持df.iloc[df[‘col4’]>0.4]这样的形式

2.2.3 按布尔选择

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import numpy as np
import pandas as pd
dic={'A':[1,2,3,4,5,6],
'B':[2,3,4,5,6,7],
'C':[3,4,5,6,7,8]}
df1=pd.DataFrame(dic,index=list('abcdef'))
print(df1)
df1['D']=['one','two','three','four','five','six']
print('按布尔索引')
print(df1)
print('====================')
print(df1[df1['C']>5])
print(df1[df1['D'].isin(['one','two'])])
print('---------------')
print(df1[df1['C'].isin([3,4])])

2.3 DataFrame的赋值(增加)

2.3.1 DataFrame按行按列增加

1
2
3
4
5
6
7
8
9
10
11
12
13
import numpy as np
import pandas as pd
dic={'A':[1,2,3,4,5,6],
'B':[2,3,4,5,6,7],
'C':[3,4,5,6,7,8]}
df1=pd.DataFrame(dic,index=list('abcdef'))
print(df1)
#按行增加
df1.loc['g']=[7,8,9]
print(df1)

#按列增加
df['D']=list('listlil')

2.3.2 两个或者多个DataFrame的合并

1
2
3
4
5
6
7
#pd.concat()
import pandas as pd
import numpy as np
df=pd.DataFrame(np.random.randn(10,4))
df1=pd.DataFrame(np.random.randn(2,4))
df2=pd.DataFrame(np.random.randn(2,4))
pf=pd.concat([df,df1,df2],ignore_index=True)

2.3.3 DataFrame的追加

1
2
3
4
5
6
7
8
9
10
11
12
13
#pd.append()
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
s=df.iloc[3]
pf=df.append(s,ignore_index=True)
print(df)
print(pf)

##一次性append多行
s1=df.iloc[3:]
pf=df.append(s1,ignore_index=True)
print(pf)

2.3.4 SQL凤格的DataFrame合并

1
2
3
4
5
6
7
8
9
10
11
12
13
14
#pd.merge() 
import pandas as pd
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2],'olval':[100,101]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

pp=pd.merge(left,right,on='key',how='outer') #相当于数据库的外连接
pp1=pd.merge(left,right,on='key',how='inner') # 两个key的交集
pp2=pd.merge(left,right,on='key',how='left')#相当于数据库的左连接
pp3=pd.merge(left,right,on='key',how='right')#相当于数据库的右连接

print(pp)
print(pp1)
print(pp2)
print(pp3)

2.4 DataFrame的修改(更新)

1
2
3
4
5
6
7
8
9
10
11
12
13
#通过标签修改
dic={'A':[1,2,3,4,5,6],
'B':[2,3,4,5,6,7],
'C':[3,4,5,6,7,8]}
df1=pd.DataFrame(dic,index=list('abcdef'))
df1.loc['a']=[100,101,102,103]
print(df1)
df1.loc['a','A']=1000
print(df1)

#通过位置修改
df1.iloc[0]=[200,201,202,np.nan]
print(df1)

2.5 DataFrame的删除

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#del
#drop()

dic={'A':[1,2,3,4,5,6],
'B':[2,3,4,5,6,7],
'C':[3,4,5,6,7,8]}
df1=pd.DataFrame(dic,index=list('abcdef'))
df1['D']=list('listlil')
#删除 整列
del df1['D']
df1=df1.drop('C',axis=1)
print(df1)
#删除 整行
df1=df1.drop(['g'],axis=0)
print(df1)

#多行多列删除
df1=df1.drop(['C','D'],axis=1) #多列删除
df1=df1.drop(['a','b'],axis=0) #多行删除

2.6 空值nan的处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
dic={'A':[1,2,3,4,5,6],
'B':[2,3,4,5,6,7],
'C':[3,4,5,6,7,8]}
df1=pd.DataFrame(dic,index=list('abcdef'))

df1['D']=['one','two','three',np.nan,'five',np.nan]
print(df1)
#删除所有的nan所在的行
df1=df1.dropna(how='any',axis=0)
print(df1)
#删除所有nans所在的列
df1=df1.dropna(how='any',axis=1)
print(df1)

#将nan填充为默认值
df1=df1.fillna('ok')
print(df1)

第3章 pandas常见的统计量

3.1 groupby

1
2
3
4
5
6
7
8
9
10
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)})
print(df)
print(df.groupby('A').sum())
print(df.groupby('B').sum())
print(df.groupby(['A','B']).sum())

3.2 描述性统计

1
2
3
4
5
mean()
std()
median()
mode() 众数
idxmax()/idxmin() 最大/小值索引

3.3 排序

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
df = pd.DataFrame({
'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
print(df)


#按列里的值排序
df1 = pd.DataFrame({'one': [2, 1, 1, 1],
'two': [1, 3, 2, 4],
'three': [5, 4, 3, 2]})
print(df1)
print(df1.sort_values(by='two'))

print(df1.sort_values(by=['one','two']))

其他排序:#按索引标签排序 #按两种方式混合排序

```

参考资料

https://www.pypandas.cn/docs/getting_started/dsintro.html#%E6%95%B0%E6%8D%AE%E5%B8%A7
https://www.pypandas.cn/docs/getting_started/basics.html#%E4%BA%8C%E8%BF%9B%E5%88%B6%E6%93%8D%E4%BD%9C