pandas 是python中的一个数据分析库，填补了python在数据分析中的不足，同 numpy 、matplotlib一起为python的小规模数据分析提供强大的支持。

**数据分析的一般步骤**

数据整理和清洗
数据分析与建模
数据可视化和制表

pandas 在数据的处理方面表现卓越.

数据结构

series

series （系列），在pandas中可以表示所有的由一维数据组成的数据结构。

import numpy as np
import pandas as pd

l = [0,1,7,9,np.nan,None,1024,512]
# Both NumPy's nan and Python's None are treated by pandas as the missing value NaN.
# (np.nan is used because the np.NAN alias no longer exists in NumPy 2.0.)
s1 = pd.Series(data = l) # pandas adds a default integer index automatically
s2 = pd.Series(data = l,index = list('abcdefhi'),dtype='float32') # explicit row index and dtype
# Built from a dict: the keys become the row index
s3 = pd.Series(data = {'a':99,'b':137,'c':149},name = 'Python_score')
display(s1,s2,s3)

DataFrame

与series 不同，代表所有的二维数组结构的数据结构。

import numpy as np
import pandas as pd

# The dict keys become the column labels and `index` supplies the row labels,
# producing a 3x3 table.
df1 = pd.DataFrame(
    {
        'Python': [99, 107, 122],
        'Math': [111, 137, 88],
        'En': [68, 108, 43],
    },
    index=['张三', '李四', 'Michael'],
)

# A 5x3 table of random integers in [0, 151) with explicit row and column labels.
df2 = pd.DataFrame(
    np.random.randint(0, 151, size=(5, 3)),
    index=['Danial', 'Brandon', 'softpo', 'Ella', 'Cindy'],
    columns=['Python', 'Math', 'En'],
)

常用的函数和方法：

# Inspect attributes, previews and summary statistics
df.head(10) # first 10 rows (default is 5)
df.tail(10) # last 10 rows (default is 5)
df.shape # shape: number of rows and columns
df.dtypes # data types
df.index # row index
df.columns # column index
df.values # the values as a 2-D ndarray
df.describe() # summary statistics of numeric columns: count, mean, std, min, quartiles, max
df.info() # column index, data types, non-null counts and memory information

数据加载与数据导出

csv

import numpy as np
import pandas as pd
df = pd.DataFrame(data = np.random.randint(0,50,size = [50,5]), # salary data
                  columns=['IT','化⼯','⽣物','教师','⼠兵'])
# Save to ./salary.csv in the current directory (CSV = comma-separated values)
df.to_csv('./salary.csv',
          sep = ';', # field separator; the default is a comma
          header = True, # write the column labels
          index = True) # write the row index; when the file is loaded it comes back as an ordinary column by default
# Load
pd.read_csv('./salary.csv',
            sep = ';', # the default is a comma
            header = [0], # row holding the column labels
            index_col=0) # column to use as the row index
pd.read_table('./salary.csv', # like read_csv: reads a delimited text file
              sep = ';',
              header = [0], # row holding the column labels
              index_col=1) # column to use as the row index: 'IT'

excel

import numpy as np
import pandas as pd
df1 = pd.DataFrame(data = np.random.randint(0,50,size = [50,5]), # salary data
                   columns=['IT','化⼯','⽣物','教师','⼠兵'])
df2 = pd.DataFrame(data = np.random.randint(0,50,size = [150,3]), # computer-science exam scores
                   columns=['Python','Tensorflow','Keras'])
# Save to ./salary.xlsx in the current directory.
# The legacy .xls format cannot be written by pandas >= 2.0 (the xlwt writer
# was removed), so the .xlsx format is used instead.
df1.to_excel('./salary.xlsx',
             sheet_name = 'salary', # name of the worksheet
             header = True, # write the column labels
             index = False) # do not write the row index
pd.read_excel('./salary.xlsx',
              sheet_name=0, # which worksheet to read; the default is the first one
              header = 0, # use the first row as the column labels
              names = list('ABCDE'), # replace the column labels
              index_col=1) # column to use as the row index: 'B'
# Store several worksheets in one Excel file
with pd.ExcelWriter('./data.xlsx') as writer:
    df1.to_excel(writer,sheet_name='salary',index = False)
    df2.to_excel(writer,sheet_name='score',index = False)
pd.read_excel('./data.xlsx',
              sheet_name='salary') # read the worksheet with the given name

数据获取

import pandas as pd
import numpy as np
df = pd.DataFrame(data = np.random.randint(0,150,size = [150,3]), # computer-science exam scores
                  columns=['Python','Tensorflow','Keras'])
df['Python'] # single column -> Series
df.Python # single column via attribute access -> Series
df[['Python','Keras']] # several columns -> DataFrame
df[3:15] # row slice

按标签获取

import pandas as pd
import numpy as np
df = pd.DataFrame(data = np.random.randint(0,150,size = [10,3]),# computer-science exam scores
index = list('ABCDEFGHIJ'),# row labels
columns=['Python','Tensorflow','Keras'])
df.loc[['A','C','D','F']] # select the rows with the given labels
df.loc['A':'E',['Python','Keras']] # slice rows by label and pick the listed columns
df.loc[:,['Keras','Tensorflow']] # a bare ':' keeps every row
df.loc['E'::2,'Python':'Tensorflow'] # rows from label 'E' onward taking every 2nd one; columns sliced by label
df.loc['A','Python'] # select a single scalar value

位置选择

df.iloc[4] # select by integer position
df.iloc[2:8,0:2] # integer slices, NumPy-style
df.iloc[[1,3,5],[0,2,1]] # select by lists of integer positions
df.iloc[1:3,:] # row slice
df.iloc[:,:2] # column slice
df.iloc[0,2] # select a single scalar value

条件筛选

import pandas as pd
import numpy as np
df = pd.DataFrame(data = np.random.randint(0,150,size = [10,3]),# computer-science exam scores
index = list('ABCDEFGHIJ'),# row labels (users)
columns=['Python','Tensorflow','Keras']) # exam subjects
cond1 = df.Python > 100 # is the Python score greater than 100? The result is a boolean Series
df[cond1] # all subjects for the users whose Python score is above 100
cond2 = (df.Python > 50) & (df['Keras'] > 50) # & is the AND operation
df[cond2] # all subjects for the users with both Python and Keras above 50
df[df > 50]# keep the values that satisfy the condition; the others become NaN
df[df.index.isin(['A','C','F'])] # isin tests membership in the list and also yields booleans

赋值操作

import pandas as pd
import numpy as np
df = pd.DataFrame(data = np.random.randint(0,150,size = [10,3]), # computer-science exam scores
                  index = list('ABCDEFGHIJ'), # row labels (users)
                  columns=['Python','Tensorflow','Keras']) # exam subjects
print("二维数据：",df)
s = pd.Series(data = np.random.randint(0,150,size = 9),
              index=list('BCDEFGHIJ'),name = 'PyTorch')
print("一维数据：",s) # label corrected from "一位数据"; s is a one-dimensional Series
df['PyTorch'] = s # add a column; rows are aligned on the index, so 'A' (absent from s) becomes NaN
df.loc['A','Python'] = 256 # assign by label
df.iloc[3,2] = 512 # assign by position
df.loc[:,'Python'] = np.array([128]*10) # assign a whole column from a NumPy array
df[df >= 128] = -df # conditional assignment: values >= 128 are negated, the rest are unchanged

二维数据：
    Python  Tensorflow  Keras
A      99          96     55
B      45          44    142
C     105          82     49
D     128         113    132
E      76           1     35
F     137          19      7
G      81          77    117
H     136          48    111
I      15          60    109
J      69         126     50
一维数据
 B    135
C     79
D     65
E     81
F    100
G    139
H     27
I     15
J    121
Name: PyTorch, dtype: int32
结果1    Python  Tensorflow  Keras  PyTorch
A     128          96     55      NaN
B     128          44    142    135.0
C     128          82     49     79.0
D     128         113    512     65.0
E     128           1     35     81.0
F     128          19      7    100.0
G     128          77    117    139.0
H     128          48    111     27.0
I     128          60    109     15.0
J     128         126     50    121.0
结果2    Python  Tensorflow  Keras  PyTorch
A    -128          96     55      NaN
B    -128          44   -142   -135.0
C    -128          82     49     79.0
D    -128         113   -512     65.0
E    -128           1     35     81.0
F    -128          19      7    100.0
G    -128          77    117   -139.0
H    -128          48    111     27.0
I    -128          60    109     15.0
J    -128         126     50    121.0

数据集成

数据串联


import pandas as pd
import numpy as np
df1 = pd.DataFrame(data = np.random.randint(0,150,size = [10,3]),# computer-science exam scores
    index = list('ABCDEFGHIJ'),# row labels (users)
    columns=['Python','Tensorflow','Keras']) # exam subjects
print('df1\n',df1)
df2 = pd.DataFrame(data = np.random.randint(0,150,size = [10,3]),# computer-science exam scores
    index = list('KLMNOPQRST'),# row labels (users)
    columns=['Python','Tensorflow','Keras']) # exam subjects
print('df2\n',df2)
df3 = pd.DataFrame(data = np.random.randint(0,150,size = (10,2)),index = list('ABCDEFGHIJ'),columns=['PyTorch','Paddle'])
print('df3\n',df3)
r1 = pd.concat([df1,df2],axis = 0) # row-wise concatenation (axis 0): df2's rows are appended after df1's rows
print('结果1\n',r1)
# df1.append(df2) would append df2 after df1, but the method is deprecated in newer versions; concat is recommended instead
r2 = pd.concat([df1,df3],axis = 1) # column-wise concatenation (axis 1): df3's columns are appended after df1's columns
print('r2\n',r2)

df1
    Python  Tensorflow  Keras
A       6         137     75
B      73          39    103
C      53         118     57
D       8           5      3
E     109          65    102
F      63         136     60
G      70         111     96
H      81         144     50
I       5          64     82
J       6          36     67
df2
    Python  Tensorflow  Keras
K     120         146    141
L     139         104     67
M     123          54     18
N      10          86      3
O      40           4    115
P      95          56     93
Q       4          35      0
R     144          70    128
S      62          21      8
T      48         108    126
df3
    PyTorch  Paddle
A       31      71
B      117      31
C       34     144
D        3      56
E       12      18
F       33      36
G       91     139
H      140     109
I       49      95
J      125     142
结果1
    Python  Tensorflow  Keras
A       6         137     75
B      73          39    103
C      53         118     57
D       8           5      3
E     109          65    102
F      63         136     60
G      70         111     96
H      81         144     50
I       5          64     82
J       6          36     67
K     120         146    141
L     139         104     67
M     123          54     18
N      10          86      3
O      40           4    115
P      95          56     93
Q       4          35      0
R     144          70    128
S      62          21      8
T      48         108    126
r2
    Python  Tensorflow  Keras  PyTorch  Paddle
A       6         137     75       31      71
B      73          39    103      117      31
C      53         118     57       34     144
D       8           5      3        3      56
E     109          65    102       12      18
F      63         136     60       33      36
G      70         111     96       91     139
H      81         144     50      140     109
I       5          64     82       49      95
J       6          36     67      125     142

数据插入

import numpy as np
import pandas as pd
df = pd.DataFrame(data = np.random.randint(0,151,size = (10,3)),
index = list('ABCDEFGHIJ'),
columns = ['Python','Keras','Tensorflow'])
df.insert(loc = 1,column='Pytorch',value=1024) # insert a new column at position 1
print(df)
# Rows can only be added at the end; a position cannot be specified.
# NOTE(review): the original note suggests DataFrame.append for this, which newer pandas no longer recommends; prefer pd.concat and confirm against the pandas version in use.
# To insert a row at a given position: split the frame, add the row, then concatenate the pieces.

sql 式的拼接

import pandas as pd
import numpy as np

# Table 1 records name and weight
df1 = pd.DataFrame(data = {'name':['softpo','Daniel','Brandon','Ella'],'weight':[70,55,75,65]})

# Table 2 records name and height
df2 = pd.DataFrame(data = {'name':['softpo','Daniel','Brandon','Cindy'],'height':[172,170,170,166]})

df3 = pd.DataFrame(data = {'名字':['softpo','Daniel','Brandon','Cindy'],'height':[172,170,170,166]})

# Merge the two tables on their shared 'name' column
r1 = pd.merge(df1,df2,
        how = 'inner',# inner join: intersection of the two objects
        on = 'name')

print('r1\n',r1)

r2 = pd.merge(df1,df3,
        how = 'outer',# full outer join: union of the two objects
        left_on = 'name',# the left DataFrame joins on its 'name' column
        right_on = '名字')# the right DataFrame joins on its '名字' column
print('r2\n',r2)

# Exam scores for 10 students
df4 = pd.DataFrame(data = np.random.randint(0,151,size = (10,3)),
        index = list('ABCDEFHIJK'),
        columns=['Python','Keras','Tensorflow'])
print('df4\n',df4)
# Per-student mean over the subjects, converted to a DataFrame
score_mean = pd.DataFrame(df4.mean(axis = 1).round(1),columns=['平均分'])
print('score_mean',score_mean)
# Merge the means with df4; the two frames share the same row index
r3 = pd.merge(left = df4,right = score_mean,
        left_index=True,# join on the left DataFrame's row index
        right_index=True)# join on the right DataFrame's row index
print('r3\n',r3)

r1
       name  weight  height
0   softpo      70     172
1   Daniel      55     170
2  Brandon      75     170
r2
       name  weight       名字  height
0  Brandon    75.0  Brandon   170.0
1      NaN     NaN    Cindy   166.0
2   Daniel    55.0   Daniel   170.0
3     Ella    65.0      NaN     NaN
4   softpo    70.0   softpo   172.0
df4
    Python  Keras  Tensorflow
A      56     34         136
B     116    148         118
C      44    136         123
D      89      6          39
E       5    127          91
F       3    110         123
H     133     87         118
I     150      2          27
J       5    137          34
K      59     18         104
score_mean      平均分
A   75.3
B  127.3
C  101.0
D   44.7
E   74.3
F   78.7
H  112.7
I   59.7
J   58.7
K   60.3
r3
    Python  Keras  Tensorflow    平均分
A      56     34         136   75.3
B     116    148         118  127.3
C      44    136         123  101.0
D      89      6          39   44.7
E       5    127          91   74.3
F       3    110         123   78.7
H     133     87         118  112.7
I     150      2          27   59.7
J       5    137          34   58.7
K      59     18         104   60.3

数据清洗

import numpy as np
import pandas as pd

# Sample data with a duplicated row and missing values.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
df = pd.DataFrame(data = {'color':['red','blue','red','green','blue',None,'red'],'price':[10,20,10,15,20,0,np.nan]})
display(df)
# 1. Filtering duplicated data
df.duplicated() # check for duplicated rows
df.drop_duplicates() # drop the duplicated rows
# 2. Filtering missing data
df.isnull() # True where a value is missing, False otherwise
df.dropna(how = 'any') # drop missing data
df.fillna(value=1111) # fill missing data with 1111
# 3. Removing specific rows or columns
del df['color'] # delete a column directly
df.drop(labels = ['price'],axis = 1)# drop the listed columns
df.drop(labels = [0,1,5],axis = 0) # drop the listed rows
# 4. Using the filter method
df = pd.DataFrame(np.array(([3,7,1], [2, 8, 256])),
    index=['dog', 'cat'],
    columns=['China', 'America', 'France'])
df.filter(items=['China', 'France'])
# Select column labels with a regular expression
df.filter(regex='a$', axis=1)
# Select the rows whose label contains 'og'
df.filter(like='og', axis=0)
# 5. Outlier filtering
df2 = pd.DataFrame(data = np.random.randn(10000,3)) # normally distributed data
# 3-sigma rule (sigma is the standard deviation): a value is an outlier when its
# absolute deviation from the column mean exceeds 3 sigma. The absolute value is
# required so that outliers in the lower tail are caught as well.
cond = ((df2 - df2.mean()).abs() > 3*df2.std()).any(axis = 1)
index = df2[cond].index # row labels of the rows that contain an outlier
df2.drop(labels=index,axis = 0) # drop those rows by label

数据转化

轴和元素替换

import numpy as np
import pandas as pd

df = pd.DataFrame(data = np.random.randint(0,10,size = (10,3)),
index = list('ABCDEFHIJK'),
columns=['Python','Tensorflow','Keras'])
df.iloc[4,2] = None # 空数据
display(df)
#1、重命名轴索引 注意结果是返回值
r1 = df.rename(index = {'A':'AA','B':'BB'},columns = {'Python':'⼈⼯智能'})
display(r1)
# 2、替换值  注意结果是返回值
display(df.replace(3,1024)) #将3替换为1024  
df.replace([0,7],2048) # 将0和7替换为2048
df.replace({0:512,np.nan:998}) # 根据字典键值对进⾏替换
df.replace({'Python':2},-1024) # 将Python这⼀列中等于2的，替换为-1024

map Series 元素操作

import numpy as np
import pandas as pd
df = pd.DataFrame(data = np.random.randint(0,10,size = (10,3)),
index = list('ABCDEFHIJK'),
columns=['Python','Tensorflow','Keras'])
df.iloc[4,2] = None # 空数据
display(df)
# 1、 map批量元素改变， Series专有
r1 = df['Keras'].map({1:'Hello',5:'World',7:'AI'}) # 字典映射
print(r1)
df['Python'].map(lambda x:True if x >=5 else False) # 隐式函数映射
# Explicit (named) function used for the mapping
def convert(x):
    """Map *x* by its remainder modulo 3.

    Returns True when the remainder is 0, False when it is 1, and None for
    any other remainder.
    """
    remainder = x % 3
    if remainder == 0:
        return True
    if remainder == 1:
        return False
    return None
df['Tensorflow'].map(convert)

apply、map 对数据的操作

import numpy as np
import pandas as pd
df = pd.DataFrame(data = np.random.randint(0,10,size = (10,3)),
    index = list('ABCDEFHIJK'),
    columns=['Python','Tensorflow','Keras'])

df.iloc[4,2] = None # 空数据

display(df)

# 1、 apply 应⽤⽅法数据转换，通⽤
# Series，其中x是Series中元素
print(df['Keras'].apply(lambda x:True if x >5 else False))
# DataFrame，其中的x是DataFrame中列或者⾏，是Series

print(df.apply(lambda x : x.median(),axis = 0)) # 列的中位数


def convert(x): # user-defined function
    """Return ``(mean rounded to 1 decimal, count)`` computed from *x*."""
    rounded_mean = x.mean().round(1)
    n_values = x.count()
    return (rounded_mean, n_values)
    
print(df.apply(convert,axis = 1)) # ⾏平均值，计数

# 2. Element-wise mapping over a whole DataFrame (the original note calls this applymap; the code below uses DataFrame.map)
print(df.map(lambda x : x + 100)) # computed for every element of the DataFrame

Python	Tensorflow	Keras
A	9	0	3.0
B	8	5	4.0
C	6	5	5.0
D	9	6	6.0
E	1	7	NaN
F	3	3	8.0
H	1	0	6.0
I	6	1	5.0
J	0	2	9.0
K	0	2	7.0

A    False
B    False
C    False
D     True
E    False
F     True
H     True
I    False
J     True
K     True
Name: Keras, dtype: bool
Python        4.5
Tensorflow    2.5
Keras         6.0
dtype: float64
A    (4.0, 3)
B    (5.7, 3)
C    (5.3, 3)
D    (7.0, 3)
E    (4.0, 2)
F    (4.7, 3)
H    (2.3, 3)
I    (4.0, 3)
J    (3.7, 3)
K    (3.0, 3)
dtype: object
   Python  Tensorflow  Keras
A     109         100  103.0
B     108         105  104.0
C     106         105  105.0
D     109         106  106.0
E     101         107    NaN
F     103         103  108.0
H     101         100  106.0
I     106         101  105.0
J     100         102  109.0
K     100         102  107.0

transform

import numpy as np
import pandas as pd
df = pd.DataFrame(data = np.random.randint(0,10,size = (10,3)),
    index = list('ABCDEFHIJK'),
    columns=['Python','Tensorflow','Keras'])

df.iloc[4,2] = None # 空数据
display(df)
# 1、⼀列执⾏多项计算
display(df['Python'].transform([np.sqrt,np.exp])) # Series处理

def convert(x):
    """Scale *x* by 10 when its mean is above 5, otherwise by -10.

    A new object is returned; the input is left untouched. The original
    version used ``*=``, which modified the caller's data in place.
    """
    factor = 10 if x.mean() > 5 else -10
    return x * factor
    
# 2. Apply a different function to each column, given as a dict of column -> function
display(df.transform({
    'Python': convert,
    'Tensorflow': lambda x: x*10,  # multiply every value by 10
    'Keras': lambda x: x.fillna(x.mean())  # fill missing values with the column mean and return the result
}))

Python	Tensorflow	Keras
A	4	7	1.0
B	4	7	8.0
C	6	8	5.0
D	6	0	0.0
E	7	4	NaN
F	8	7	5.0
H	9	4	7.0
I	6	1	2.0
J	3	6	8.0
K	4	1	3.0

	sqrt	exp
A	2.000000	54.598150
B	2.000000	54.598150
C	2.449490	403.428793
D	2.449490	403.428793
E	2.645751	1096.633158
F	2.828427	2980.957987
H	3.000000	8103.083928
I	2.449490	403.428793
J	1.732051	20.085537
K	2.000000	54.598150

	Python	Tensorflow	Keras
A	40	70	1.000000
B	40	70	8.000000
C	60	80	5.000000
D	60	0	0.000000
E	70	40	4.333333
F	80	70	5.000000
H	90	40	7.000000
I	60	10	2.000000
J	30	60	8.000000
K	40	10	3.000000

随机、重排、抽样、独热编码

# 创建一个更复杂的数据框
df_complex = pd.DataFrame({
    'color': ['red', 'blue', 'red', 'green'],
    'size': ['S', 'M', 'L', 'M']
})

# 对所有列进行独热编码
print("原始数据：")
print(df_complex)
print("\n独热编码后：")
print(pd.get_dummies(df_complex))

Python	Tensorflow	Keras
A	8	6	7.0
B	0	1	1.0
C	5	6	9.0
D	8	8	6.0
E	5	6	NaN
F	6	9	2.0
H	5	3	0.0
I	4	3	7.0
J	2	2	1.0
K	0	0	4.0

	sqrt	exp
A	2.828427	2980.957987
B	0.000000	1.000000
C	2.236068	148.413159
D	2.828427	2980.957987
E	2.236068	148.413159
F	2.449490	403.428793
H	2.236068	148.413159
I	2.000000	54.598150
J	1.414214	7.389056
K	0.000000	1.000000

	Python	Tensorflow	Keras
A	-80	60	7.000000
B	0	10	1.000000
C	-50	60	9.000000
D	-80	80	6.000000
E	-50	60	4.111111
F	-60	90	2.000000
H	-50	30	0.000000
I	-40	30	7.000000
J	-20	20	1.000000
K	0	0	4.000000

# 创建一个更复杂的数据框
df_complex = pd.DataFrame({
    'color': ['red', 'blue', 'red', 'green'],
    'size': ['S', 'M', 'L', 'M']
})

# 对所有列进行独热编码
print("原始数据：")
print(df_complex)
print("\n独热编码后：")
print(pd.get_dummies(df_complex))

原始数据：
   color size
0    red    S
1   blue    M
2    red    L
3  green    M

独热编码后：
   color_blue  color_green  color_red  size_L  size_M  size_S
0       False        False       True   False   False    True
1        True        False      False   False    True   False
2       False        False       True    True   False   False
3       False         True      False   False    True   False

数据重塑

import numpy as np
import pandas as pd
df = pd.DataFrame(data = np.random.randint(0,100,size = (10,3)),
    index = list('ABCDEFHIJK'),
    columns=['Python','Tensorflow','Keras'])
display(df)
 
display(df.T)# 坐标轴横置
df2 = pd.DataFrame(data = np.random.randint(0,100,size = (20,3)),
    index = pd.MultiIndex.from_product([list('ABCDEFHIJK'),['期中','期末']]),#多层索引
    columns=['Python','Tensorflow','Keras'])
display(df2)
display(df2.unstack(level = -1)) # ⾏旋转成列， level指定哪⼀层，进⾏变换
df2.stack() # 列旋转成⾏
df2.stack().unstack(level = 1) # ⾏列互换
# 多层索引DataFrame数学计算
df2.mean() # 各学科平均分
df2.groupby(level=0).mean() # 各学科，每个⼈期中期末平均分
df2.groupby(level=1).mean() # 各学科，期中期末所有⼈平均分

Python	Tensorflow	Keras
A	35	37	16
B	83	25	20
C	90	96	0
D	65	93	10
E	14	9	93
F	10	32	87
H	20	72	18
I	73	20	0
J	36	4	67
K	77	64	5

	A	B	C	D	E	F	H	I	J	K
Python	35	83	90	65	14	10	20	73	36	77
Tensorflow	37	25	96	93	9	32	72	20	4	64
Keras	16	20	0	10	93	87	18	0	67	5

		Python	Tensorflow	Keras
A	期中	54	2	3
期末	84	6	31
B	期中	3	45	42
期末	34	18	63
C	期中	62	88	20
期末	1	12	13
D	期中	3	69	77
期末	0	52	8
E	期中	8	99	15
期末	86	79	96
F	期中	89	53	74
期末	62	75	93
H	期中	86	50	15
期末	98	29	21
I	期中	19	26	91
期末	85	5	52
J	期中	46	89	11
期末	87	72	62
K	期中	85	46	46
期末	6	86	14

	Python	Tensorflow	Keras
	期中	期末	期中	期末	期中	期末
A	54	84	2	6	3	31
B	3	34	45	18	42	63
C	62	1	88	12	20	13
D	3	0	69	52	77	8
E	8	86	99	79	15	96
F	89	62	53	75	74	93
H	86	98	50	29	15	21
I	19	85	26	5	91	52
J	46	87	89	72	11	62
K	85	6	46	86	46	14

[6]:

	Python	Tensorflow	Keras
期中	45.5	56.7	39.4
期末	54.3	43.4	45.3

数学&统计

统计

import numpy as np
import pandas as pd
df = pd.DataFrame(data = np.random.randint(0,100,size = (20,3)),
index = list('ABCDEFHIJKLMNOPQRSTU'),
columns=['Python','Tensorflow','Keras'])
# 1、简单统计指标
df.count() # ⾮NA值的数量
df.max(axis = 0) #轴0最⼤值，即每⼀列最⼤值
df.min() #默认计算轴0最⼩值
df.median() # 中位数
df.sum() # 求和
df.mean(axis = 1) #轴1平均值，即每⼀⾏的平均值
df.quantile(q = [0.2,0.4,0.8]) # 分位数
df.describe() # 查看数值型列的汇总统计,计数、平均值、标准差、最⼩值、四分位数、最⼤值


# 2、索引位置
df['Python'].argmin() # 计算最⼩值位置
df['Keras'].argmax() # 最⼤值位置
df.idxmax() # 最⼤值索引标签
df.idxmin() # 最⼩值索引标签

# 3、更多统计指标
df['Python'].value_counts() # 统计元素出现次数
df['Keras'].unique() # 去重
df.cumsum() # 累加
df.cumprod() # 累乘
df.std() # 标准差
df.var() # ⽅差
df.cummin() # 累计最⼩值
df.cummax() # 累计最⼤值
df.diff() # 计算差分
df.pct_change() # 计算百分⽐变化

# 4、⾼级统计指标
df.cov() # 属性的协⽅差
df['Python'].cov(df['Keras']) # Python和Keras的协⽅差
df.corr() # 所有属性相关性系数
df.corrwith(df['Tensorflow']) # 单⼀属性相关性系数

Python	Tensorflow	Keras
count	20.00000	20.000000	20.000000
mean	45.35000	51.350000	56.300000
std	31.77764	25.359053	33.010525
min	1.00000	15.000000	4.000000
25%	14.25000	25.500000	27.750000
50%	48.00000	53.500000	66.000000
75%	67.50000	63.750000	81.000000
max	99.00000	97.000000	98.000000

数据排序

import numpy as np
import pandas as pd
df = pd.DataFrame(data = np.random.randint(0,30,size = (30,3)),
index = list('qwertyuioijhgfcasdcvbnerfghjcf'),
columns = ['Python','Keras','Pytorch'])
# 1. Sorting by index / column names
df.sort_index(axis = 0,ascending=True) # sort by row index, ascending
df.sort_index(axis = 1,ascending=False) # sort by column name, descending
# 2. Sorting by values
df.sort_values(by = ['Python']) # sort by the Python column
df.sort_values(by = ['Python','Keras'])# sort by Python first, then by Keras
# 3. The n largest or n smallest values of a column
df.nlargest(10,columns='Keras') # order by Keras and return the 10 largest rows
df.nsmallest(5,columns='Python') # order by Python and return the 5 smallest rows

分箱操作

分箱操作就是将连续数据转换为分类对应物的过程。比如将连续的身高数据划分为：矮、中、高。分箱操作分为等距分箱和等频分箱。

分箱操作也叫面元划分或者离散化。

import numpy as np
import pandas as pd
df = pd.DataFrame(data = np.random.randint(0,150,size = (100,3)),
columns=['Python','Tensorflow','Keras'])
# 1、等宽分箱
pd.cut(df.Python,bins = 3)
# 指定宽度分箱
pd.cut(df.Keras,#分箱数据
bins = [0,60,90,120,150],#分箱断点
right = False,# 左闭右开
labels=['不及格','中等','良好','优秀'])# 分箱后分类
# 2、等频分箱
pd.qcut(df.Python,q = 4,# 4等分
labels=['差','中','良','优']) # 分箱后分类

分组与聚合

import numpy as np
import numpy as np
import pandas as pd
df = pd.DataFrame(data = np.random.randint(0,150,size = (100,3)),
    columns=['Python','Tensorflow','Keras'])
# 1、等宽分箱
pd.cut(df.Python,bins = 3)
# 指定宽度分箱
pd.cut(df.Keras,#分箱数据
    bins = [0,60,90,120,150],#分箱断点
    right = False,# 左闭右开
    labels=['不及格','中等','良好','优秀'])# 分箱后分类
# 2、等频分箱
pd.qcut(df.Python,q = 4,# 4等分
    labels=['差','中','良','优']) # 分箱后分类import pandas as pd
# 准备数据
df = pd.DataFrame(data = {'sex':np.random.randint(0,2,size = 300), # 0男， 1⼥
    'class':np.random.randint(1,9,size = 300),#1~8⼋个班
    'Python':np.random.randint(0,151,size = 300),#Python成绩
    'Keras':np.random.randint(0,151,size =300),#Keras成绩
    'Tensorflow':np.random.randint(0,151,size=300),
    'Java':np.random.randint(0,151,size = 300),
    'C++':np.random.randint(0,151,size = 300)})
df['sex'] = df['sex'].map({0:'男',1:'⼥'}) # 将0， 1映射成男⼥
# 1、分组->可迭代对象
# 1.1 先分组再获取数据
g = df.groupby(by = 'sex')[['Python','Java']] # 单分组
for name,data in g:
    print('组名： ',name)
    print('数据： ',data)

    
df.groupby(by = ['class','sex'])[['Python']] # 多分组
# 1.2 对⼀列值进⾏分组
df['Python'].groupby(df['class']) # 单分组
df['Keras'].groupby([df['class'],df['sex']]) # 多分组
# 1.3 Group the columns by dtype.
# groupby(axis=1) is deprecated in recent pandas; the replacement is to
# transpose the frame and group its rows instead.
df.T.groupby(df.dtypes)
# 1.4 Group the columns through a dict mapping column label -> group name
m ={'sex':'category','class':'category','Python':'IT','Keras':'IT','Tensorflow':'IT','Java':'IT','C++':'IT'}
for name,data in df.T.groupby(m):
    print('组名',name)
    print('数据',data.T) # transpose back so each group is shown as columns again
    
    
# 2、分组直接调⽤函数进⾏聚合
# 按照性别分组，其他列均值聚合
df.groupby(by = 'sex').mean().round(1) # 保留1位⼩数
# 按照班级和性别进⾏分组， Python、 Keras的最⼤值聚合
df.groupby(by = ['class','sex'])[['Python','Keras']].max()
# 按照班级和性别进⾏分组，计数聚合。统计每个班，男⼥⼈数
df.groupby(by = ['class','sex']).size()
# 基本描述性统计聚合
df.groupby(by = ['class','sex']).describe()

分组聚合函数 apply 、 transform

# 3、分组后调⽤apply， transform封装单⼀函数计算
# 返回分组结果
df.groupby(by = ['class','sex'])[['Python','Keras']].apply(np.mean).round(1)
def normalization(x):
    """Min-max normalisation: rescale *x* with ``(x - min) / (max - min)``."""
    lowest = x.min()
    highest = x.max()
    return (x - lowest) / (highest - lowest)
# 返回全数据，返回DataFrame.shape和原DataFrame.shape⼀样。
df.groupby(by = ['class','sex'])[['Python','Tensorflow']].transform(normalization).round(3)

agg

# 4、 agg 多中统计汇总操作
# 分组后调⽤agg应⽤多种统计汇总
df.groupby(by = ['class','sex'])[['Tensorflow','Keras']].agg([np.max,np.min,pd.Series.count])
# 分组后不同属性应⽤多种不同统计汇总
df.groupby(by = ['class','sex'])[['Python','Keras']].agg({'Python':[('最⼤值',np.max),('最⼩值',np.min)],
'Keras':[('计数',pd.Series.count),('中位数',np.median)]})

透视表

# 5、透视表
# 透视表也是⼀种分组聚合运算
def count(x):
    """Return the number of elements in *x*."""
    return len(x)
df.pivot_table(values=['Python','Keras','Tensorflow'], # the values to aggregate
               index=['class','sex'], # grouping keys of the pivot
               aggfunc={'Python':[('最⼤值',np.max)], # aggregations per column, as (label, function) pairs
                        'Keras':[('最⼩值',np.min),('中位数',np.median)],
                        'Tensorflow':[('最⼩值',np.min),('平均值',np.mean),('计数',count)]})

import numpy as np
import pandas as pd

# 创建示例数据
np.random.seed(42)
df = pd.DataFrame({
    'class': ['A', 'A', 'B', 'B', 'A', 'B'] * 3,
    'sex': ['M', 'F', 'M', 'F', 'M', 'F'] * 3,
    'Python': np.random.randint(60, 100, 18),
    'Keras': np.random.randint(60, 100, 18),
    'Tensorflow': np.random.randint(60, 100, 18)
})

# 定义计数函数
def count(x):
    return len(x)

# 创建透视表
pivot_result = df.pivot_table(
    values=['Python', 'Keras', 'Tensorflow'],  # 要分析的数值列
    index=['class', 'sex'],                    # 行索引
    aggfunc={                                  # 聚合函数
        'Python': [('最大值', np.max)],
        'Keras': [('最小值', np.min), ('中位数', np.median)],
        'Tensorflow': [('最小值', np.min), ('平均值', np.mean), ('计数', count)]
    }
)

print("透视表结果：")
print(pivot_result)

**关于警告问题**：下面的示例改用字符串形式的聚合函数名（如 'max'、'min'），自定义函数仍然可以直接传入函数对象。
# 更多示例
pivot_examples = df.pivot_table(
    values=['Python'],
    index=['class'],
    aggfunc={
        'Python': [
            ('最大值', 'max'),
            ('最小值', 'min'),
            ('平均值', 'mean'),
            ('中位数', 'median'),
            ('总和', 'sum'),
            ('计数', 'count'),
            ('标准差', 'std'),
            ('自定义', count)  # 自定义函数仍然可以使用函数对象
        ]
    }
)
# 自定义聚合函数
def pass_rate(x):
    """Return the percentage of values in *x* that are at least 60."""
    passed = x >= 60
    return passed.mean() * 100

def score_range(x):
    """Return the spread of *x*: its maximum minus its minimum."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

# 使用多种聚合方式
complex_pivot = df.pivot_table(
    values=['Python', 'Keras'],
    index=['class'],
    aggfunc={
        'Python': [
            ('及格率', pass_rate),
            ('分数范围', score_range),
            ('平均分', 'mean')
        ],
        'Keras': [
            ('最高分', 'max'),
            ('最低分', 'min'),
            ('平均分', 'mean')
        ]
    }
)


# 简化版本
simple_pivot = df.pivot_table(
    values=['Python', 'Keras', 'Tensorflow'],
    index=['class', 'sex'],
    aggfunc='mean'  # 对所有列使用相同的聚合函数
)

# 使用多个基本聚合函数
multi_agg_pivot = df.pivot_table(
    values=['Python'],
    index=['class'],
    aggfunc=['mean', 'max', 'min', 'count']  # 使用字符串列表
)

时间操作

import pandas as pd
import numpy as np

# 1. 创建时间戳和时期数据
# 时刻数据（Timestamp）
ts1 = pd.Timestamp('2020-08-24 12:00')  # 标准格式
print("时刻数据：", ts1)

# 时期数据（Period）
period1 = pd.Period('2020-08', freq='M')  # 月度周期
print("时期数据：", period1)

# 创建时间范围（date_range）- 时刻数据
date_index = pd.date_range(
    start='2020-08-24',    # 使用标准格式
    periods=5,             # 生成5个日期
    freq='ME'              # 按月频率
)
print("\n时间范围-时刻：")
print(date_index)

# 创建时间范围（period_range）- 时期数据
period_index = pd.period_range(
    start='2020-08',      # 使用年月格式
    periods=5,            # 生成5个周期
    freq='M'             # 按月频率
)
print("\n时间范围-时期：")
print(period_index)

# 创建时间序列
ts = pd.Series(
    np.random.randint(0, 10, size=5),
    index=date_index
)
print("\n时间序列：")
print(ts)

# 2. 时间格式转换
# 处理不同格式的日期字符串
dates = pd.to_datetime([
    '2020-08-24',
    '2020/08/24',
    '24/08/2020',
    '2020.08.24'
], format='mixed', dayfirst=False)
print("\n转换不同格式日期：")
print(dates)

# 转换时间戳（秒）
timestamp_s = pd.to_datetime([1598582232], unit='s')
print("\n从秒转换：")
print(timestamp_s)

# 转换时间戳（毫秒）
timestamp_ms = pd.to_datetime([1598582420401], unit='ms')
print("\n从毫秒转换：")
print(timestamp_ms)

# 时间偏移
print("\n时区调整（+8小时）：")
print(timestamp_ms + pd.DateOffset(hours=8))

print("\n日期偏移（+100天）：")
print(timestamp_ms + pd.DateOffset(days=100))

# 1. 创建特定频率的日期范围
# 工作日范围
business_days = pd.date_range(
    start='2020-08-24',
    periods=5,
    freq='B'  # 工作日
)
print("\n工作日范围：")
print(business_days)

# Hourly range
hourly = pd.date_range(
    start='2020-08-24',
    periods=24,
    freq='h'  # hourly; the lowercase alias is used because uppercase 'H' is deprecated in recent pandas
)
print("\n小时范围：")
print(hourly)

# 2. 时间序列重采样
ts_hourly = pd.Series(
    np.random.randn(24),
    index=hourly
)

# 按天重采样（聚合）
daily_mean = ts_hourly.resample('D').mean()
print("\n每日平均值：")
print(daily_mean)

# 3. Time offset examples
now = pd.Timestamp('2020-08-24')
print("\n各种时间偏移：")
print("原始时间：", now)
# MonthEnd / QuarterEnd roll forward to the end of the current month / quarter,
# not into the next one, so the labels say "end of this month / quarter".
print("本月月末：", now + pd.offsets.MonthEnd())
print("本季度末：", now + pd.offsets.QuarterEnd())
print("下一个工作日：", now + pd.offsets.BusinessDay())

# 4. 时间序列的运算
date_series = pd.Series(
    pd.date_range('2020-08-24', periods=5, freq='D')
)
print("\n日期序列运算：")
print("原始日期：")
print(date_series)
print("\n加3天：")
print(date_series + pd.Timedelta(days=3))

常用的频率字符串：

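'D': 日历日
'B': 工作日
'W': 周
'ME': 月末（pandas 2.2 之前写作 'M'）
'QE': 季末（pandas 2.2 之前写作 'Q'）
'YE': 年末（pandas 2.2 之前写作 'Y'）
'h': 小时（旧写法 'H' 自 pandas 2.2 起已弃用）
'min': 分钟（旧写法 'T' 自 pandas 2.2 起已弃用）
's': 秒（旧写法 'S' 自 pandas 2.2 起已弃用）
（时期数据 Period / period_range 仍使用 'M'、'Q'、'Y'。）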

注意事项：

使用 to_datetime 时，如果日期格式不统一，可以使用 format='mixed'
处理国际时间时，注意时区转换
使用 DateOffset 进行时间计算时，可以组合多个参数
处理大量日期数据时，注意使用适当的格式和频率

这些修改后的代码应该能够正常运行，并展示了 Pandas 中时间日期处理的多种用法。

时间戳索引

import pandas as pd
import numpy as np

# 创建时间序列数据
index = pd.date_range("2020-8-24", periods=200, freq="D")
ts = pd.Series(range(len(index)), index=index)

# 1. 字符串类型索引
print("单个日期访问：")
print(ts['2020-08-30'])

print("\n日期切片：")
print(ts['2020-08-24':'2020-09-03'])

print("\n按月访问：")
print(ts['2020-08'])

print("\n按年访问：")
print(ts['2020'])

# 2. 时间戳索引
print("\n使用Timestamp访问：")
print(ts[pd.Timestamp('2020-08-30')])

print("\n使用Timestamp切片：")
print(ts[pd.Timestamp('2020-08-24'):pd.Timestamp('2020-08-30')])

print("\n使用date_range访问：")
print(ts[pd.date_range('2020-08-24', periods=10, freq='D')])

# 3. 时间戳索引属性
print("\n时间属性访问：")
print("年份：")
print(ts.index.year)

print("\n月份：")
print(ts.index.month)

print("\n日期：")
print(ts.index.day)

print("\n星期几（0-6，0是周一）：")
print(ts.index.dayofweek)

# 使用 isocalendar() 获取周信息
print("\n一年中的第几周：")
print(ts.index.isocalendar().week)

# 4. 更多实用的时间属性和方法
print("\n更多时间属性：")
print("是否为月初：")
print(ts.index.is_month_start[:5])

print("\n是否为月末：")
print(ts.index.is_month_end[:5])

print("\n是否为季度末：")
print(ts.index.is_quarter_end[:5])

print("\n一年中的第几天：")
print(ts.index.dayofyear[:5])

# 5. 时间序列的筛选
# 按条件筛选
print("\n筛选周末的数据：")
weekend_data = ts[ts.index.dayofweek.isin([5,6])]
print(weekend_data.head())

# 按时间范围筛选
print("\n筛选特定时间范围的数据：")
date_mask = (ts.index >= '2020-09-01') & (ts.index <= '2020-09-30')
september_data = ts[date_mask]
print(september_data.head())

# 6. 时间序列的运算
print("\n时间序列运算：")
# 向前移动7天
print("向前7天的数据：")
print(ts.shift(7).head())

# 计算7天移动平均
print("\n7天移动平均：")
print(ts.rolling(window=7).mean().head(10))

# 7. Resampling
print("\n重采样示例：")
# Resample by week (mean)
print("周平均值：")
print(ts.resample('W').mean().head())

# Resample by month (mean). 'ME' is the month-end alias, as used by the
# date_range example earlier in this file; plain 'M' is deprecated for this use.
print("\n月平均值：")
print(ts.resample('ME').mean().head())

# 8. 时间序列的基本统计
print("\n基本统计：")
# 按月份统计
monthly_stats = ts.groupby(ts.index.month).agg(['mean', 'min', 'max'])
print(monthly_stats)

# 按星期几统计
weekday_stats = ts.groupby(ts.index.dayofweek).agg(['mean', 'min', 'max'])
print("\n按星期统计：")
print(weekday_stats)

import pandas as pd
import numpy as np

# 创建时间序列数据
index = pd.date_range('2020-08-01', periods=365, freq='D')
ts = pd.Series(np.random.randint(0, 500, len(index)), index=index)

# 1. 数据移动（shift）
print("原始数据前5行：")
print(ts.head())

# 数据后移2位
print("\n数据后移2位：")
print(ts.shift(periods=2).head())

# 数据前移2位
print("\n数据前移2位：")
print(ts.shift(periods=-2).head())

# 2. 日期移动
# 使用 DateOffset 进行日期移动
print("\n日期向后移动2天：")
print(ts.shift(freq=pd.DateOffset(days=2)).head())

# 日期向后移动1个月
print("\n日期向后移动1个月：")
print(ts.shift(freq=pd.DateOffset(months=1)).head())

# 3. 更多时间移动示例
print("\n更多时间移动示例：")

# Shift by business days. BusinessDay skips weekends, whereas the previous
# DateOffset(days=2) simply moved every date by two calendar days.
print("移动2个工作日：")
print(ts.shift(freq=pd.offsets.BusinessDay(2)).head())

# 移动到月末
print("\n移动到月末：")
print(ts.shift(freq=pd.offsets.MonthEnd(1)).head())

# 4. 不同频率的移动
# 按周移动
print("\n按周移动：")
print(ts.shift(freq=pd.DateOffset(weeks=1)).head())

# 按季度移动
print("\n按季度移动：")
print(ts.shift(freq=pd.DateOffset(months=3)).head())

# 5. 组合多个时间偏移
print("\n组合时间偏移：")
offset = pd.DateOffset(months=1, days=5, hours=12)
print(ts.shift(freq=offset).head())


ts.asfreq(pd.tseries.offsets.Week()) # 天变周
ts.asfreq(pd.tseries.offsets.MonthEnd()) # 天变⽉
ts.asfreq(pd.tseries.offsets.Hour(),fill_value = 0) #天变⼩时，⼜少变多，fill_value为填充值



# 3. Resampling
# resample aggregates the data along the date dimension; minutes, hours,
# business days, weeks, months, years, etc. can serve as the unit.
ts.resample('2W').sum() # totals per 2-week bin
ts.resample('3ME').sum().cumsum() # totals per 3-month (quarter-length) bin, then the running total; 'ME' replaces the deprecated 'M'

# 4、 DataFrame重采样
d = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19],
    'volume': [50, 60, 40, 100, 50, 100, 40, 50],
    'week_starting':pd.date_range('24/08/2020',periods=8,freq='W')})
df1 = pd.DataFrame(d)
df1.resample('M',on = 'week_starting').apply(np.sum)
df1.resample('M',on = 'week_starting').agg({'price':'mean','volume':'sum'})
days = pd.date_range('1/8/2020', periods=4, freq='D')
data2 = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19],
    'volume': [50, 60, 40, 100, 50, 100, 40, 50]})
df2 = pd.DataFrame(data2,
    index=pd.MultiIndex.from_product([days,
    ['morning','afternoon']]))
df2.resample('D', level=0).sum()

更多采样：

# 1. Resampling at different frequencies
# 'ME' / 'QE' are the month-end / quarter-end aliases; plain 'M' / 'Q' are
# deprecated for resampling in recent pandas.
frequencies = {
    '周': df1.resample('W', on='week_starting').mean(),
    '月': df1.resample('ME', on='week_starting').mean(),
    '季度': df1.resample('QE', on='week_starting').mean()
}

for freq_name, result in frequencies.items():
    print(f"\n按{freq_name}重采样：")
    print(result)

# 2. Different ways of filling the gaps created by upsampling
print("\n处理重采样后的缺失值：")
resampled = df1.resample('D', on='week_starting').mean()
print("\n前向填充：")
print(resampled.ffill().head())  # ffill() replaces the deprecated fillna(method='ffill')
print("\n后向填充：")
print(resampled.bfill().head())  # bfill() replaces the deprecated fillna(method='bfill')

# 3. Rolling statistics
print("\n计算7天滚动平均：")
print(df1.set_index('week_starting')['price'].rolling('7D').mean())

# 4. Resample, then compute the percentage change
print("\n计算价格月度变化百分比：")
monthly_price = df1.resample('ME', on='week_starting')['price'].mean()
print(monthly_price.pct_change())

时区

index = pd.date_range('8/1/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(index)), index)
import pytz
pytz.common_timezones # commonly used time zones
# Attach a time zone to the series
ts = ts.tz_localize(tz='UTC')
# Convert to another time zone
ts.tz_convert(tz = 'Asia/Shanghai')