1、2021/11/21 下午4:36 5-1-2 file:///C:/Users/sgl/Downloads/5-1-2.html 1/2
In:
import pandas as pd
pd.set_option('display.unicode.east_asian_width', True)  #解决数据输出时列名不对齐的问题
df=pd.read_excel('tdata/cj.xlsx')  #读取数据
5.1.2数据清洗-缺失值处理
In:
import pandas as pd
pd.set_option('display.unicode.east_asian_width', True)  #解决数据输出时列名不对
2、齐的问题
df=pd.read_excel('tdata/cj.xlsx')  #读取数据
In:
#存在任一缺失值即删除
df1=df.dropna()
print('删除前:',df.shape)
print('删除后:',df1.shape)
In:
#所有列均为缺失值即删除
df1=df.dropna(how='all')
print('删除前:',df.shape)
print('删除后:',df1.shape)
In:
#指定列均为缺失值即删除
df1=df.dropna(how='all',subset=['专业','选修'])
print('删除前:',df.shape)
print('删除后:',df1.shape)
In:
#保留
3、某些属性中不存在缺失值的情况
df1=df[df['性别'].notnull()]
print('删除前:',df.shape)
print('删除后:',df1.shape)
In:
#将缺失值NaN填充为0
df['选修'].fillna(0)
In:
#将缺失值NaN填充与后面的值相同
df['选修'].fillna(method='bfill')
2021/11/21 下午4:36 5-1-2 file:///C:/Users/sgl/Downloads/5-1-2.html 2/2
In:
import numpy as np
#将缺失值NaN填充选修课的平均分
df['选修'].fillna(np.mean(df['选修']))
-重复值处理
In:
#去
4、除全部重复数据
df1=df.drop_duplicates()
print('去重前:',df.shape)
print('去重后:',df1.shape)
In:
#去除指定列中重复数据
df1=df.drop_duplicates(['专业'])
print('去重前:',df.shape)
print('去重后:',df1.shape)
In:
#去除指定列中重复数据,设置keep参数
df1=df.drop_duplicates(['专业'],keep='last')
print('去重前:',df.shape)
print('去重后:',df1.shape)
In:
#去除指定若干列中重复数据
df1=df.drop_duplicates(['学号','姓名'])
print('去重前:',df.shape)
print('去重后:',df1.shape)