Python 工资分类预测
Python 教学文档 · 阅读时间约 15 分钟
Python 案例-工资分类预测
1.项目目标
使用美国人口普查收入数据集,根据人口普查数据预测个人收入是否超过每年50,000美元。
2.数据来源
数据集地址: https://archive.ics.uci.edu/ml/datasets/adult
3.数据下载
下载数据并将其保存到我们的本地目录中名为“dataset”的文件夹中。 注意:下载一次,然后将代码注释掉以便后续运行。
# Download the raw Adult dataset files into a local "dataset" folder.
DATASET = (
    "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names",
    "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
)

def download_data(path='dataset', urls=DATASET):
    """Download every URL in *urls* into directory *path* (created if absent).

    Each file is saved under its URL basename.  Run once, then comment the
    call out for subsequent runs.
    """
    os.makedirs(path, exist_ok=True)  # no error if the directory already exists
    for i, url in enumerate(urls):    # enumerate instead of urls.index(url): O(1) and duplicate-safe
        response = requests.get(url)
        response.raise_for_status()   # fail loudly instead of silently saving an HTTP error page
        name = os.path.basename(url)
        with open(os.path.join(path, name), 'wb') as f:
            f.write(response.content)
        print('-----', str(i))        # simple progress indicator

#download_data()  # comment out after the first run
4.数据整理
- 了解数据相关特征
- 了解特征间关系
- 了解要预测的目标
4.1 数据读取
# Load the training and test sets.
# Column names (the raw files ship without a header row).
headers = ['age', 'workclass', 'fnlwgt',
           'education', 'education-num',
           'marital-status', 'occupation',
           'relationship', 'race', 'sex',
           'capital-gain', 'capital-loss',
           'hours-per-week', 'native-country',
           'predclass']
# Training set.  sep is a regex ("comma + one whitespace"), hence engine='python'.
# Use a raw string so '\s' is not treated as an invalid string escape
# (a SyntaxWarning/DeprecationWarning on modern Python).
training_raw = pd.read_csv('dataset/adult.data',
                           header=None,
                           names=headers,
                           sep=r',\s',
                           na_values=["?"],
                           engine='python')
# Test set: the first line of adult.test is a banner line, hence skiprows=1.
test_raw = pd.read_csv('dataset/adult.test',
                       header=None,
                       names=headers,
                       sep=r',\s',
                       na_values=["?"],
                       engine='python',
                       skiprows=1)
test_raw.shape
test_raw.head()
4.2 数据整合
将训练集和测试集加在一起做分析。通常情况下,只做训练集的分析,因为一开始是拿不到测试集的。
# Combine the training and test sets for joint exploratory analysis.
# DataFrame.append was deprecated and removed in pandas 2.0; pd.concat
# is the drop-in replacement with identical behaviour here.
dataset_raw = pd.concat([training_raw, test_raw])
dataset_raw.reset_index(inplace=True)  # inplace: modify this frame; reset_index installs a fresh 0..n-1 integer index
# dataset_raw.drop('index', inplace=True, axis=1)  # drop removes rows/columns by label; axis=1 means columns
dataset_raw.head()
5. 数据探索
5.1 数据质量分析
- 缺失值分析
- 离群点分析
- 不一致数据分析等。
# Visualise missing values per column (missingno matrix + bar chart of non-null counts).
missingno.matrix(dataset_raw, figsize = (30,5))
missingno.bar(dataset_raw, sort='ascending', figsize = (30,5))
5.2 单一变量数据分析
- 数值型
- 标称型
# Summary statistics for all numeric features.
dataset_raw.describe() # count, mean, std, min/max and quartiles of numeric columns
# Categorical features: describe the object-dtype columns.
dataset_raw.describe(include=['O'])# note: capital letter O (object dtype), not lowercase o or the digit 0
5.3 数据分布分析
# Plot the distribution of every feature: count plots for categorical
# columns, histograms with a KDE for numeric ones.
def plot_distribution(dataset, cols=5, width=20, height=15, hspace=0.2, wspace=0.5):
    """Draw one subplot per column of *dataset* in a *cols*-wide grid."""
    plt.style.use('seaborn-whitegrid')  # matplotlib style; alternatives include 'dark', 'white', ...
    fig = plt.figure(figsize=(width, height))  # overall figure size
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=wspace, hspace=hspace)  # subplot spacing
    rows = math.ceil(float(dataset.shape[1]) / cols)  # number of subplot rows needed
    for i, column in enumerate(dataset.columns):  # one subplot per column
        ax = fig.add_subplot(rows, cols, i + 1)   # position inside the grid
        ax.set_title(column)                      # title each subplot with its column name
        # Use the builtin `object`, not np.object (alias removed in NumPy 1.24).
        if dataset.dtypes[column] == object:      # categorical column
            g = sns.countplot(y=column, data=dataset)
            substrings = [s.get_text()[:18] for s in g.get_yticklabels()]  # truncate long labels
            g.set(yticklabels=substrings)
            plt.xticks(rotation=25)
        else:                                     # numeric column
            g = sns.distplot(dataset[column])
            plt.xticks(rotation=25)

plot_distribution(dataset_raw, cols=3, width=20, height=20, hspace=0.45, wspace=0.5)
6.特征分析
# Two fresh DataFrames that will accumulate the engineered features.
dataset_bin = pd.DataFrame() # holds the discretised (binned) feature versions
dataset_con = pd.DataFrame() # holds the continuous (un-binned) feature versions
6.1 特征:Predclass
预测目标,也称为类标号,需要将目前的值转换成 0 和 1:年收入超过 $50,000 为 1,否则为 0。
# Map the target labels to 0/1: income >50K -> 1, <=50K -> 0.
# The adult.test file writes each label with a trailing '.', so both
# spellings are mapped.
_label_map = {'>50K': 1, '>50K.': 1, '<=50K': 0, '<=50K.': 0}
dataset_raw['predclass'] = dataset_raw['predclass'].replace(_label_map)
dataset_bin['predclass'] = dataset_raw['predclass']
dataset_con['predclass'] = dataset_raw['predclass']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(10,1))
sns.countplot(y="predclass", data=dataset_bin);# frequency bar chart of the class label
6.2 特征:Age
Age可以说是一个连续数据,所以需要分箱。使用Pandas Cut功能将数据分成大小相同的箱。原始数据放到dataset_con中。
dataset_bin['age'] = pd.cut(dataset_raw['age'], 10) # discretise into 10 equal-width bins
# Exercise: research chi-square (ChiMerge) binning as an alternative
dataset_con['age'] = dataset_raw['age'] # keep the raw, un-binned values
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,5))
plt.subplot(1, 2, 1)
sns.countplot(y="age", data=dataset_bin);
plt.subplot(1, 2, 2)
# Histogram with a KDE trend line, one curve per income class
sns.distplot(dataset_con.loc[dataset_con['predclass'] == 1]['age'], kde_kws={"label": ">$50K"});
sns.distplot(dataset_con.loc[dataset_con['predclass'] == 0]['age'], kde_kws={"label": "<$50K"});
6.3 特征:Workclass
# Inspect the raw workclass categories first.
plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(20,3))
sns.countplot(y="workclass", data=dataset_raw);

# Collapse the many workclass categories into a smaller set of groups.
_workclass_map = {
    'Without-pay':      'Not Working',
    'Never-worked':     'Not Working',
    'Federal-gov':      'Fed-gov',
    'State-gov':        'Non-fed-gov',
    'Local-gov':        'Non-fed-gov',
    'Self-emp-not-inc': 'Self-emp',
    'Self-emp-inc':     'Self-emp',
}
dataset_raw['workclass'] = dataset_raw['workclass'].replace(_workclass_map)
dataset_bin['workclass'] = dataset_raw['workclass']
dataset_con['workclass'] = dataset_raw['workclass']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,2))
sns.countplot(y="workclass", data=dataset_bin);
6.4 特征:Occupation
# Inspect the raw occupation categories.
plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(20,5))
sns.countplot(y="occupation", data=dataset_raw);

# Merge the fine-grained occupations into broader groups.
_occupation_map = {
    'Adm-clerical':      'Admin',
    'Armed-Forces':      'Military',
    'Craft-repair':      'Manual Labour',
    'Exec-managerial':   'Office Labour',
    'Farming-fishing':   'Manual Labour',
    'Handlers-cleaners': 'Manual Labour',
    'Machine-op-inspct': 'Manual Labour',
    'Other-service':     'Service',
    'Priv-house-serv':   'Service',
    'Prof-specialty':    'Professional',
    'Protective-serv':   'Military',
    'Sales':             'Office Labour',
    'Tech-support':      'Office Labour',
    'Transport-moving':  'Manual Labour',
}
dataset_raw['occupation'] = dataset_raw['occupation'].replace(_occupation_map)
dataset_bin['occupation'] = dataset_raw['occupation']
dataset_con['occupation'] = dataset_raw['occupation']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,3))
sns.countplot(y="occupation", data=dataset_bin);
6.5 特征:Native Country
# Inspect the raw native-country categories.
plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(20,10))
sns.countplot(y="native-country", data=dataset_raw);

# Group countries into regional buckets to cut down the category count.
# ('China' and 'United-States' map to themselves, exactly as the original did.)
_country_map = {
    'Cambodia':                   'SE-Asia',
    'Canada':                     'British-Commonwealth',
    'China':                      'China',
    'Columbia':                   'South-America',
    'Cuba':                       'South-America',
    'Dominican-Republic':         'South-America',
    'Ecuador':                    'South-America',
    'El-Salvador':                'South-America',
    'England':                    'British-Commonwealth',
    'France':                     'Euro_Group_1',
    'Germany':                    'Euro_Group_1',
    'Greece':                     'Euro_Group_2',
    'Guatemala':                  'South-America',
    'Haiti':                      'South-America',
    'Holand-Netherlands':         'Euro_Group_1',
    'Honduras':                   'South-America',
    'Hong':                       'China',
    'Hungary':                    'Euro_Group_2',
    'India':                      'British-Commonwealth',
    'Iran':                       'Euro_Group_2',
    'Ireland':                    'British-Commonwealth',
    'Italy':                      'Euro_Group_1',
    'Jamaica':                    'South-America',
    'Japan':                      'APAC',
    'Laos':                       'SE-Asia',
    'Mexico':                     'South-America',
    'Nicaragua':                  'South-America',
    'Outlying-US(Guam-USVI-etc)': 'South-America',
    'Peru':                       'South-America',
    'Philippines':                'SE-Asia',
    'Poland':                     'Euro_Group_2',
    'Portugal':                   'Euro_Group_2',
    'Puerto-Rico':                'South-America',
    'Scotland':                   'British-Commonwealth',
    'South':                      'Euro_Group_2',
    'Taiwan':                     'China',
    'Thailand':                   'SE-Asia',
    'Trinadad&Tobago':            'South-America',
    'United-States':              'United-States',
    'Vietnam':                    'SE-Asia',
    'Yugoslavia':                 'Euro_Group_2',
}
dataset_raw['native-country'] = dataset_raw['native-country'].replace(_country_map)
dataset_bin['native-country'] = dataset_raw['native-country']
dataset_con['native-country'] = dataset_raw['native-country']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
sns.countplot(y="native-country", data=dataset_bin);
6.6 特征:Education
# Inspect the raw education categories.
plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(20,5))
sns.countplot(y="education", data=dataset_raw);

# Collapse education levels into coarser groups.
# ('Bachelors', 'Doctorate' and 'Masters' map to themselves, as in the original.)
_education_map = {
    '10th':         'Dropout',
    '11th':         'Dropout',
    '12th':         'Dropout',
    '1st-4th':      'Dropout',
    '5th-6th':      'Dropout',
    '7th-8th':      'Dropout',
    '9th':          'Dropout',
    'Preschool':    'Dropout',
    'Assoc-acdm':   'Associate',
    'Assoc-voc':    'Associate',
    'Bachelors':    'Bachelors',
    'Doctorate':    'Doctorate',
    'HS-Grad':      'HS-Graduate',
    'Some-college': 'HS-Graduate',
    'Masters':      'Masters',
    'Prof-school':  'Professor',
}
dataset_raw['education'] = dataset_raw['education'].replace(_education_map)
dataset_bin['education'] = dataset_raw['education']
dataset_con['education'] = dataset_raw['education']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
sns.countplot(y="education", data=dataset_bin);
6.7 特征: Marital Status
# Inspect the raw marital-status categories.
plt.figure(figsize=(20,3))
sns.countplot(y="marital-status", data=dataset_raw);

# Collapse marital status into five groups.
_marital_map = {
    'Never-married':         'Never-Married',
    'Married-AF-spouse':     'Married',
    'Married-civ-spouse':    'Married',
    'Married-spouse-absent': 'Not-Married',
    'Separated':             'Separated',
    'Divorced':              'Separated',
    'Widowed':               'Widowed',
}
dataset_raw['marital-status'] = dataset_raw['marital-status'].replace(_marital_map)
dataset_bin['marital-status'] = dataset_raw['marital-status']
dataset_con['marital-status'] = dataset_raw['marital-status']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,3))
sns.countplot(y="marital-status", data=dataset_bin);
6.8 特征:Final Weight
# Bin fnlwgt ("final weight" — the census sampling weight for the row,
# not a body weight) into 10 equal-width intervals.
dataset_bin['fnlwgt'] = pd.cut(dataset_raw['fnlwgt'], 10)
dataset_con['fnlwgt'] = dataset_raw['fnlwgt']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
sns.countplot(y="fnlwgt", data=dataset_bin);
6.9 特征: Education Number
# Bin education-num (years-of-education ordinal) into 10 equal-width intervals.
dataset_bin['education-num'] = pd.cut(dataset_raw['education-num'], 10)
dataset_con['education-num'] = dataset_raw['education-num']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,5))
sns.countplot(y="education-num", data=dataset_bin);
6.10 特征:Hours per Week
# Bin hours-per-week into 10 equal-width intervals.
dataset_bin['hours-per-week'] = pd.cut(dataset_raw['hours-per-week'], 10)
dataset_con['hours-per-week'] = dataset_raw['hours-per-week']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
plt.subplot(1, 2, 1)
# Binned values (count per interval) — the original comment labels were swapped
sns.countplot(y="hours-per-week", data=dataset_bin);
plt.subplot(1, 2, 2)
# Raw (un-binned) distribution
sns.distplot(dataset_con['hours-per-week']);
6.11 特征:Captital Gain
# Bin capital-gain into 5 equal-width intervals (the raw values are heavily skewed towards 0).
dataset_bin['capital-gain'] = pd.cut(dataset_raw['capital-gain'], 5)
# keep the raw values alongside the binned copy
dataset_con['capital-gain'] = dataset_raw['capital-gain']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,3))
plt.subplot(1, 2, 1)
sns.countplot(y="capital-gain", data=dataset_bin);
plt.subplot(1, 2, 2)
sns.distplot(dataset_con['capital-gain']);
6.12 特征:Capital Loss
# Bin capital-loss into 5 equal-width intervals; keep the raw values alongside.
dataset_bin['capital-loss'] = pd.cut(dataset_raw['capital-loss'], 5)
dataset_con['capital-loss'] = dataset_raw['capital-loss']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,3))
plt.subplot(1, 2, 1)
sns.countplot(y="capital-loss", data=dataset_bin);
plt.subplot(1, 2, 2)
sns.distplot(dataset_con['capital-loss']);
6.13 特征:Race, Sex, Relationship
# Race, sex and relationship need no merging or binning — copy them through unchanged.
dataset_con['sex'] = dataset_bin['sex'] = dataset_raw['sex']
dataset_con['race'] = dataset_bin['race'] = dataset_raw['race']
dataset_con['relationship'] = dataset_bin['relationship'] = dataset_raw['relationship']
7.特征生成
例:Age 与 Hours Per Week的融合与特征生成,即生成一个新的特征,一般需要对领域数据有深入了解的专家来做。
# Interaction feature: age multiplied by weekly working hours.
dataset_con['age-hours'] = dataset_con['age'] * dataset_con['hours-per-week']
dataset_bin['age-hours'] = pd.cut(dataset_con['age-hours'], 10)  # binned copy
# (the original re-assigned dataset_con['age-hours'] to itself here — a no-op, removed)
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,5))
plt.subplot(1, 2, 1)
sns.countplot(y="age-hours", data=dataset_bin);
plt.subplot(1, 2, 2)
sns.distplot(dataset_con.loc[dataset_con['predclass'] == 1]['age-hours'], kde_kws={"label": ">$50K"});
sns.distplot(dataset_con.loc[dataset_con['predclass'] == 0]['age-hours'], kde_kws={"label": "<$50K"});
例:sex 与 Marital-status
# Interaction feature: concatenate the sex and marital-status strings.
dataset_bin['sex-marital'] = dataset_con['sex-marital'] = dataset_con['sex'] + dataset_con['marital-status']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,5))
sns.countplot(y="sex-marital", data=dataset_bin);
dataset_bin.head()
在进行接下来的任务之前,先对有缺失值的数据进行处理,以及其它数据质量相关的操作
# Drop rows with missing values (appropriate here because the missing count is small).
dataset_bin = dataset_bin.dropna(axis=0)
dataset_con = dataset_con.dropna(axis=0)
# Scale fnlwgt down to thousands — NOTE: applied to dataset_con only, not dataset_bin.
dataset_con['fnlwgt']/=1000
8.特征编码
因为机器学习算法接收的是数字型变量,所以需要把字符型编码为数字型。经常用到的方法有:
- One-Hot
- Label encoding
# One-Hot encode every discretised column (the target column is excluded).
one_hot_cols = dataset_bin.columns.tolist()
one_hot_cols.remove('predclass')
# One-Hot encoding of the binned dataset
dataset_bin_enc = pd.get_dummies(dataset_bin, columns=one_hot_cols)
dataset_bin_enc.head()
# Label-encode the continuous dataset into integer codes.
encoder = LabelEncoder()
# print(dataset_con['workclass'])
# NOTE(review): astype(str) turns the numeric columns into strings, so
# LabelEncoder orders them lexicographically (e.g. '10' < '2') and the encoded
# integers lose the original numeric order — confirm this is intended.
dataset_con = dataset_con.astype(str)
dataset_con_enc = dataset_con.apply(encoder.fit_transform)
dataset_con_enc.head(20)
9.特征归约
一般有降维,选择等方法
降维:PCA(主成分分析法) SVD(奇异值分解)
选择:
- 过滤: 过滤方法仅根据一般度量标准选择要素,例如与要预测的变量的相关性。 过滤方法抑制最不感兴趣的变量。其他变量将是用于分类或预测数据的分类或回归模型的一部分。 这些方法在计算时间方面特别有效,并且对过度拟合具有鲁棒性。
- 包装: 包装方法评估变量子集,与过滤器方法不同,它允许检测变量之间可能的相互作用。 这些方法的两个主要缺点是:1.当观测数量不足时,过度拟合风险增加;2.变量数量很大时的显着计算时间。
- 嵌入式: 嵌入式方法试图结合两种先前方法的优点。 学习算法利用其自己的变量选择过程并同时执行特征选择和分类。
9.1 特征的相关性分析
相关性衡量两个随机变量一起变化的程度。 期望:属性应该彼此不相关,并且与我们试图预测的目标高度相关。 卡方检测,皮尔逊系数,Spearman系数等
# Correlation heat maps for both encoded datasets (lower triangle only).
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,10))
plt.subplot(1, 2, 1)
# Mask the upper triangle so each feature pair is drawn once.
# Use the builtin `bool`, not np.bool (alias removed in NumPy 1.24).
mask = np.zeros_like(dataset_bin_enc.corr(), dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(dataset_bin_enc.corr(),
            vmin=-1, vmax=1,
            square=True,
            cmap=sns.color_palette("RdBu_r", 100),
            mask=mask,
            linewidths=.5);
plt.subplot(1, 2, 2)
mask = np.zeros_like(dataset_con_enc.corr(), dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(dataset_con_enc.corr(),
            vmin=-1, vmax=1,
            square=True,
            cmap=sns.color_palette("RdBu_r", 100),
            mask=mask,
            linewidths=.5);
- 基于随机森林的特征分析
# Use a random forest to rank feature importance on the label-encoded data.
clf = RandomForestClassifier()
clf.fit(dataset_con_enc.drop('predclass', axis=1), dataset_con_enc['predclass'])
plt.style.use('seaborn-whitegrid')
importance = clf.feature_importances_
# Wrap the importances in a DataFrame indexed by feature name, then plot sorted.
importance = pd.DataFrame(importance, index=dataset_con_enc.drop('predclass', axis=1).columns, columns=["Importance"])
importance.sort_values(by='Importance', ascending=True).plot(kind='barh', figsize=(20,len(importance)/2));
- PCA
# Fit PCA on each standardised dataset and plot the explained-variance ratios.
std_scale = preprocessing.StandardScaler().fit(dataset_bin_enc.drop('predclass', axis=1))
X = std_scale.transform(dataset_bin_enc.drop('predclass', axis=1))
pca1 = PCA(n_components=len(dataset_bin_enc.columns)-1)
fit1 = pca1.fit(X)
# Same for the label-encoded (continuous) dataset.
std_scale = preprocessing.StandardScaler().fit(dataset_con_enc.drop('predclass', axis=1))
X = std_scale.transform(dataset_con_enc.drop('predclass', axis=1))
# NOTE(review): n_components is columns-2 here vs columns-1 above — confirm intentional.
pca2 = PCA(n_components=len(dataset_con_enc.columns)-2)
fit2 = pca2.fit(X)
# Bar chart of the variance explained by each component.
plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(25,7))
plt.subplot(1, 2, 1)
plt.xlabel('PCA Feature')
plt.ylabel('Variance')
plt.title('PCA for Discretised Dataset')
plt.bar(range(0, fit1.explained_variance_ratio_.size), fit1.explained_variance_ratio_);
plt.subplot(1, 2, 2)
plt.xlabel('PCA Feature')
plt.ylabel('Variance')
plt.title('PCA for Continuous Dataset')
plt.bar(range(0, fit2.explained_variance_ratio_.size), fit2.explained_variance_ratio_);
# 2-D and 3-D scatter plots of the first PCA components, coloured by class.
std_scale = preprocessing.StandardScaler().fit(dataset_con_enc.drop('predclass', axis=1))
X = std_scale.transform(dataset_con_enc.drop('predclass', axis=1))
y = dataset_con_enc['predclass']
# Plot formatting
target_names = [0,1]
colors = ['navy','darkorange']
lw = 2
alpha = 0.3
# Projection onto the top 2 components
plt.style.use('seaborn-whitegrid')
plt.figure(2, figsize=(20, 8))
plt.subplot(1, 2, 1)
pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)
for color, i, target_name in zip(colors, [0, 1], target_names):
    # one scatter per class, coloured by income label
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1],
                color=color,
                alpha=alpha,
                lw=lw,
                label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('First two PCA directions');
# Projection onto the top 3 components
ax = plt.subplot(1, 2, 2, projection='3d')
pca = PCA(n_components=3)
X_reduced = pca.fit(X).transform(X)
# print(X_reduced.shape)
for color, i, target_name in zip(colors, [0, 1], target_names):
    ax.scatter(X_reduced[y == i, 0], X_reduced[y == i, 1], X_reduced[y == i, 2],
               color=color,
               alpha=alpha,
               lw=lw,
               label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.set_ylabel("2nd eigenvector")
ax.set_zlabel("3rd eigenvector")
# Rotate the 3-D view (elevation, azimuth)
ax.view_init(30, 10)
10.数据建模
10.1 数据集选择
# Candidate datasets:
# - dataset_bin_enc (discretised, one-hot encoded)
# - dataset_con_enc (continuous, label-encoded)
# Train models on each and compare the results at the end.
selected_dataset = dataset_con_enc
10.2 数据集划分
# Split back into the original train / test portions by index label.
# adult.data contributed index labels 0..32560; adult.test the remainder.
# NOTE: .loc slicing is inclusive on BOTH ends, so the test slice must start
# at 32561 — the original code used 32560 and put one row in both sets.
train = selected_dataset.loc[0:32560,:]
test = selected_dataset.loc[32561:,:]
# Name the feature matrices and the target vectors.
X_train_w_label = train
X_train = train.drop(['predclass'], axis=1)
y_train = train['predclass'].astype('int64')
X_test = test.drop(['predclass'], axis=1)
y_test = test['predclass'].astype('int64')
10.3 数据模型
用到的算法:
- KNN
- Logistic Regression
- Random Forest
- Naive Bayes
- Stochastic Gradient Decent
- Linear SVC
- Decision Tree
- Gradient Boosted Trees
10.3.1 建模前准备
- ROC曲线绘制
# Plot a single ROC curve with its AUC shown in the legend.
def plot_roc_curve(y_test, preds):
    """Draw the ROC curve of scores *preds* against true labels *y_test*."""
    false_pos, true_pos, _thresholds = metrics.roc_curve(y_test, preds)
    area = metrics.auc(false_pos, true_pos)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos, true_pos, 'b', label = 'AUC = %0.2f' % area)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
- 模型训练函数
# Train a model, then report test accuracy and 10-fold CV accuracy.
def fit_ml_algo(algo, X_train, y_train, X_test, cv):
    """Fit *algo*, predict on X_test, and cross-validate on the training set.

    Returns (train_pred, test_pred, acc, acc_cv, probs) where probs is the
    positive-class probability vector, or the string "Not Available" for
    estimators without predict_proba.
    """
    # Fit on the training data.
    model = algo.fit(X_train, y_train)
    # Predict labels for the held-out test features.
    test_pred = model.predict(X_test)
    if (isinstance(algo, (LogisticRegression,
                          KNeighborsClassifier,
                          GaussianNB,
                          DecisionTreeClassifier,
                          RandomForestClassifier,
                          GradientBoostingClassifier))):
        # These estimators expose predict_proba; keep positive-class scores for ROC plots.
        probs = model.predict_proba(X_test)[:,1]
    else:
        # e.g. LinearSVC has no predict_proba
        probs = "Not Available"
    # NOTE(review): y_test is read from module scope here — it is NOT a
    # parameter of this function; confirm the global is defined before calling.
    acc = round(model.score(X_test, y_test) * 100, 2)
    # Cross-validated predictions on the training set.
    train_pred = model_selection.cross_val_predict(algo,
                                                   X_train,
                                                   y_train,
                                                   cv=cv,
                                                   ) # n_jobs left at its default (original comment claimed "all cores", but n_jobs is not set)
    acc_cv = round(metrics.accuracy_score(y_train, train_pred) * 100, 2)
    return train_pred, test_pred, acc, acc_cv, probs
10.3.2 逻辑回归
- 参数选择
# Logistic regression: randomised search for good hyper-parameters.
# Helper that prints the top-ranked parameter settings from a CV result dict.
def report(results, n_top=5):
    """Print rank, mean/std validation score and parameters for the top *n_top* settings."""
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
# Parameter distributions to sample from.
# NOTE(review): 'l1' requires a compatible solver (liblinear/saga) in recent
# scikit-learn releases — confirm against the installed version.
param_dist = {'penalty': ['l2', 'l1'],
              'class_weight': [None, 'balanced'],
              'C': np.logspace(-20, 20, 10000),
              'intercept_scaling': np.logspace(-20, 20, 10000)}
# Randomly sample 20 parameter settings.
n_iter_search = 20
lrc = LogisticRegression()
random_search = RandomizedSearchCV(lrc,
                                   n_jobs=1,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search)
start = time.time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time.time() - start), n_iter_search))
report(random_search.cv_results_)
- 训练
# Train logistic regression and report held-out and cross-validated accuracy.
start_time = time.time()
train_pred_log, test_pred_log, acc_log, acc_cv_log, probs_log = fit_ml_algo(LogisticRegression(n_jobs = 1),
                                                                            X_train,
                                                                            y_train,
                                                                            X_test,
                                                                            10)
log_time = (time.time() - start_time)
print("Accuracy: %s" % acc_log)
print("Accuracy CV 10-Fold: %s" % acc_cv_log)
print("Running Time: %s" % datetime.timedelta(seconds=log_time))
- 测试
# Per-class precision/recall/F1 on the CV training predictions and on the test set.
print(metrics.classification_report(y_train, train_pred_log) )
print( metrics.classification_report(y_test, test_pred_log) )
plot_roc_curve(y_test, probs_log) # ROC curve
10.3.3 KNN
# KNN with k = 3 neighbours.
start_time = time.time()
train_pred_knn, test_pred_knn, acc_knn, acc_cv_knn, probs_knn = fit_ml_algo(KNeighborsClassifier(n_neighbors = 3,
                                                                                                 ),
                                                                            X_train,
                                                                            y_train,
                                                                            X_test,
                                                                            10)
knn_time = (time.time() - start_time)
print("Accuracy: %s" % acc_knn)
print("Accuracy CV 10-Fold: %s" % acc_cv_knn)
print("Running Time: %s" % datetime.timedelta(seconds=knn_time))
print (metrics.classification_report(y_train, train_pred_knn) )
print (metrics.classification_report(y_test, test_pred_knn) )
plot_roc_curve(y_test, probs_knn)
10.3.4 Naive Bayes
# Gaussian Naive Bayes.
start_time = time.time()
train_pred_gaussian, test_pred_gaussian, acc_gaussian, acc_cv_gaussian, probs_gau = fit_ml_algo(GaussianNB(),
                                                                                                X_train,
                                                                                                y_train,
                                                                                                X_test,
                                                                                                10)
gaussian_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gaussian)
print("Accuracy CV 10-Fold: %s" % acc_cv_gaussian)
print("Running Time: %s" % datetime.timedelta(seconds=gaussian_time))
print (metrics.classification_report(y_train, train_pred_gaussian) )
print( metrics.classification_report(y_test, test_pred_gaussian) )
plot_roc_curve(y_test, probs_gau)
10.3.5 Linear SVC
# Linear SVC (no predict_proba, so no ROC curve is drawn for this model).
start_time = time.time()
train_pred_svc, test_pred_svc, acc_linear_svc, acc_cv_linear_svc, _ = fit_ml_algo(LinearSVC(),
                                                                                  X_train,
                                                                                  y_train,
                                                                                  X_test,
                                                                                  10)
linear_svc_time = (time.time() - start_time)
print("Accuracy: %s" % acc_linear_svc)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_svc)
print("Running Time: %s" % datetime.timedelta(seconds=linear_svc_time))
print (metrics.classification_report(y_train, train_pred_svc) )
print (metrics.classification_report(y_test, test_pred_svc))
10.3.6 决策树
# Decision tree classifier (default parameters).
start_time = time.time()
train_pred_dt, test_pred_dt, acc_dt, acc_cv_dt, probs_dt = fit_ml_algo(DecisionTreeClassifier(),
                                                                       X_train,
                                                                       y_train,
                                                                       X_test,
                                                                       10)
dt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_dt)
print("Accuracy CV 10-Fold: %s" % acc_cv_dt)
print("Running Time: %s" % datetime.timedelta(seconds=dt_time))
print (metrics.classification_report(y_train, train_pred_dt) )
print (metrics.classification_report(y_test, test_pred_dt) )
plot_roc_curve(y_test, probs_dt)
10.3.7 随机森林
# Random forest: randomised hyper-parameter search.
# NOTE: this redefines the `report` helper already defined in the logistic
# regression section (the body is identical).
def report(results, n_top=5):
    """Print rank, mean/std validation score and parameters for the top *n_top* settings."""
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
# Parameter ranges to sample (sp_randint draws random integers).
param_dist = {"max_depth": [10, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 20),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
# Random search over 10 candidate settings, using all CPU cores.
n_iter_search = 10
rfc = RandomForestClassifier(n_estimators=10)
random_search = RandomizedSearchCV(rfc,
                                   n_jobs = -1,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search)
start = time.time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time.time() - start), n_iter_search))
report(random_search.cv_results_)
# Train a random forest with parameters chosen from the search above.
start_time = time.time()
rfc = RandomForestClassifier(n_estimators=10,
                             min_samples_leaf=2,
                             min_samples_split=17,
                             criterion='gini',
                             max_features=8)
train_pred_rf, test_pred_rf, acc_rf, acc_cv_rf, probs_rf = fit_ml_algo(rfc,
                                                                       X_train,
                                                                       y_train,
                                                                       X_test,
                                                                       10)
rf_time = (time.time() - start_time)
print("Accuracy: %s" % acc_rf)
print("Accuracy CV 10-Fold: %s" % acc_cv_rf)
print("Running Time: %s" % datetime.timedelta(seconds=rf_time))
print(metrics.classification_report(y_train, train_pred_rf) )
print( metrics.classification_report(y_test, test_pred_rf) )
plot_roc_curve(y_test, probs_rf)
10.3.8 Gradient Boosting Trees
# Gradient Boosting Trees (default parameters).
start_time = time.time()
train_pred_gbt, test_pred_gbt, acc_gbt, acc_cv_gbt, probs_gbt = fit_ml_algo(GradientBoostingClassifier(),
                                                                            X_train,
                                                                            y_train,
                                                                            X_test,
                                                                            10)
gbt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gbt)
print("Accuracy CV 10-Fold: %s" % acc_cv_gbt)
print("Running Time: %s" % datetime.timedelta(seconds=gbt_time))
print(metrics.classification_report(y_train, train_pred_gbt) )
print( metrics.classification_report(y_test, test_pred_gbt) )
plot_roc_curve(y_test, probs_gbt)
10.3.9 模型性能对比
# Rank the models by held-out test accuracy.
# (In a notebook the sorted frame is displayed; the result is not reassigned.)
models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes',
              'Linear SVC',
              'Decision Tree', 'Gradient Boosting Trees'],
    'Score': [
        acc_knn,
        acc_log,
        acc_rf,
        acc_gaussian,
        acc_linear_svc,
        acc_dt,
        acc_gbt
    ]})
models.sort_values(by='Score', ascending=False)
# Rank the same models by 10-fold cross-validated accuracy.
models = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes',
              'Linear SVC',
              'Decision Tree', 'Gradient Boosting Trees'],
    'Score': [
        acc_cv_knn,
        acc_cv_log,
        acc_cv_rf,
        acc_cv_gaussian,
        acc_cv_linear_svc,
        acc_cv_dt,
        acc_cv_gbt
    ]})
models.sort_values(by='Score', ascending=False)
# Overlay the ROC curves of every model that exposes probability scores.
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(6,6))
models = [
    'KNN',
    'Logistic Regression',
    'Random Forest',
    'Naive Bayes',
    'Decision Tree',
    'Gradient Boosting Trees'
]
probs = [
    probs_knn,
    probs_log,
    probs_rf,
    probs_gau,
    probs_dt,
    probs_gbt
]
colors = [
    'blue',
    'green',
    'red',
    'cyan',
    'magenta',
    'yellow',
]
plt.title('Receiver Operating Characteristic')
plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

def plot_roc_curves(y_test, prob, model, color='b'):
    """Add one labelled ROC curve (with its AUC) to the current axes.

    *color* is a new, defaulted parameter: the original body read the loop
    variable ``i`` leaked from the caller's scope (``colors[i]``), silently
    coupling the function to the surrounding loop.  Passing the colour in
    keeps existing call sites working while removing the hidden dependency.
    """
    fpr, tpr, threshold = metrics.roc_curve(y_test, prob)
    roc_auc = metrics.auc(fpr, tpr)
    plt.plot(fpr, tpr, label = model + ' AUC = %0.2f' % roc_auc, color=color)
    plt.legend(loc = 'lower right')

for i, model in enumerate(models):
    plot_roc_curves(y_test, probs[i], model, colors[i])
plt.show()
