import pandas as pd
import numpy as np
# Load the raw dress-recommendation table from disk.
data = pd.read_csv('./input/dress-recommendation.csv')

# Notebook-style inspection. info() prints its summary; the other two bare
# expressions are only displayed when run interactively.
data.head()
data.info()
data['Style'].value_counts()

# Categorical columns that get one-hot encoded.
dummpy_list = [
    'Style',
    'Price',
    'Size',
    'Season',
    'NeckLine',
    'SleeveLength',
    'waiseline',
    'Material',
    'FabricType',
    'Decoration',
    'Pattern Type',
]

# One indicator frame per categorical column (prefix is the column name plus
# a trailing underscore), followed by the two columns that are kept as-is.
transfer_list = [
    pd.get_dummies(data[[column]], prefix='{}_'.format(column))
    for column in dummpy_list
] + [data[['Rating', 'Recommendation']]]

# Stitch everything together column-wise; 'Recommendation' ends up last.
new_data = pd.concat(transfer_list, axis=1)
new_data['Recommendation'].value_counts()
def evaluate(pred, test_y):
    """Print classification metrics and plot the confusion matrix.

    Args:
        pred: Predicted class labels.
        test_y: Ground-truth class labels, aligned with ``pred``.

    Prints the accuracy and scikit-learn's classification report, then
    shows a seaborn heatmap of the confusion matrix. Returns ``None``.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt
    import sklearn.metrics as metrics

    # Print the classification accuracy.
    print("Accuracy: %.4f" % (metrics.accuracy_score(test_y, pred)))
    # Print the per-class metrics that summarise classification quality.
    print(metrics.classification_report(test_y, pred))

    # For a more visual view, draw the confusion matrix with seaborn.
    # NOTE: the `%matplotlib inline` IPython magic that used to sit here is
    # not valid Python inside a function; enable it once at notebook level
    # if inline rendering is needed.
    plt.figure(figsize=(6, 4))
    colorMetrics = metrics.confusion_matrix(test_y, pred)
    # The y axis is test_y (the true class); the x axis is the predicted
    # class pred.
    sns.heatmap(
        colorMetrics,
        annot=True,
        fmt='d',
        xticklabels=[0, 1],
        yticklabels=[0, 1],
    )
    # Use matplotlib directly: the `sns.plt` alias is not available in
    # current seaborn releases.
    plt.show()
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. Features are every column except the
# last one; the target is the 'Recommendation' column. The seed is fixed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    new_data.iloc[:, :-1],
    new_data['Recommendation'],
    test_size=.3,
    random_state=0
)
协方差类型(covariance_type)的取值为:{'full', 'tied', 'diag', 'spherical'},默认取值为 'full'。
- full:高斯混合模型的每个组成部分(component)都有自己独立的协方差矩阵
- tied:高斯混合模型的所有组成部分(component)都分享相同的协方差矩阵
- diag:高斯混合模型的每个组成部分(component)都有自己的对角化的协方差矩阵
- spherical:高斯混合模型的每个组成部分(component)拥有一个方差
# Covariance structures to compare, one GaussianMixture per entry.
cov_type = ['full', 'tied', 'diag', 'spherical']

# Labels present in the training target.
reco_label = np.unique(y_train)

# Per-class feature means of the training data, one row per label, in the
# same order as reco_label.
mean_list = np.array([X_train[y_train == value].mean(axis=0) for value in reco_label])
mean_list

# Seed each mixture with the class means through `means_init`. Assigning
# `means_` on an unfitted estimator has no effect because fit() re-initialises
# the parameters. n_components is tied to the number of labels so it always
# matches the number of rows in mean_list.
model_list = [
    GaussianMixture(
        n_components=len(reco_label),
        covariance_type=cov_name,
        init_params='random',
        max_iter=20,
        means_init=mean_list,
    )
    for cov_name in cov_type
]

for gmm_model in model_list:
    gmm_model.fit(X_train)
    # predict() returns component indices; component k was initialised at the
    # mean of reco_label[k]. NOTE(review): this assumes the labels are exactly
    # 0..k-1 so that indices and labels coincide -- confirm against the data.
    y_pred = gmm_model.predict(X_test)
    # evaluate() takes (pred, test_y): predictions first, ground truth second.
    evaluate(y_pred, y_test)