包版本信息

In [3]:
!pip freeze | grep numpy
!pip freeze | grep pandas
!pip freeze | grep scikit-learn
numpy==1.13.1
numpydoc==0.6.0
pandas==0.20.3
scikit-learn==0.19.0

数据预处理

In [57]:
import numpy as np
import pandas as pd

## 隐藏warning信息
import warnings
warnings.filterwarnings('ignore')

由于数据集以txt文件存储,使用Pandasread_table()函数读入数据,分隔符为\t

In [58]:
data = pd.read_table('seed.txt', sep='\t', names = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7','label'])
data.head()
Out[58]:
x1 x2 x3 x4 x5 x6 x7 label
0 15.26 14.84 0.8710 5.763 3.312 2.221 5.220 1
1 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 1
2 14.29 14.09 0.9050 5.291 3.337 2.699 4.825 1
3 13.84 13.94 0.8955 5.324 3.379 2.259 4.805 1
4 16.14 14.99 0.9034 5.658 3.562 1.355 5.175 1

然后,我们将data['label']的取值减1, 标签取值范围为[0, 1, 2]

In [59]:
data['label'] = data['label'] - 1
data.head()
Out[59]:
x1 x2 x3 x4 x5 x6 x7 label
0 15.26 14.84 0.8710 5.763 3.312 2.221 5.220 0
1 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 0
2 14.29 14.09 0.9050 5.291 3.337 2.699 4.825 0
3 13.84 13.94 0.8955 5.324 3.379 2.259 4.805 0
4 16.14 14.99 0.9034 5.658 3.562 1.355 5.175 0
In [60]:
data.shape
Out[60]:
(210, 8)
In [61]:
# 检查有无缺失值
np.sum(data.isnull())
Out[61]:
x1       0
x2       0
x3       0
x4       0
x5       0
x6       0
x7       0
label    0
dtype: int64
In [62]:
# 标签label的取值1,2,3
data['label'].value_counts()
Out[62]:
2    70
1    70
0    70
Name: label, dtype: int64
In [63]:
class_label = np.unique(data['label'])
class_label
Out[63]:
array([0, 1, 2])
In [64]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 8 columns):
x1       210 non-null float64
x2       210 non-null float64
x3       210 non-null float64
x4       210 non-null float64
x5       210 non-null float64
x6       210 non-null float64
x7       210 non-null float64
label    210 non-null int64
dtypes: float64(7), int64(1)
memory usage: 13.2 KB
In [65]:
data.describe()
Out[65]:
x1 x2 x3 x4 x5 x6 x7 label
count 210.000000 210.000000 210.000000 210.000000 210.000000 210.000000 210.000000 210.000000
mean 14.847524 14.559286 0.870999 5.628533 3.258605 3.700201 5.408071 1.000000
std 2.909699 1.305959 0.023629 0.443063 0.377714 1.503557 0.491480 0.818448
min 10.590000 12.410000 0.808100 4.899000 2.630000 0.765100 4.519000 0.000000
25% 12.270000 13.450000 0.856900 5.262250 2.944000 2.561500 5.045000 0.000000
50% 14.355000 14.320000 0.873450 5.523500 3.237000 3.599000 5.223000 1.000000
75% 17.305000 15.715000 0.887775 5.979750 3.561750 4.768750 5.877000 2.000000
max 21.180000 17.250000 0.918300 6.675000 4.033000 8.456000 6.550000 2.000000

分类建模

逻辑回归

In [19]:
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
In [20]:
def evaluate(pred,test_y):
    
    import seaborn as sns
    import matplotlib.pyplot as plt
    import sklearn.metrics as metrics
    # 输出分类的准确率
    print("Accuracy: %.4f"  % (metrics.accuracy_score(test_y,pred)))
    
    # 输出衡量分类效果的各项指标
    print(classification_report(test_y, pred)) 
    
    # 更直观的,我们通过seaborn画出混淆矩阵
    %matplotlib inline
    plt.figure(figsize=(6,4))
    colorMetrics = metrics.confusion_matrix(test_y,pred)
    
    # 坐标y代表test_y,即真实的类别,坐标x代表估计出的类别pred
    sns.heatmap(colorMetrics, annot=True, fmt='d', xticklabels=[1,2,3],yticklabels=[1,2,3])
    sns.plt.show()
In [23]:
## 数据准备

X = data.iloc[:, :-1]
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25)
In [24]:
lr = LogisticRegression()
param_dict = {'C': [0.001, 0.01, 0.1, 1, 10]}
gs = GridSearchCV(lr, param_grid = param_dict, scoring='accuracy')
gs.fit(X_train, y_train)
Out[24]:
GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)
In [25]:
pd.DataFrame(gs.cv_results_)
Out[25]:
mean_fit_time mean_score_time mean_test_score mean_train_score param_C params rank_test_score split0_test_score split0_train_score split1_test_score split1_train_score split2_test_score split2_train_score std_fit_time std_score_time std_test_score std_train_score
0 0.015219 0.002470 0.382166 0.381955 0.001 {u'C': 0.001} 5 0.415094 0.355769 0.377358 0.375000 0.352941 0.415094 0.019821 0.003011 0.025520 0.024714
1 0.001227 0.000423 0.783439 0.764393 0.01 {u'C': 0.01} 4 0.754717 0.778846 0.867925 0.759615 0.725490 0.754717 0.000030 0.000113 0.061473 0.010414
2 0.002254 0.000865 0.885350 0.882196 0.1 {u'C': 0.1} 3 0.886792 0.884615 0.905660 0.884615 0.862745 0.877358 0.000599 0.000528 0.017491 0.003421
3 0.002277 0.000545 0.910828 0.920356 1 {u'C': 1} 2 0.886792 0.932692 0.962264 0.903846 0.882353 0.924528 0.000724 0.000129 0.036763 0.012140
4 0.001934 0.000510 0.949045 0.949020 10 {u'C': 10} 1 0.943396 0.980769 1.000000 0.913462 0.901961 0.952830 0.000550 0.000198 0.040093 0.027610
In [26]:
print gs.best_params_
print gs.best_score_
{'C': 10}
0.949044585987
In [27]:
lr_cls = gs.best_estimator_
score = cross_val_score(lr_cls, X_train, y_train, cv = 10, scoring = 'accuracy')
print np.mean(score)
0.942679738562
In [28]:
evaluate(y_test, lr_cls.predict(X_test))
Accuracy: 0.9811
             precision    recall  f1-score   support

          0       0.94      1.00      0.97        16
          1       1.00      0.94      0.97        18
          2       1.00      1.00      1.00        19

avg / total       0.98      0.98      0.98        53

高斯混合模型

建立模型

在本案例中,我们使用Python语言实现高斯混合模型。我们把实现过程分为两个部分

  • 计算Expectation步

在这一步,我们计算每条数据记录$x_{n}$分属K个标签的概率$\gamma_{n}(z_{k}) = p(z_{k} = 1 \vert x_{n}, \theta)$。

In [33]:
'''expectation_step函数

输入参数:X: 数据集
        pis: p(z = 1)的概率
        mus: 高斯分布的均值
        sigmas: 高斯分布的协方差

返回参数: $x_{n}$分属K个标签的概率矩阵

'''
def expectation_step(X, pis, mus, sigmas):
    
    import numpy as np
    from scipy.stats import multivariate_normal
    
    gammas  = np.array([value[0]*multivariate_normal(mean = value[1], cov = value[2]).pdf(X) for value in zip(pis, mus, sigmas)])
    return gammas / gammas.sum(axis = 0)
  • 计算Maximization步

在这一步,我们以完全数据的对数极大似然函数为目标,更新参数pis, mus, sigmas

In [34]:
'''maximization_step函数

输入参数:X: 数据集
        gammas:$x_{n}$分属K个标签的概率矩阵
        pis: p(z = 1)的概率
        mus: 高斯分布的均值
        sigmas: 高斯分布的协方差
'''

def maximization_step(X, gammas, pis, mus, sigmas):
    
    # 更新pis
    pis = gammas.sum(axis = 1)  / gammas.shape[1]

    
    # 更新mus
    mus = np.multiply(1.0/gammas.sum(axis = 1).reshape(-1, 1), np.dot(gammas, X))
    
    
    # 更新sigmas
    
    for k in range(len(sigmas)):
      
            
        centered_X = X - mus[k]
            
        sigmas[k] = np.dot(np.multiply(gammas[k], centered_X.T), centered_X) / gammas[k].sum()
        

更新最大似然函数的下界lower_bound

In [35]:
'''cal_lower_bound函数

输入参数:X: 数据集
        gammas:$x_{n}$分属K个标签的概率矩阵
        pis: p(z = 1)的概率
        mus: 训练得到的高斯分布的均值
        sigmas: 训练得到的高斯分布的协方差

返回参数:对数最大似然函数的下界

'''

def cal_lower_bound(X, gammas, pis, mus, sigmas):
    
    from scipy.stats import multivariate_normal

    lower_bound = 0
        
    for gamma, pi, mu, sigma in zip(gammas, pis, mus, sigmas):
    
        lower_bound  += np.multiply(gamma, np.log(pi*multivariate_normal(mu, sigma).pdf(X)))

    return lower_bound.sum()

接下来,我们实现train_GMM函数,计算各个高斯分布的统计特征。

In [36]:
'''train_GMM函数

输入参数:X: 数据集
        k: 高斯分布的个数
        max_iter: 迭代的最高次数
        alpha: 最大似然函数下界收敛的阈值
        
        
返回参数:(mus, sigmas)以元组形式返回参数
        mus: 高斯分布的均值 K times d
        sigmas:高斯分布的协方差 d times d
        
'''
def train_GMM(X, K, max_iter, alpha):

    indicator = 0
    lower_bound = 0
   
    # 初始化
    N, d = X.shape
    d = d - 1

    # 初始化pi,一维数组,即 K times 1的向量
    pis = np.random.random(K)
    pis /= pis.sum()

    # 初始化mu,二维数组,即 K times d的矩阵
    #mus = np.random.random((K, d))
    mus = np.array([X[X['label'] == num].mean()[:-1] for num in range(K)])

    # 初始化sigmas  三维数组, 即长度为K,元素为 d times d 矩阵的数组
    sigmas = np.array([np.eye(d)] * K)
    
    X_hat = X.iloc[:,:-1].values
    while indicator <= max_iter:
    
        lower_bound_old = lower_bound
    
        # E-step
        gammas = expectation_step(X_hat, pis, mus, sigmas)
        
        # M-step
        maximization_step(X_hat, gammas, pis, mus, sigmas)
        
        # Update 下界
        lower_bound = cal_lower_bound(X_hat, gammas, pis, mus, sigmas)
    
        # 检查终止条件
        if abs(lower_bound - lower_bound_old) < alpha:
            print 'reach stopping criterion!'
            break
    
        # 更新迭代次数
        indicator += 1
    return (mus, sigmas)

计算高斯分布的参数之后,我们需要判定样本来自于各个分布的概率大小。也就是说将样本代入分布的密度函数,计算其密度,取最大值所在的分布作为该样本的总体分布,并将样本归属于样本所在的类。

In [37]:
'''predict函数

输入参数: X:数据集
        mus: 各个高斯分布的均值  K time d
        sigmas: 各个高斯分布的协方差   K times 1 元素为d times d 矩阵

输出参数:数据集的各个数据记录所对应的标签(如0,1,2)

'''

def predict(X, mus, sigmas):
    
    from scipy.stats import multivariate_normal

    labels = np.zeros(X.shape[0])
    
    for index, item in enumerate(X.iloc[:,:-1].values):
    
        probs = [multivariate_normal(mu, sigma).pdf(item) for mu, sigma in zip(mus, sigmas)]
    
        labels[index] = probs.index(max(probs))
        
    return labels

算法验证

我们人为生成300个分别来自3个不同高斯分布的数据样本,组成验证数据集。

In [38]:
# 生成来自高斯混合分布的数据
# 数据集来自3个高斯分布,每个分布产生100个数据样本,数据集的维度为2
#np.random.seed(5)

mus = [[0, 4], [-2, 0], [4, 4]]

sigs = [[[3, 0], [0, 0.5]], [[1, 0], [0, 2]], [[1, 0], [0,1]]]


# 数据集X
multi_data = [np.random.multivariate_normal(mu, sig, 100) for mu, sig in zip(mus, sigs)]

data = np.vstack(multi_data)

## 数据集的类别标签为0,1,2
labels = [[value]*100 for value in range(3)]
## 把嵌套数组labels拉平
multi_labels = [y for x in labels for y in x]


# 数据集标签合并
final_data = []

for v1, v2 in zip(data, multi_labels):
    list_v1 = v1.tolist()
    list_v1.append(v2)
    final_data.append(list_v1) 

##np.random.shuffle(final_data)

把数据集转化为Pandas中的DataFrame格式。

In [39]:
final_data = pd.DataFrame(final_data, columns = ['x1', 'x2', 'label'])
final_data.head()
Out[39]:
x1 x2 label
0 -3.863865 5.011869 0
1 1.742150 3.624853 0
2 0.435850 5.268131 0
3 3.613564 3.530604 0
4 0.916746 4.839165 0

测试数据集准备好之后,我们把final_data代入函数中进行计算。

In [40]:
mus, sigmas = train_GMM(final_data, 3, 1000, 1e-5)
reach stopping criterion!
In [41]:
# 根据训练集生成的均值
mus
Out[41]:
array([[ 0.091542  ,  3.98930488],
       [-2.07339681, -0.05747473],
       [ 3.90201609,  4.13091042]])
In [42]:
# 根据训练集生成的协方差
sigmas
Out[42]:
array([[[ 3.05429524, -0.14215967],
        [-0.14215967,  0.52348837]],

       [[ 1.11471389,  0.08816557],
        [ 0.08816557,  1.88432252]],

       [[ 1.20274147, -0.02181217],
        [-0.02181217,  1.09511199]]])
In [43]:
# 在训练集上预测的标签

label_pred = predict(final_data, mus, sigmas)
print label_pred
[ 0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  2.  2.  0.  0.  0.  0.
  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.  0.  0.  0.
  2.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  2.  2.  0.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.]
In [44]:
## 模型评价
evaluate(label_pred, final_data['label'].values)
Accuracy: 0.9567
             precision    recall  f1-score   support

          0       0.97      0.90      0.93       100
          1       1.00      0.98      0.99       100
          2       0.91      0.99      0.95       100

avg / total       0.96      0.96      0.96       300

使用sklearnGaussian函数

In [45]:
from sklearn.mixture import GaussianMixture
In [46]:
gmm_model = GaussianMixture(n_components = 3, covariance_type = 'diag', init_params='random', max_iter=200)

gmm_model.means_init = np.array([final_data[final_data['label'] == num].mean(axis = 0)[:-1] for num in range(3)])

gmm_model.fit(final_data.iloc[:,:-1])

y_pred = gmm_model.predict(final_data.iloc[:,:-1])



diff = y_pred  - final_data['label'].values

evaluate(y_pred, final_data['label'].values)
Accuracy: 0.9567
             precision    recall  f1-score   support

          0       0.95      0.92      0.93       100
          1       1.00      0.97      0.98       100
          2       0.92      0.98      0.95       100

avg / total       0.96      0.96      0.96       300

接下来,我们查看训练得到的均值和协方差矩阵。

In [47]:
gmm_model.means_
Out[47]:
array([[ 0.14315648,  3.99572962],
       [-2.08035331, -0.11343775],
       [ 3.8482867 ,  4.0898108 ]])
In [48]:
gmm_model.covariances_
Out[48]:
array([[ 4.01553565,  0.61132795],
       [ 1.0545162 ,  1.44822495],
       [ 1.01247237,  1.14685908]])

真实数据集

使用sklearn中的GaussianMixture模型,查看数据集是否适合使用高斯混合模型。

In [66]:
gmm_model = GaussianMixture(n_components = 3, covariance_type = 'diag', init_params='random', max_iter=200)

gmm_model.means_init = np.array([data[data['label'] == num].mean(axis = 0)[:-1] for num in range(3)])

gmm_model.fit(data.iloc[:,:-1])

y_pred = gmm_model.predict(data.iloc[:,:-1])


diff = y_pred  - data['label'].values

evaluate(y_pred, data['label'].values)
Accuracy: 0.8762
             precision    recall  f1-score   support

          0       0.84      0.77      0.81        70
          1       0.97      0.86      0.91        70
          2       0.83      1.00      0.91        70

avg / total       0.88      0.88      0.87       210

准确率接近90%,数据集可以使用高斯混合模型来进行分类,也就是认为数据集的各个部分服从高斯分布。接下来,我们使用自定义的模型,去拟合数据集。

首先,使用数据集训练模型。我们将模型中高斯分布的个数设为3,迭代次数设为2000,终止条件的阈值设为1e-5

In [71]:
mus, sigmas = train_GMM(data, 3, 2000, 1e-5)

train_GMM()函数返回各个高斯分布component的参数值。

In [74]:
## 均值
mus
Out[74]:
array([[ 14.33442857,  14.29428571,   0.88007   ,   5.50805714,
          3.24462857,   2.66740286,   5.08721429],
       [ 18.33428571,  16.13571429,   0.88351714,   6.14802857,
          3.67741429,   3.6448    ,   6.0206    ],
       [ 11.87385714,  13.24785714,   0.84940857,   5.22951429,
          2.85377143,   4.7884    ,   5.1164    ]])
In [75]:
## 协方差矩阵
sigmas
Out[75]:
array([[[  9.61042610e-01,   4.49091020e-01,   3.64855457e-03,
           1.56014211e-01,   1.17141015e-01,  -1.27065732e-02,
           1.88899096e-01],
        [  4.49091020e-01,   2.25678067e-01,  -2.32360248e-04,
           8.79441594e-02,   4.61847713e-02,  -1.09081099e-02,
           1.05092971e-01],
        [  3.64855457e-03,  -2.32360248e-04,   2.53796176e-04,
          -1.24553446e-03,   1.48731511e-03,   1.09097179e-03,
          -1.30512355e-03],
        [  1.56014211e-01,   8.79441594e-02,  -1.24553446e-03,
           4.29723198e-02,   9.93728431e-03,   3.18456598e-03,
           4.96169562e-02],
        [  1.17141015e-01,   4.61847713e-02,   1.48731511e-03,
           9.93728431e-03,   2.07285674e-02,   1.02206141e-03,
           1.19468383e-02],
        [ -1.27065732e-02,  -1.09081099e-02,   1.09097179e-03,
           3.18456598e-03,   1.02206141e-03,   1.49197054e+00,
           4.91593961e-02],
        [  1.88899096e-01,   1.05092971e-01,  -1.30512355e-03,
           4.96169562e-02,   1.19468383e-02,   4.91593961e-02,
           8.22456169e-02]],

       [[  1.57851092e+00,   6.62340210e-01,   3.56424248e-03,
           2.56042627e-01,   1.71127913e-01,   7.63942476e-04,
           2.28768249e-01],
        [  6.62340210e-01,   2.95883847e-01,  -4.75500505e-04,
           1.23587613e-01,   6.15031906e-02,   1.24584162e-02,
           1.12445553e-01],
        [  3.56424248e-03,  -4.75500505e-04,   2.26310981e-04,
          -1.18668938e-03,   1.51843470e-03,  -1.93389914e-03,
          -1.28036605e-03],
        [  2.56042627e-01,   1.23587613e-01,  -1.18668938e-03,
           6.34684521e-02,   1.76146078e-02,  -8.11197909e-03,
           5.91468482e-02],
        [  1.71127913e-01,   6.15031906e-02,   1.51843470e-03,
           1.76146078e-02,   2.61011026e-02,   6.85939592e-03,
           1.34256743e-02],
        [  7.63942476e-04,   1.24584162e-02,  -1.93389914e-03,
          -8.11197909e-03,   6.85939592e-03,   1.41923766e+00,
          -1.50619323e-02],
        [  2.28768249e-01,   1.12445553e-01,  -1.28036605e-03,
           5.91468482e-02,   1.34256743e-02,  -1.50619323e-02,
           6.38161865e-02]],

       [[  4.85828865e-01,   2.15529230e-01,   6.98968287e-03,
           5.38062306e-02,   8.22069311e-02,   4.78245130e-02,
           3.23371835e-02],
        [  2.15529230e-01,   1.16762669e-01,   3.92567483e-04,
           3.96924548e-02,   2.63491802e-02,   4.63964515e-02,
           3.34474027e-02],
        [  6.98968287e-03,   3.92567483e-04,   4.48495788e-04,
          -1.26743868e-03,   2.46968621e-03,  -3.04704732e-03,
          -2.00181745e-03],
        [  5.38062306e-02,   3.96924548e-02,  -1.26743868e-03,
           2.01866081e-02,   1.52143114e-03,   3.86411654e-02,
           2.06172222e-02],
        [  8.22069311e-02,   2.63491802e-02,   2.46968621e-03,
           1.52143114e-03,   1.94984539e-02,   1.04909818e-02,
          -3.64911722e-03],
        [  4.78245130e-02,   4.63964515e-02,  -3.04704732e-03,
           3.86411654e-02,   1.04909818e-02,   1.94417811e+00,
           5.69210636e-02],
        [  3.23371835e-02,   3.34474027e-02,  -2.00181745e-03,
           2.06172222e-02,  -3.64911722e-03,   5.69210636e-02,
           3.19963703e-02]]])

预测数据集的标签。

In [76]:
label_pred = predict(data, mus, sigmas)
label_pred
Out[76]:
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  2.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,  2.,  2.,  0.,  0.,
        0.,  0.,  0.,  0.,  2.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  0.,  2.,  2.,  2.,  0.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.])
In [73]:
evaluate(label_pred, data['label'].values)
Accuracy: 0.9286
             precision    recall  f1-score   support

          0       0.90      0.89      0.89        70
          1       0.97      0.93      0.95        70
          2       0.92      0.97      0.94        70

avg / total       0.93      0.93      0.93       210