包版本信息¶

!pip freeze | grep numpy
!pip freeze | grep pandas
!pip freeze | grep scikit-learn

numpy==1.13.1
numpydoc==0.6.0
pandas==0.20.3
scikit-learn==0.19.0

数据预处理¶

import numpy as np
import pandas as pd

## 隐藏warning信息
import warnings
warnings.filterwarnings('ignore')

由于数据集以txt文件存储，使用Pandas的read_table()函数读入数据，分隔符为\t。

data = pd.read_table('seed.txt', sep='\t', names = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7','label'])
data.head()

然后，我们将data['label']的取值减1，使标签的取值由[1, 2, 3]变为[0, 1, 2]。

data['label'] = data['label'] - 1
data.head()

data.shape

(210, 8)

# 检查有无缺失值
np.sum(data.isnull())

x1       0
x2       0
x3       0
x4       0
x5       0
x6       0
x7       0
label    0
dtype: int64

# Number of samples per class; after the shift by one above, the labels are 0, 1, 2
data['label'].value_counts()

2    70
1    70
0    70
Name: label, dtype: int64

class_label = np.unique(data['label'])
class_label

array([0, 1, 2])

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 8 columns):
x1       210 non-null float64
x2       210 non-null float64
x3       210 non-null float64
x4       210 non-null float64
x5       210 non-null float64
x6       210 non-null float64
x7       210 non-null float64
label    210 non-null int64
dtypes: float64(7), int64(1)
memory usage: 13.2 KB

data.describe()

分类建模¶

逻辑回归¶

# cross_val_score is imported from sklearn.model_selection: the legacy
# sklearn.cross_validation module is deprecated and absent from newer releases.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
# Use the public package path instead of the private
# sklearn.linear_model.logistic submodule.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

def evaluate(pred,test_y):
    
    import seaborn as sns
    import matplotlib.pyplot as plt
    import sklearn.metrics as metrics
    # 输出分类的准确率
    print("Accuracy: %.4f"  % (metrics.accuracy_score(test_y,pred)))
    
    # 输出衡量分类效果的各项指标
    print(classification_report(test_y, pred)) 
    
    # 更直观的，我们通过seaborn画出混淆矩阵
    %matplotlib inline
    plt.figure(figsize=(6,4))
    colorMetrics = metrics.confusion_matrix(test_y,pred)
    
    # 坐标y代表test_y，即真实的类别，坐标x代表估计出的类别pred
    sns.heatmap(colorMetrics, annot=True, fmt='d', xticklabels=[1,2,3],yticklabels=[1,2,3])
    sns.plt.show()

## Data preparation: separate the target from the features, hold out 25% for testing

y = data['label']
X = data.drop('label', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Grid-search the regularisation parameter C of a logistic regression,
# selecting by accuracy.
param_dict = {'C': [0.001, 0.01, 0.1, 1, 10]}
lr = LogisticRegression()
gs = GridSearchCV(estimator=lr, param_grid=param_dict, scoring='accuracy')
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

pd.DataFrame(gs.cv_results_)

# Best hyper-parameters found by the grid search and their mean CV accuracy.
# Single-argument print() behaves the same on Python 2 and Python 3.
print(gs.best_params_)
print(gs.best_score_)

{'C': 10}
0.949044585987

# Re-score the tuned model with 10-fold cross-validation on the training split.
lr_cls = gs.best_estimator_
score = cross_val_score(lr_cls, X_train, y_train, cv = 10, scoring = 'accuracy')
print(np.mean(score))

0.942679738562

# evaluate() takes (pred, test_y): predictions first, ground truth second.
# Passing them the other way round transposes precision and recall.
evaluate(lr_cls.predict(X_test), y_test)

Accuracy: 0.9811
             precision    recall  f1-score   support

          0       0.94      1.00      0.97        16
          1       1.00      0.94      0.97        18
          2       1.00      1.00      1.00        19

avg / total       0.98      0.98      0.98        53

高斯混合模型¶

建立模型¶

在本案例中，我们使用Python语言实现高斯混合模型。我们把实现过程分为两个部分：Expectation步（E步）和Maximization步（M步）。

计算Expectation步

在这一步，我们计算每条数据记录$x_{n}$分属K个标签的概率$\gamma_{n}(z_{k}) = p(z_{k} = 1 \vert x_{n}, \theta)$。

def expectation_step(X, pis, mus, sigmas):
    """E-step: posterior probability of every component for every sample.

    Parameters
    ----------
    X : array-like
        Data set, one sample per row.
    pis : sequence of K floats
        Mixing weights p(z_k = 1).
    mus : sequence of K vectors
        Means of the Gaussian components.
    sigmas : sequence of K matrices
        Covariance matrices of the Gaussian components.

    Returns
    -------
    numpy.ndarray
        Responsibilities gamma[k, n] = p(z_k = 1 | x_n, theta), one row per
        component and one column per sample; each column sums to 1.
    """
    import numpy as np
    from scipy.stats import multivariate_normal

    # Work in log space. Multiplying raw densities underflows to 0 for
    # samples far from every mean, and the normalisation then becomes 0 / 0.
    # log(0) for a zero mixing weight is an expected -inf, hence the errstate.
    with np.errstate(divide='ignore'):
        log_weighted = np.array([
            np.log(pi) + multivariate_normal(mean=mu, cov=sigma).logpdf(X)
            for pi, mu, sigma in zip(pis, mus, sigmas)
        ])

    # Subtracting the per-sample maximum leaves the normalised ratios
    # unchanged but guarantees that the largest term is exp(0) = 1.
    log_weighted = log_weighted - log_weighted.max(axis=0)
    weighted = np.exp(log_weighted)
    return weighted / weighted.sum(axis=0)

计算Maximization步

在这一步，我们以完全数据的对数极大似然函数为目标，更新参数pis, mus, sigmas

def maximization_step(X, gammas, pis, mus, sigmas):
    """M-step: re-estimate the mixture parameters from the responsibilities.

    Parameters
    ----------
    X : numpy.ndarray
        Data set, one sample per row.
    gammas : numpy.ndarray
        Responsibilities, one row per component and one column per sample.
    pis : mutable sequence of K floats
        Mixing weights p(z_k = 1); overwritten in place.
    mus : mutable sequence of K vectors
        Component means; overwritten in place.
    sigmas : mutable sequence of K matrices
        Component covariances; overwritten in place.

    Returns
    -------
    tuple
        ``(pis, mus, sigmas)``, the same objects that were passed in, now
        holding the updated values.
    """
    # Total responsibility carried by each component.
    component_mass = gammas.sum(axis=1)

    # Slice assignment writes into the caller's containers. Rebinding the
    # local names instead would silently discard the new weights and means.
    pis[:] = component_mass / gammas.shape[1]
    mus[:] = np.dot(gammas, X) / component_mass.reshape(-1, 1)

    # Responsibility-weighted covariance around the freshly updated means.
    for k in range(len(sigmas)):
        centered_X = X - mus[k]
        sigmas[k] = np.dot(np.multiply(gammas[k], centered_X.T), centered_X) / component_mass[k]

    return pis, mus, sigmas

更新最大似然函数的下界lower_bound。

def cal_lower_bound(X, gammas, pis, mus, sigmas):
    """Return the quantity monitored for EM convergence.

    Computes sum_k sum_n gamma[k, n] * (log pi_k + log N(x_n | mu_k, sigma_k)),
    i.e. the responsibility-weighted log joint density of the data.

    Parameters
    ----------
    X : array-like
        Data set, one sample per row.
    gammas : array-like
        Responsibilities, one row per component and one column per sample.
    pis : sequence of K floats
        Mixing weights p(z_k = 1).
    mus : sequence of K vectors
        Fitted component means.
    sigmas : sequence of K matrices
        Fitted component covariances.

    Returns
    -------
    float
        The summed bound over all components and samples.
    """
    from scipy.stats import multivariate_normal

    lower_bound = 0.0

    for gamma, pi, mu, sigma in zip(gammas, pis, mus, sigmas):
        # logpdf avoids log(0) = -inf when the density itself underflows.
        # The errstate covers the remaining log(0) of a zero mixing weight.
        with np.errstate(divide='ignore', invalid='ignore'):
            log_joint = np.log(pi) + multivariate_normal(mu, sigma).logpdf(X)
            weighted = np.multiply(gamma, log_joint)

        # Terms with zero responsibility contribute nothing (0 * log 0 := 0).
        # Without this a single 0 * -inf turns the whole bound into NaN and
        # the convergence test can never succeed.
        lower_bound += np.where(np.asarray(gamma) > 0, weighted, 0.0).sum()

    return lower_bound

接下来，我们实现train_GMM函数，计算各个高斯分布的统计特征。

def train_GMM(X, K, max_iter, alpha):
    """Fit a K-component Gaussian mixture to X with the EM algorithm.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns followed by a last column named ``label``. The labels
        are only used to initialise the component means, so they are expected
        to take the values 0 .. K-1.
    K : int
        Number of Gaussian components.
    max_iter : int
        Maximum number of EM iterations.
    alpha : float
        Convergence threshold on the change of the monitored bound between
        two consecutive iterations.

    Returns
    -------
    tuple
        ``(mus, sigmas)``: the component means (K x d) and the K component
        covariance matrices (each d x d), where d is the number of features.
    """
    # The last column holds the label, so it is not a feature.
    d = X.shape[1] - 1

    # Mixing weights: random, normalised to sum to 1.
    pis = np.random.random(K)
    pis /= pis.sum()

    # Means: per-class feature means (the mean of the label column is dropped).
    mus = np.array([X[X['label'] == num].mean()[:-1] for num in range(K)], dtype=float)

    # Covariances: K identity matrices of size d x d.
    sigmas = np.array([np.eye(d)] * K)

    X_hat = X.iloc[:, :-1].values

    # None marks "no previous value yet", so the first iteration can never
    # look converged by accident (the old code compared against 0).
    lower_bound = None

    # range(max_iter) performs exactly max_iter iterations; the previous
    # `while indicator <= max_iter` loop performed one more than that.
    for _ in range(max_iter):

        lower_bound_old = lower_bound

        # E-step
        gammas = expectation_step(X_hat, pis, mus, sigmas)

        # M-step. The return value is not used here, so updated parameters
        # reach this scope only through in-place writes to pis, mus and sigmas.
        maximization_step(X_hat, gammas, pis, mus, sigmas)

        # Update the monitored bound.
        lower_bound = cal_lower_bound(X_hat, gammas, pis, mus, sigmas)

        # Stop once the bound has stabilised.
        if lower_bound_old is not None and abs(lower_bound - lower_bound_old) < alpha:
            print('reach stopping criterion!')
            break

    return (mus, sigmas)

计算出各个高斯分布的参数之后，我们需要判断每个样本最可能来自哪一个分布。也就是说，将样本代入各个分布的密度函数，计算其密度，取密度最大的分布作为该样本所属的分布，并将样本归入该分布对应的类别。

def predict(X, mus, sigmas):
    """Assign every sample to the Gaussian component with the highest density.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns followed by one trailing column that is ignored
        (the label column in this notebook).
    mus : sequence of K vectors
        Component means.
    sigmas : sequence of K matrices
        Component covariance matrices.

    Returns
    -------
    numpy.ndarray
        One float per row of X holding the index (0, 1, 2, ...) of the
        component with the highest density for that sample.
    """
    from scipy.stats import multivariate_normal

    features = X.iloc[:, :-1].values
    if features.shape[0] == 0:
        return np.zeros(0)

    # Build each frozen distribution once and score all rows in a single
    # call. Log densities keep the comparison meaningful for samples so far
    # from every mean that the plain densities all underflow to 0.
    log_density = [
        multivariate_normal(mu, sigma).logpdf(features)
        for mu, sigma in zip(mus, sigmas)
    ]

    # SciPy squeezes the result for a single sample; restore the
    # (component, sample) layout before taking the arg-max.
    log_density = np.array(log_density).reshape(len(log_density), -1)

    # Float labels, as before (the original filled an np.zeros array).
    return log_density.argmax(axis=0).astype(float)

算法验证¶

我们人为生成300个分别来自3个不同高斯分布的数据样本，组成验证数据集。

# Generate samples from a Gaussian mixture: three 2-D Gaussians,
# 100 samples from each.
# For a reproducible draw, call np.random.seed(5) first.

mus = [[0, 4], [-2, 0], [4, 4]]

sigs = [[[3, 0], [0, 0.5]], [[1, 0], [0, 2]], [[1, 0], [0, 1]]]


# Feature matrix. It gets its own name so that the seeds DataFrame `data`
# loaded at the top of the notebook, which later cells still use, is not
# overwritten by this array.
multi_data = [np.random.multivariate_normal(mu, sig, 100) for mu, sig in zip(mus, sigs)]

synthetic_X = np.vstack(multi_data)

## Class labels 0, 1, 2: one block of 100 per component
labels = [[value] * 100 for value in range(3)]
## Flatten the nested list. The loop variables are deliberately not called
## x / y: on Python 2 they leak into the module namespace and would clobber
## the target `y` defined earlier.
multi_labels = [label for group in labels for label in group]


# Append the label to every sample: rows of [x1, x2, label].
# Shuffling is intentionally left out, so rows stay grouped by component.
final_data = [point.tolist() + [label] for point, label in zip(synthetic_X, multi_labels)]

把数据集转化为Pandas中的DataFrame格式。

final_data = pd.DataFrame(final_data, columns = ['x1', 'x2', 'label'])
final_data.head()

测试数据集准备好之后，我们把final_data代入函数中进行计算。

mus, sigmas = train_GMM(final_data, 3, 1000, 1e-5)

reach stopping criterion!

# 根据训练集生成的均值
mus

array([[ 0.091542  ,  3.98930488],
       [-2.07339681, -0.05747473],
       [ 3.90201609,  4.13091042]])

# 根据训练集生成的协方差
sigmas

array([[[ 3.05429524, -0.14215967],
        [-0.14215967,  0.52348837]],

       [[ 1.11471389,  0.08816557],
        [ 0.08816557,  1.88432252]],

       [[ 1.20274147, -0.02181217],
        [-0.02181217,  1.09511199]]])

# Predicted component index for every sample of the training set.

label_pred = predict(final_data, mus, sigmas)
print(label_pred)

[ 0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  2.  2.  0.  0.  0.  0.
  0.  0.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.  0.  2.  0.  0.  0.  0.
  2.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  2.  2.  0.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.]

## 模型评价
evaluate(label_pred, final_data['label'].values)

Accuracy: 0.9567
             precision    recall  f1-score   support

          0       0.97      0.90      0.93       100
          1       1.00      0.98      0.99       100
          2       0.91      0.99      0.95       100

avg / total       0.96      0.96      0.96       300

使用`sklearn`的`GaussianMixture`类¶

from sklearn.mixture import GaussianMixture

# Features of the synthetic set (everything except the trailing label column).
synthetic_features = final_data.iloc[:, :-1]

# Start each component at the feature mean of one labelled class.
class_means = np.array([synthetic_features[final_data['label'] == num].mean(axis = 0) for num in range(3)])

gmm_model = GaussianMixture(n_components = 3, covariance_type = 'diag', init_params='random', max_iter=200,
                            means_init = class_means)

gmm_model.fit(synthetic_features)

y_pred = gmm_model.predict(synthetic_features)

# Element-wise difference between predicted and true labels (0 where they agree).
diff = y_pred - final_data['label'].values

evaluate(y_pred, final_data['label'].values)

Accuracy: 0.9567
             precision    recall  f1-score   support

          0       0.95      0.92      0.93       100
          1       1.00      0.97      0.98       100
          2       0.92      0.98      0.95       100

avg / total       0.96      0.96      0.96       300

接下来，我们查看训练得到的均值和协方差矩阵。

gmm_model.means_

array([[ 0.14315648,  3.99572962],
       [-2.08035331, -0.11343775],
       [ 3.8482867 ,  4.0898108 ]])

gmm_model.covariances_

array([[ 4.01553565,  0.61132795],
       [ 1.0545162 ,  1.44822495],
       [ 1.01247237,  1.14685908]])

真实数据集¶

使用sklearn中的GaussianMixture模型，查看数据集是否适合使用高斯混合模型。

# Features of the real data set (everything except the trailing label column).
real_features = data.iloc[:, :-1]

# Start each component at the feature mean of one labelled class.
class_means = np.array([real_features[data['label'] == num].mean(axis = 0) for num in range(3)])

gmm_model = GaussianMixture(n_components = 3, covariance_type = 'diag', init_params='random', max_iter=200,
                            means_init = class_means)

gmm_model.fit(real_features)

y_pred = gmm_model.predict(real_features)

# Element-wise difference between predicted and true labels (0 where they agree).
diff = y_pred - data['label'].values

evaluate(y_pred, data['label'].values)

Accuracy: 0.8762
             precision    recall  f1-score   support

          0       0.84      0.77      0.81        70
          1       0.97      0.86      0.91        70
          2       0.83      1.00      0.91        70

avg / total       0.88      0.88      0.87       210

准确率接近90%，数据集可以使用高斯混合模型来进行分类，也就是认为数据集的各个部分服从高斯分布。接下来，我们使用自定义的模型，去拟合数据集。

首先，使用数据集训练模型。我们将模型中高斯分布的个数设为3，迭代次数设为2000，终止条件的阈值设为1e-5。

mus, sigmas = train_GMM(data, 3, 2000, 1e-5)

train_GMM()函数返回各个高斯分布component的参数值。

## 均值
mus

array([[ 14.33442857,  14.29428571,   0.88007   ,   5.50805714,
          3.24462857,   2.66740286,   5.08721429],
       [ 18.33428571,  16.13571429,   0.88351714,   6.14802857,
          3.67741429,   3.6448    ,   6.0206    ],
       [ 11.87385714,  13.24785714,   0.84940857,   5.22951429,
          2.85377143,   4.7884    ,   5.1164    ]])

## 协方差矩阵
sigmas

array([[[  9.61042610e-01,   4.49091020e-01,   3.64855457e-03,
           1.56014211e-01,   1.17141015e-01,  -1.27065732e-02,
           1.88899096e-01],
        [  4.49091020e-01,   2.25678067e-01,  -2.32360248e-04,
           8.79441594e-02,   4.61847713e-02,  -1.09081099e-02,
           1.05092971e-01],
        [  3.64855457e-03,  -2.32360248e-04,   2.53796176e-04,
          -1.24553446e-03,   1.48731511e-03,   1.09097179e-03,
          -1.30512355e-03],
        [  1.56014211e-01,   8.79441594e-02,  -1.24553446e-03,
           4.29723198e-02,   9.93728431e-03,   3.18456598e-03,
           4.96169562e-02],
        [  1.17141015e-01,   4.61847713e-02,   1.48731511e-03,
           9.93728431e-03,   2.07285674e-02,   1.02206141e-03,
           1.19468383e-02],
        [ -1.27065732e-02,  -1.09081099e-02,   1.09097179e-03,
           3.18456598e-03,   1.02206141e-03,   1.49197054e+00,
           4.91593961e-02],
        [  1.88899096e-01,   1.05092971e-01,  -1.30512355e-03,
           4.96169562e-02,   1.19468383e-02,   4.91593961e-02,
           8.22456169e-02]],

       [[  1.57851092e+00,   6.62340210e-01,   3.56424248e-03,
           2.56042627e-01,   1.71127913e-01,   7.63942476e-04,
           2.28768249e-01],
        [  6.62340210e-01,   2.95883847e-01,  -4.75500505e-04,
           1.23587613e-01,   6.15031906e-02,   1.24584162e-02,
           1.12445553e-01],
        [  3.56424248e-03,  -4.75500505e-04,   2.26310981e-04,
          -1.18668938e-03,   1.51843470e-03,  -1.93389914e-03,
          -1.28036605e-03],
        [  2.56042627e-01,   1.23587613e-01,  -1.18668938e-03,
           6.34684521e-02,   1.76146078e-02,  -8.11197909e-03,
           5.91468482e-02],
        [  1.71127913e-01,   6.15031906e-02,   1.51843470e-03,
           1.76146078e-02,   2.61011026e-02,   6.85939592e-03,
           1.34256743e-02],
        [  7.63942476e-04,   1.24584162e-02,  -1.93389914e-03,
          -8.11197909e-03,   6.85939592e-03,   1.41923766e+00,
          -1.50619323e-02],
        [  2.28768249e-01,   1.12445553e-01,  -1.28036605e-03,
           5.91468482e-02,   1.34256743e-02,  -1.50619323e-02,
           6.38161865e-02]],

       [[  4.85828865e-01,   2.15529230e-01,   6.98968287e-03,
           5.38062306e-02,   8.22069311e-02,   4.78245130e-02,
           3.23371835e-02],
        [  2.15529230e-01,   1.16762669e-01,   3.92567483e-04,
           3.96924548e-02,   2.63491802e-02,   4.63964515e-02,
           3.34474027e-02],
        [  6.98968287e-03,   3.92567483e-04,   4.48495788e-04,
          -1.26743868e-03,   2.46968621e-03,  -3.04704732e-03,
          -2.00181745e-03],
        [  5.38062306e-02,   3.96924548e-02,  -1.26743868e-03,
           2.01866081e-02,   1.52143114e-03,   3.86411654e-02,
           2.06172222e-02],
        [  8.22069311e-02,   2.63491802e-02,   2.46968621e-03,
           1.52143114e-03,   1.94984539e-02,   1.04909818e-02,
          -3.64911722e-03],
        [  4.78245130e-02,   4.63964515e-02,  -3.04704732e-03,
           3.86411654e-02,   1.04909818e-02,   1.94417811e+00,
           5.69210636e-02],
        [  3.23371835e-02,   3.34474027e-02,  -2.00181745e-03,
           2.06172222e-02,  -3.64911722e-03,   5.69210636e-02,
           3.19963703e-02]]])

预测数据集的标签。

label_pred = predict(data, mus, sigmas)
label_pred

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  2.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,  2.,  2.,  0.,  0.,
        0.,  0.,  0.,  0.,  2.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  0.,  2.,  2.,  2.,  0.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.])

evaluate(label_pred, data['label'].values)

Accuracy: 0.9286
             precision    recall  f1-score   support

          0       0.90      0.89      0.89        70
          1       0.97      0.93      0.95        70
          2       0.92      0.97      0.94        70

avg / total       0.93      0.93      0.93       210

	x1	x2	x3	x4	x5	x6	x7	label
0	15.26	14.84	0.8710	5.763	3.312	2.221	5.220	1
1	14.88	14.57	0.8811	5.554	3.333	1.018	4.956	1
2	14.29	14.09	0.9050	5.291	3.337	2.699	4.825	1
3	13.84	13.94	0.8955	5.324	3.379	2.259	4.805	1
4	16.14	14.99	0.9034	5.658	3.562	1.355	5.175	1

	x1	x2	x3	x4	x5	x6	x7
0	15.26	14.84	0.8710	5.763	3.312	2.221	5.220
1	14.88	14.57	0.8811	5.554	3.333	1.018	4.956
2	14.29	14.09	0.9050	5.291	3.337	2.699	4.825
3	13.84	13.94	0.8955	5.324	3.379	2.259	4.805
4	16.14	14.99	0.9034	5.658	3.562	1.355	5.175

	x1	x2	x3	x4	x5	x6	x7	label
count	210.000000	210.000000	210.000000	210.000000	210.000000	210.000000	210.000000	210.000000
mean	14.847524	14.559286	0.870999	5.628533	3.258605	3.700201	5.408071	1.000000
std	2.909699	1.305959	0.023629	0.443063	0.377714	1.503557	0.491480	0.818448
min	10.590000	12.410000	0.808100	4.899000	2.630000	0.765100	4.519000	0.000000
25%	12.270000	13.450000	0.856900	5.262250	2.944000	2.561500	5.045000	0.000000
50%	14.355000	14.320000	0.873450	5.523500	3.237000	3.599000	5.223000	1.000000
75%	17.305000	15.715000	0.887775	5.979750	3.561750	4.768750	5.877000	2.000000
max	21.180000	17.250000	0.918300	6.675000	4.033000	8.456000	6.550000	2.000000

	mean_fit_time	mean_score_time	mean_test_score	mean_train_score	param_C	params	rank_test_score	split0_test_score	split0_train_score	split1_test_score	split1_train_score	split2_test_score	split2_train_score	std_fit_time	std_score_time	std_test_score	std_train_score
0	0.015219	0.002470	0.382166	0.381955	0.001	{u'C': 0.001}	5	0.415094	0.355769	0.377358	0.375000	0.352941	0.415094	0.019821	0.003011	0.025520	0.024714
1	0.001227	0.000423	0.783439	0.764393	0.01	{u'C': 0.01}	4	0.754717	0.778846	0.867925	0.759615	0.725490	0.754717	0.000030	0.000113	0.061473	0.010414
2	0.002254	0.000865	0.885350	0.882196	0.1	{u'C': 0.1}	3	0.886792	0.884615	0.905660	0.884615	0.862745	0.877358	0.000599	0.000528	0.017491	0.003421
3	0.002277	0.000545	0.910828	0.920356	1	{u'C': 1}	2	0.886792	0.932692	0.962264	0.903846	0.882353	0.924528	0.000724	0.000129	0.036763	0.012140
4	0.001934	0.000510	0.949045	0.949020	10	{u'C': 10}	1	0.943396	0.980769	1.000000	0.913462	0.901961	0.952830	0.000550	0.000198	0.040093	0.027610

	x1	x2
0	-3.863865	5.011869
1	1.742150	3.624853
2	0.435850	5.268131
3	3.613564	3.530604
4	0.916746	4.839165