




>>> from sklearn import preprocessing
>>> import numpy as np
>>> X = np.array([[1.,-1.,2],[2.,0.,0.],[0.,1.,-1.]])
>>> X_scaled = preprocessing.scale(X)
>>> X_scaled
array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])


>>> X = [[1.,-1.,2.],[2.,0.,0.],[0.,1.,-1.]]
>>> X_normalized = preprocessing.normalize(X, norm='l2')
>>> X_normalized
array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])


>>> X_train = np.array([[1.,-1.,2.],[2.,0., 0.],[0.,-1.,-1.]])
>>> min_max_scaler = preprocessing.MinMaxScaler()
>>> X_train_minmax = min_max_scaler.fit_transform(X_train)
>>> X_train_minmax
array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  1.        ,  0.33333333],
       [ 0.        ,  0.        ,  0.        ]])



>>> measurements=[{'city':'Dubai','temperature':33.},
...     {'city':'London','temperature':12.},
...     {'city':'San Fransisco','temperature':18.}
... ]

键值city具有多个取值,“Dubai”、“London”和“San Fransisco”,直接把每个取值作为新的特征即可。键值temperature是数值型,可以直接作为特征使用。

>>> from sklearn.feature_extraction import DictVectorizer
>>> vec = DictVectorizer()
>>> vec.fit_transform(measurements).toarray()
array([[  1.,   0.,   0.,  33.],
       [  0.,   1.,   0.,  12.],
       [  0.,   0.,   1.,  18.]])
>>> vec.get_feature_names()
['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']


  • 词集模型:单词构成的集合,集合中每个元素都只有一个,即词集中的每个单词都只有一个。
  • 词袋模型:在词集的基础上,如果一个单词在文档中出现不止一次,则统计其出现的次数(频数)。



>>> from sklearn.feature_extraction.text import CountVectorizer


>>> vectorizer = CountVectorizer(min_df=1)
>>> vectorizer
CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


>>> corpus = [
...     'This is the first document.',
...     'This is the second second document.',
...     'And the third one.',
...     'Is this the first document?']
>>> X = vectorizer.fit_transform(corpus)
>>> X
<4x9 sparse matrix of type '<type 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>


>>> vectorizer.get_feature_names() == (
...     ['and', 'document', 'first', 'is', 'one',
...      'second', 'the', 'third', 'this'])
True


>>> X.toarray()
array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])


>>> vocabulary=vectorizer.vocabulary_
>>> vocabulary
{u'and': 0, u'third': 7, u'this': 8, u'is': 3, u'one': 4, u'second': 5, u'the': 6, u'document': 1, u'first': 2}


>>> new_vectorizer = CountVectorizer(min_df=1, vocabulary=vocabulary)
>>> new_vectorizer
CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None,
        vocabulary={u'and': 0, u'third': 7, u'this': 8, u'is': 3, u'one': 4, u'second': 5, u'the': 6, u'document': 1, u'first': 2})


from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.contrib import learn
X_train = np.array(list(vocab_processor.fit_transform(X_train)))
X_test = np.array(list(vocab_processor.transform(X_test)))




>>> import tensorflow as tf
>>> import numpy as np


>>> training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
... filename="iris_training.csv",
... target_dtype=np.int,
... features_dtype=np.float32)
>>> feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
>>> x=training_set.data
>>> y=training_set.target
>>> x
array([[ 6.4000001 ,  2.79999995,  5.5999999 ,  2.20000005],
       [ 5.        ,  2.29999995,  3.29999995,  1.        ],
       [ 4.9000001 ,  2.5       ,  4.5       ,  1.70000005],
       [ 4.9000001 ,  3.0999999 ,  1.5       ,  0.1       ],
       [ 5.69999981,  3.79999995,  1.70000005,  0.30000001],
       [ 4.4000001 ,  3.20000005,  1.29999995,  0.2       ],
       [ 5.4000001 ,  3.4000001 ,  1.5       ,  0.40000001],
       [ 6.9000001 ,  3.0999999 ,  5.0999999 ,  2.29999995],
       [ 6.69999981,  3.0999999 ,  4.4000001 ,  1.39999998],
       [ 5.0999999 ,  3.70000005,  1.5       ,  0.40000001],
       [ 5.19999981,  2.70000005,  3.9000001 ,  1.39999998],
       [ 6.9000001 ,  3.0999999 ,  4.9000001 ,  1.5       ],
       [ 5.80000019,  4.        ,  1.20000005,  0.2       ],
       [ 5.4000001 ,  3.9000001 ,  1.70000005,  0.40000001],
       [ 7.69999981,  3.79999995,  6.69999981,  2.20000005],
       [ 6.30000019,  3.29999995,  4.69999981,  1.60000002],
       [ 6.80000019,  3.20000005,  5.9000001 ,  2.29999995],
       [ 7.5999999 ,  3.        ,  6.5999999 ,  2.0999999 ],
       [ 6.4000001 ,  3.20000005,  5.30000019,  2.29999995],
       [ 5.69999981,  4.4000001 ,  1.5       ,  0.40000001],
       [ 6.69999981,  3.29999995,  5.69999981,  2.0999999 ],
       [ 6.4000001 ,  2.79999995,  5.5999999 ,  2.0999999 ],
       [ 5.4000001 ,  3.9000001 ,  1.29999995,  0.40000001],
       [ 6.0999999 ,  2.5999999 ,  5.5999999 ,  1.39999998],
       [ 7.19999981,  3.        ,  5.80000019,  1.60000002],
       [ 5.19999981,  3.5       ,  1.5       ,  0.2       ],
       [ 5.80000019,  2.5999999 ,  4.        ,  1.20000005],
       [ 5.9000001 ,  3.        ,  5.0999999 ,  1.79999995],
       [ 5.4000001 ,  3.        ,  4.5       ,  1.5       ],
       [ 6.69999981,  3.        ,  5.        ,  1.70000005],
       [ 6.30000019,  2.29999995,  4.4000001 ,  1.29999995],
       [ 5.0999999 ,  2.5       ,  3.        ,  1.10000002],
       [ 6.4000001 ,  3.20000005,  4.5       ,  1.5       ],
       [ 6.80000019,  3.        ,  5.5       ,  2.0999999 ],
       [ 6.19999981,  2.79999995,  4.80000019,  1.79999995],
       [ 6.9000001 ,  3.20000005,  5.69999981,  2.29999995],
       [ 6.5       ,  3.20000005,  5.0999999 ,  2.        ],
       [ 5.80000019,  2.79999995,  5.0999999 ,  2.4000001 ],
       [ 5.0999999 ,  3.79999995,  1.5       ,  0.30000001],
       [ 4.80000019,  3.        ,  1.39999998,  0.30000001],
       [ 7.9000001 ,  3.79999995,  6.4000001 ,  2.        ],
       [ 5.80000019,  2.70000005,  5.0999999 ,  1.89999998],
       [ 6.69999981,  3.        ,  5.19999981,  2.29999995],
       [ 5.0999999 ,  3.79999995,  1.89999998,  0.40000001],
       [ 4.69999981,  3.20000005,  1.60000002,  0.2       ],
       [ 6.        ,  2.20000005,  5.        ,  1.5       ],
       [ 4.80000019,  3.4000001 ,  1.60000002,  0.2       ],
       [ 7.69999981,  2.5999999 ,  6.9000001 ,  2.29999995],
       [ 4.5999999 ,  3.5999999 ,  1.        ,  0.2       ],
       [ 7.19999981,  3.20000005,  6.        ,  1.79999995],
       [ 5.        ,  3.29999995,  1.39999998,  0.2       ],
       [ 6.5999999 ,  3.        ,  4.4000001 ,  1.39999998],
       [ 6.0999999 ,  2.79999995,  4.        ,  1.29999995],
       [ 5.        ,  3.20000005,  1.20000005,  0.2       ],
       [ 7.        ,  3.20000005,  4.69999981,  1.39999998],
       [ 6.        ,  3.        ,  4.80000019,  1.79999995],
       [ 7.4000001 ,  2.79999995,  6.0999999 ,  1.89999998],
       [ 5.80000019,  2.70000005,  5.0999999 ,  1.89999998],
       [ 6.19999981,  3.4000001 ,  5.4000001 ,  2.29999995],
       [ 5.        ,  2.        ,  3.5       ,  1.        ],
       [ 5.5999999 ,  2.5       ,  3.9000001 ,  1.10000002],
       [ 6.69999981,  3.0999999 ,  5.5999999 ,  2.4000001 ],
       [ 6.30000019,  2.5       ,  5.        ,  1.89999998],
       [ 6.4000001 ,  3.0999999 ,  5.5       ,  1.79999995],
       [ 6.19999981,  2.20000005,  4.5       ,  1.5       ],
       [ 7.30000019,  2.9000001 ,  6.30000019,  1.79999995],
       [ 4.4000001 ,  3.        ,  1.29999995,  0.2       ],
       [ 7.19999981,  3.5999999 ,  6.0999999 ,  2.5       ],
       [ 6.5       ,  3.        ,  5.5       ,  1.79999995],
       [ 5.        ,  3.4000001 ,  1.5       ,  0.2       ],
       [ 4.69999981,  3.20000005,  1.29999995,  0.2       ],
       [ 6.5999999 ,  2.9000001 ,  4.5999999 ,  1.29999995],
       [ 5.5       ,  3.5       ,  1.29999995,  0.2       ],
       [ 7.69999981,  3.        ,  6.0999999 ,  2.29999995],
       [ 6.0999999 ,  3.        ,  4.9000001 ,  1.79999995],
       [ 4.9000001 ,  3.0999999 ,  1.5       ,  0.1       ],
       [ 5.5       ,  2.4000001 ,  3.79999995,  1.10000002],
       [ 5.69999981,  2.9000001 ,  4.19999981,  1.29999995],
       [ 6.        ,  2.9000001 ,  4.5       ,  1.5       ],
       [ 6.4000001 ,  2.70000005,  5.30000019,  1.89999998],
       [ 5.4000001 ,  3.70000005,  1.5       ,  0.2       ],
       [ 6.0999999 ,  2.9000001 ,  4.69999981,  1.39999998],
       [ 6.5       ,  2.79999995,  4.5999999 ,  1.5       ],
       [ 5.5999999 ,  2.70000005,  4.19999981,  1.29999995],
       [ 6.30000019,  3.4000001 ,  5.5999999 ,  2.4000001 ],
       [ 4.9000001 ,  3.0999999 ,  1.5       ,  0.1       ],
       [ 6.80000019,  2.79999995,  4.80000019,  1.39999998],
       [ 5.69999981,  2.79999995,  4.5       ,  1.29999995],
       [ 6.        ,  2.70000005,  5.0999999 ,  1.60000002],
       [ 5.        ,  3.5       ,  1.29999995,  0.30000001],
       [ 6.5       ,  3.        ,  5.19999981,  2.        ],
       [ 6.0999999 ,  2.79999995,  4.69999981,  1.20000005],
       [ 5.0999999 ,  3.5       ,  1.39999998,  0.30000001],
       [ 4.5999999 ,  3.0999999 ,  1.5       ,  0.2       ],
       [ 6.5       ,  3.        ,  5.80000019,  2.20000005],
       [ 4.5999999 ,  3.4000001 ,  1.39999998,  0.30000001],
       [ 4.5999999 ,  3.20000005,  1.39999998,  0.2       ],
       [ 7.69999981,  2.79999995,  6.69999981,  2.        ],
       [ 5.9000001 ,  3.20000005,  4.80000019,  1.79999995],
       [ 5.0999999 ,  3.79999995,  1.60000002,  0.2       ],
       [ 4.9000001 ,  3.        ,  1.39999998,  0.2       ],
       [ 4.9000001 ,  2.4000001 ,  3.29999995,  1.        ],
       [ 4.5       ,  2.29999995,  1.29999995,  0.30000001],
       [ 5.80000019,  2.70000005,  4.0999999 ,  1.        ],
       [ 5.        ,  3.4000001 ,  1.60000002,  0.40000001],
       [ 5.19999981,  3.4000001 ,  1.39999998,  0.2       ],
       [ 5.30000019,  3.70000005,  1.5       ,  0.2       ],
       [ 5.        ,  3.5999999 ,  1.39999998,  0.2       ],
       [ 5.5999999 ,  2.9000001 ,  3.5999999 ,  1.29999995],
       [ 4.80000019,  3.0999999 ,  1.60000002,  0.2       ],
       [ 6.30000019,  2.70000005,  4.9000001 ,  1.79999995],
       [ 5.69999981,  2.79999995,  4.0999999 ,  1.29999995],
       [ 5.        ,  3.        ,  1.60000002,  0.2       ],
       [ 6.30000019,  3.29999995,  6.        ,  2.5       ],
       [ 5.        ,  3.5       ,  1.60000002,  0.60000002],
       [ 5.5       ,  2.5999999 ,  4.4000001 ,  1.20000005],
       [ 5.69999981,  3.        ,  4.19999981,  1.20000005],
       [ 4.4000001 ,  2.9000001 ,  1.39999998,  0.2       ],
       [ 4.80000019,  3.        ,  1.39999998,  0.1       ],
       [ 5.5       ,  2.4000001 ,  3.70000005,  1.        ]], dtype=float32)
>>> y
array([2, 1, 2, 0, 0, 0, 0, 2, 1, 0, 1, 1, 0, 0, 2, 1, 2, 2, 2, 0, 2, 2, 0,
       2, 2, 0, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2,
       0, 2, 0, 2, 0, 1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 0, 2, 2,
       0, 0, 1, 0, 2, 2, 0, 1, 1, 1, 2, 0, 1, 1, 1, 2, 0, 1, 1, 1, 0, 2, 1,
       0, 0, 2, 0, 0, 2, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 2, 1, 0, 2, 0,
       1, 1, 0, 0, 1])


  • filename:文件名;
  • target_dtype:标记数据类型;
  • features_dtype:特征数据类型。





>>> import numpy as np
>>> from sklearn.model_selection import train_test_split
>>> from sklearn import datasets
>>> from sklearn import svm


>>> iris = datasets.load_iris()
>>> iris.data.shape, iris.target.shape
((150, 4), (150,))


>>> X_train, X_test, y_train, y_test = train_test_split(
...     iris.data, iris.target, test_size=0.4, random_state=0)
>>> X_train.shape,y_train.shape
((90, 4), (90,))
>>> X_test.shape,y_test.shape
((60, 4), (60,))


>>> clf=svm.SVC(kernel='linear',C=1).fit(X_train,y_train)


>>> clf=svm.SVC(kernel='linear',C=1).fit(X_train,y_train)
>>> clf
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


>>> clf.score(X_test, y_test)
0.96...



>>> from sklearn.model_selection import cross_val_score
>>> clf = svm.SVC(kernel='linear',C=1)
>>> scores = cross_val_score(clf, iris.data, iris.target, cv=5)
>>> scores
array([ 0.96666667,  1.        ,  0.96666667,  0.96666667,  1.        ])