def optimize_words_dict(self, data, stop_words, threshold):
    """Build a frequency-filtered vocabulary from tokenized samples.

    Every token in ``data`` that is not a stop word is counted, and only the
    tokens whose count reaches ``threshold`` are kept.

    Args:
        data: Iterable of samples; each sample is an iterable of hashable
            tokens.
        stop_words: Container of tokens to ignore (only tested with ``in``).
        threshold: Minimum number of occurrences a token needs to be kept.

    Returns:
        A list of ``(token, count)`` tuples ordered by count, highest first.
        Tokens with the same count keep the order in which they were first
        seen in ``data``.
    """
    from collections import Counter

    # Remove stop words and count term frequencies in a single pass.
    freq_dict = Counter(
        word for line in data for word in line if word not in stop_words
    )

    # Filter the vocabulary by the threshold, then sort once.  list.sort() is
    # stable (also with reverse=True), so equal counts stay in first-seen
    # order -- the same order the former per-count rescans of the dict gave.
    words_list = [
        (word, count) for word, count in freq_dict.items() if count >= threshold
    ]
    words_list.sort(key=lambda item: item[1], reverse=True)
    return words_list
def tf_idf(self, data):
    """Compute TF-IDF weights for every word of every sample.

    Args:
        data: Sized sequence of samples; each sample is a sized sequence of
            hashable words.

    Returns:
        A dict mapping the sample's position in ``data`` to a dict of
        ``{word: tf_idf}``, where ``tf`` is the word's count divided by the
        sample length and ``idf`` is ``log10(doc_num / doc_count)``.
    """
    doc_num = len(data)

    # Document frequency: number of samples each word occurs in.  set() makes
    # a word count once per sample even if it is repeated inside it.
    df = {}
    for sample in data:
        for word in set(sample):
            df[word] = df.get(word, 0.0) + 1.0

    # Inverse document frequency: log10(total samples / samples with word).
    idf = {word: log10(doc_num / doc_count) for word, doc_count in df.items()}

    res = {}
    for index, sample in enumerate(data):
        # Term frequency: raw count of each word inside this sample.
        tf = {}
        for word in sample:
            tf[word] = tf.get(word, 0.0) + 1.0

        sample_len = len(sample)
        # tf * idf for each distinct word of the sample.
        res[index] = {
            word: count / sample_len * idf[word] for word, count in tf.items()
        }

    # The result was previously built but never handed back to the caller.
    return res
from sklearn.model_selection import train_test_split train_test_split?
[0;31mSignature:[0m [0mtrain_test_split[0m[0;34m([0m[0;34m*[0m[0marrays[0m[0;34m,[0m [0;34m**[0m[0moptions[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Split arrays or matrices into random train and test subsets
Quick utility that wraps input validation and
``next(ShuffleSplit().split(X, y))`` and application to input data
into a single call for splitting (and optionally subsampling) data in a
oneliner.
Read more in the :ref:`User Guide <cross_validation>`.
Parameters
----------
*arrays : sequence of indexables with same length / shape[0]
Allowed inputs are lists, numpy arrays, scipy-sparse
matrices or pandas dataframes.
test_size : float, int or None, optional (default=None)
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples. If None, the value is set to the
complement of the train size. If ``train_size`` is also None, it will
be set to 0.25.
train_size : float, int, or None, (default=None)
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the train split. If
int, represents the absolute number of train samples. If None,
the value is automatically set to the complement of the test size.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
shuffle : boolean, optional (default=True)
Whether or not to shuffle the data before splitting. If shuffle=False
then stratify must be None.
stratify : array-like or None (default=None)
If not None, data is split in a stratified fashion, using this as
the class labels.
Returns
-------
splitting : list, length=2 * len(arrays)
List containing train-test split of inputs.
.. versionadded:: 0.16
If the input is sparse, the output will be a
``scipy.sparse.csr_matrix``. Else, output type is the same as the
input type.
Examples
--------
>>> import numpy as np
>>> from sklearn.model_selection import train_test_split
>>> X, y = np.arange(10).reshape((5, 2)), range(5)
>>> X
array([[0, 1],
[2, 3],
[4, 5],
[6, 7],
[8, 9]])
>>> list(y)
[0, 1, 2, 3, 4]
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, test_size=0.33, random_state=42)
...
>>> X_train
array([[4, 5],
[0, 1],
[6, 7]])
>>> y_train
[2, 0, 3]
>>> X_test
array([[2, 3],
[8, 9]])
>>> y_test
[1, 4]
>>> train_test_split(y, shuffle=False)
[[0, 1, 2], [3, 4]]
[0;31mFile:[0m /Applications/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py
[0;31mType:[0m function
from sklearn.model_selection import train_test_split train_test_split?
[0;31mSignature:[0m [0mtrain_test_split[0m[0;34m([0m[0;34m*[0m[0marrays[0m[0;34m,[0m [0;34m**[0m[0moptions[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Split arrays or matrices into random train and test subsets
Quick utility that wraps input validation and
``next(ShuffleSplit().split(X, y))`` and application to input data
into a single call for splitting (and optionally subsampling) data in a
oneliner.
Read more in the :ref:`User Guide <cross_validation>`.
Parameters
----------
*arrays : sequence of indexables with same length / shape[0]
Allowed inputs are lists, numpy arrays, scipy-sparse
matrices or pandas dataframes.
test_size : float, int or None, optional (default=None)
If float, should be between 0.0 and 1.0 and represent the proportion
of the dataset to include in the test split. If int, represents the
absolute number of test samples. If None, the value is set to the
complement of the train size. If ``train_size`` is also None, it will
be set to 0.25.
train_size : float, int, or None, (default=None)
If float, should be between 0.0 and 1.0 and represent the
proportion of the dataset to include in the train split. If
int, represents the absolute number of train samples. If None,
the value is automatically set to the complement of the test size.
random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
shuffle : boolean, optional (default=True)
Whether or not to shuffle the data before splitting. If shuffle=False
then stratify must be None.
stratify : array-like or None (default=None)
If not None, data is split in a stratified fashion, using this as
the class labels.
Returns
-------
splitting : list, length=2 * len(arrays)
List containing train-test split of inputs.
.. versionadded:: 0.16
If the input is sparse, the output will be a
``scipy.sparse.csr_matrix``. Else, output type is the same as the
input type.
Examples
--------
>>> import numpy as np
>>> from sklearn.model_selection import train_test_split
>>> X, y = np.arange(10).reshape((5, 2)), range(5)
>>> X
array([[0, 1],
[2, 3],
[4, 5],
[6, 7],
[8, 9]])
>>> list(y)
[0, 1, 2, 3, 4]
>>> X_train, X_test, y_train, y_test = train_test_split(
... X, y, test_size=0.33, random_state=42)
...
>>> X_train
array([[4, 5],
[0, 1],
[6, 7]])
>>> y_train
[2, 0, 3]
>>> X_test
array([[2, 3],
[8, 9]])
>>> y_test
[1, 4]
>>> train_test_split(y, shuffle=False)
[[0, 1, 2], [3, 4]]
[0;31mFile:[0m /Applications/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py
[0;31mType:[0m function
1
# Hold out 20% of the samples as the test set; the fixed seed makes the
# shuffled split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)
from sklearn.neighbors import KNeighborsClassifier KNeighborsClassifier?
[0;31mInit signature:[0m
[0mKNeighborsClassifier[0m[0;34m([0m[0;34m[0m
[0;34m[0m [0mn_neighbors[0m[0;34m=[0m[0;36m5[0m[0;34m,[0m[0;34m[0m
[0;34m[0m [0mweights[0m[0;34m=[0m[0;34m'uniform'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m [0malgorithm[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m [0mleaf_size[0m[0;34m=[0m[0;36m30[0m[0;34m,[0m[0;34m[0m
[0;34m[0m [0mp[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m [0mmetric[0m[0;34m=[0m[0;34m'minkowski'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m [0mmetric_params[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m [0mn_jobs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Classifier implementing the k-nearest neighbors vote.
Read more in the :ref:`User Guide <classification>`.
Parameters
----------
n_neighbors : int, optional (default = 5)
Number of neighbors to use by default for :meth:`kneighbors` queries.
weights : str or callable, optional (default = 'uniform')
weight function used in prediction. Possible values:
- 'uniform' : uniform weights. All points in each neighborhood
are weighted equally.
- 'distance' : weight points by the inverse of their distance.
in this case, closer neighbors of a query point will have a
greater influence than neighbors which are further away.
- [callable] : a user-defined function which accepts an
array of distances, and returns an array of the same shape
containing the weights.
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
Algorithm used to compute the nearest neighbors:
- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search.
- 'auto' will attempt to decide the most appropriate algorithm
based on the values passed to :meth:`fit` method.
Note: fitting on sparse input will override the setting of
this parameter, using brute force.
leaf_size : int, optional (default = 30)
Leaf size passed to BallTree or KDTree. This can affect the
speed of the construction and query, as well as the memory
required to store the tree. The optimal value depends on the
nature of the problem.
p : integer, optional (default = 2)
Power parameter for the Minkowski metric. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
metric : string or callable, default 'minkowski'
the distance metric to use for the tree. The default metric is
minkowski, and with p=2 is equivalent to the standard Euclidean
metric. See the documentation of the DistanceMetric class for a
list of available metrics.
If metric is "precomputed", X is assumed to be a distance matrix and
must be square during fit. X may be a :term:`Glossary <sparse graph>`,
in which case only "nonzero" elements may be considered neighbors.
metric_params : dict, optional (default = None)
Additional keyword arguments for the metric function.
n_jobs : int or None, optional (default=None)
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
Doesn't affect :meth:`fit` method.
Attributes
----------
classes_ : array of shape (n_classes,)
Class labels known to the classifier
effective_metric_ : string or callble
The distance metric used. It will be same as the `metric` parameter
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
'minkowski' and `p` parameter set to 2.
effective_metric_params_ : dict
Additional keyword arguments for the metric function. For most metrics
will be same with `metric_params` parameter, but may also contain the
`p` parameter value if the `effective_metric_` attribute is set to
'minkowski'.
outputs_2d_ : bool
False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit
otherwise True.
Examples
--------
>>> X = [[0], [1], [2], [3]]
>>> y = [0, 0, 1, 1]
>>> from sklearn.neighbors import KNeighborsClassifier
>>> neigh = KNeighborsClassifier(n_neighbors=3)
>>> neigh.fit(X, y)
KNeighborsClassifier(...)
>>> print(neigh.predict([[1.1]]))
[0]
>>> print(neigh.predict_proba([[0.9]]))
[[0.66666667 0.33333333]]
See also
--------
RadiusNeighborsClassifier
KNeighborsRegressor
RadiusNeighborsRegressor
NearestNeighbors
Notes
-----
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
.. warning::
Regarding the Nearest Neighbors algorithms, if it is found that two
neighbors, neighbor `k+1` and `k`, have identical distances
but different labels, the results will depend on the ordering of the
training data.
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
[0;31mFile:[0m /Applications/anaconda3/lib/python3.7/site-packages/sklearn/neighbors/_classification.py
[0;31mType:[0m ABCMeta
[0;31mSubclasses:[0m
# Columns left out of the numerical feature list.
excluded_columns = {
    'State',
    'International plan',
    'Voice mail plan',
    'Area code',
    'Churn',
    'Customer service calls',
}
# Filter in DataFrame column order: a set difference would give an arbitrary
# order, so the heatmap axes could change from one run to the next.
numerical = [col for col in df.columns if col not in excluded_columns]

# Pairwise correlation of the remaining features, drawn as a heatmap.
corr = df[numerical].corr()
sns.heatmap(corr)
<matplotlib.axes._subplots.AxesSubplot at 0x1388bad50>
上图中,Total day charge 日话费总额 是直接基于 Total day minutes 电话的分钟数 计算得到,它被称为因变量。除了 Total day charge 外,还有 3 个因变量:Total eve charge,Total night charge,Total intl charge。这 4 个因变量并不贡献任何额外信息,我们直接去除。
1 2 3 4
# The four charge columns are dropped from the feature list.
charge_columns = {
    'Total day charge',
    'Total eve charge',
    'Total night charge',
    'Total intl charge',
}
# Keep the existing order of `numerical` instead of round-tripping through a
# set, whose iteration order is arbitrary.
numerical = [col for col in numerical if col not in charge_columns]

# Correlation heatmap of the reduced feature list.
corr = df[numerical].corr()
sns.heatmap(corr)
<matplotlib.axes._subplots.AxesSubplot at 0x1396822d0>
# Box plots of every numerical feature, split by the 'Churn' column.
# Guard the append: this cell mutates `numerical`, so re-running it used to
# add the column again each time and eventually overflow the plot grid.
if 'Customer service calls' not in numerical:
    numerical.append('Customer service calls')
print(numerical)

# Size the grid from the feature count rather than a fixed 3 x 4, and keep
# `axes` two-dimensional (squeeze=False) even when there is a single row.
n_cols = 4
n_rows = max(1, (len(numerical) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=[10, 7], squeeze=False)
for index, feat in enumerate(numerical):
    ax = axes[index // n_cols, index % n_cols]
    # Keyword arguments, matching the other seaborn calls in this file.
    sns.boxplot(x='Churn', y=feat, data=df, ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel(feat)
fig.tight_layout()
['Total day minutes', 'Total night minutes', 'Number vmail messages', 'Total eve calls', 'Account length', 'Total intl calls', 'Total eve minutes', 'Total night calls', 'Total day calls', 'Total intl minutes', 'Customer service calls', 'Customer service calls', 'Customer service calls', 'Customer service calls', 'Customer service calls', 'Customer service calls', 'Customer service calls', 'Customer service calls']
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-47-539701aff8ff> in <module>
3 fig, axes = plt.subplots(3, 4, figsize=[10, 7])
4 for index, feat in enumerate(numerical):
----> 5 ax = axes[int(index / 4), index % 4]
6 sns.boxplot(df['Churn'], df[feat], ax=ax)
7 ax.set_xlabel('')
IndexError: index 3 is out of bounds for axis 0 with size 3
上面的图表表明,两组之间分歧最大的分布是这三个变量:Total day minutes 日通话分钟数、Customer service calls 客服呼叫数、Number vmail messages 语音邮件数。在后续的课程中,我们将学习如何使用随机森林(Random Forest)或梯度提升(Gradient Boosting)来判定特征对分类的重要性,届时可以清晰地看到,前两个特征对于离网预测模型而言确实非常重要。
创建箱型图和提琴形图,查看忠实客户和不忠实客户的日通话分钟数。
1 2 3 4 5
# 2 x 2 figure with shared axes: top row is 'Total day minutes', bottom row is
# 'Total night minutes'; left column is a box plot, right column a violin plot.
_, axes = plt.subplots(2, 2, sharex=True, sharey=True, figsize=[10, 8])
(day_box_ax, day_violin_ax), (night_box_ax, night_violin_ax) = axes

sns.boxplot(x='Churn', y='Total day minutes', data=df, ax=day_box_ax)
sns.violinplot(x='Churn', y="Total day minutes", data=df, ax=day_violin_ax)
sns.boxplot(x='Churn', y='Total night minutes', data=df, ax=night_box_ax)
sns.violinplot(x='Churn', y="Total night minutes", data=df, ax=night_violin_ax)
<matplotlib.axes._subplots.AxesSubplot at 0x140f70290>