使用FunctionTransformer选择列¶
显示如何在管道中使用功能转换器。 如果您知道数据集的第一主成分与分类任务无关,则可以使用FunctionTransformer选择PCA转换数据以外的所有内容。
输入:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
def _generate_vector(shift=0.5, noise=15):
return np.arange(1000) + (np.random.rand(1000) - shift) * noise
def generate_dataset():
"""
This dataset is two lines with a slope ~ 1, where one has
a y offset of ~100
"""
return np.vstack((
np.vstack((
_generate_vector(),
_generate_vector() + 100,
)).T,
np.vstack((
_generate_vector(),
_generate_vector(),
)).T,
)), np.hstack((np.zeros(1000), np.ones(1000)))
def all_but_first_column(X):
return X[:, 1:]
def drop_first_component(X, y):
"""
Create a pipeline with PCA and the column selector and use it to
transform the dataset.
"""
pipeline = make_pipeline(
PCA(), FunctionTransformer(all_but_first_column),
)
X_train, X_test, y_train, y_test = train_test_split(X, y)
pipeline.fit(X_train, y_train)
return pipeline.transform(X_test), y_test
if __name__ == '__main__':
X, y = generate_dataset()
lw = 0
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, lw=lw)
plt.figure()
X_transformed, y_transformed = drop_first_component(*generate_dataset())
plt.scatter(
X_transformed[:, 0],
np.zeros(len(X_transformed)),
c=y_transformed,
lw=lw,
s=60
)
plt.show()
输出:
脚本的总运行时间:(0分钟 0.165 秒)。