How to Reduce Dataset Dimensionality with Feature Extraction Techniques
Source: WeChat account 读芯术 | Editor: 真经君
Image source: https://blog.datasciencedojo.com/curse-of-dimensionality-python/
Introduction

This article works through six feature extraction techniques for dimensionality reduction (PCA, ICA, LDA, LLE, t-SNE, and autoencoders), applying each one to the same binary classification task and benchmarking the reduced features with a random forest.
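The image linked above illustrates the curse of dimensionality: as the number of dimensions grows, data becomes sparse and distances between points concentrate, which is what motivates reducing dimensionality in the first place. A tiny self-contained sketch of that effect (my own illustration, not from the original article):

import numpy as np

# For random points in [0, 1]^d, the relative gap between the nearest
# and farthest neighbour of a reference point shrinks as d grows.
rng = np.random.default_rng(0)
for d in [2, 10, 100, 1000]:
    pts = rng.random((500, d))
    dists = np.linalg.norm(pts[1:] - pts[0], axis=1)
    print(d, (dists.max() - dists.min()) / dists.min())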
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
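The snippets below assume a DataFrame df has already been loaded; the loading step never appears in the article. Given the 'class' column and the Poisonous/Edible legend used later, this is presumably the mushroom classification dataset; a minimal loading sketch under that assumption:

# Hypothetical loading step (not shown in the original): a local copy
# of the mushroom dataset, with a categorical 'class' target column.
df = pd.read_csv('mushrooms.csv')
df.head()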
# Separate features from the target, one-hot encode the categorical
# features, label-encode the target, and standardize everything.
X = df.drop(['class'], axis=1)
Y = df['class']

X = pd.get_dummies(X, prefix_sep='_')
Y = LabelEncoder().fit_transform(Y)
X = StandardScaler().fit_transform(X)
To compare techniques fairly, every feature set is benchmarked the same way: a 70/30 split, a 700-tree random forest, and the training time, confusion matrix, and classification report printed out. First, a baseline on the full feature set:

def forest_test(X, Y):
    # Benchmark helper: 70/30 split, 700-tree random forest; prints
    # training time, confusion matrix, and classification report.
    X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y,
                                                        test_size=0.30,
                                                        random_state=101)
    start = time.process_time()
    trainedforest = RandomForestClassifier(n_estimators=700).fit(X_Train, Y_Train)
    print(time.process_time() - start)
    predictionforest = trainedforest.predict(X_Test)
    print(confusion_matrix(Y_Test, predictionforest))
    print(classification_report(Y_Test, predictionforest))

forest_test(X, Y)
[[1274    0]
 [   0 1164]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1274
           1       1.00      1.00      1.00      1164

    accuracy                           1.00      2438
   macro avg       1.00      1.00      1.00      2438
weighted avg       1.00      1.00      1.00      2438
Feature Extraction
Principal Component Analysis (PCA) is the classic linear technique: it projects the data onto the orthogonal directions of maximum variance. Starting with a two-component projection for visualization:

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

PCA_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
PCA_df = pd.concat([PCA_df, df['class']], axis=1)
PCA_df['class'] = LabelEncoder().fit_transform(PCA_df['class'])
PCA_df.head()
figure(num=None, figsize=(8, 8), dpi=80, facecolor='w', edgecolor='k')

# Plot each class in its own colour in the 2D component space.
classes = [1, 0]
colors = ['r', 'b']
for clas, color in zip(classes, colors):
    plt.scatter(PCA_df.loc[PCA_df['class'] == clas, 'PC1'],
                PCA_df.loc[PCA_df['class'] == clas, 'PC2'],
                c=color)

plt.xlabel('Principal Component 1', fontsize=12)
plt.ylabel('Principal Component 2', fontsize=12)
plt.title('2D PCA', fontsize=15)
plt.legend(['Poisonous', 'Edible'])
plt.grid()
Rerunning PCA, this time keeping three components, and benchmarking the reduced features:

pca = PCA(n_components=3, svd_solver='full')
X_pca = pca.fit_transform(X)
print(pca.explained_variance_)

forest_test(X_pca, Y)
2.769664902999999
[[1261   13]
 [  41 1123]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1274
           1       0.99      0.96      0.98      1164

    accuracy                           0.98      2438
   macro avg       0.98      0.98      0.98      2438
weighted avg       0.98      0.98      0.98      2438
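When deciding how many components to keep, it helps to inspect how much variance each one retains. A short sketch using the fitted PCA object's explained_variance_ratio_ attribute (not shown in the original):

# Fraction of total variance captured by each principal component,
# plus the running total: a common guide for choosing n_components.
var_ratio = pca.explained_variance_ratio_
print('Per-component variance ratio:', var_ratio)
print('Cumulative variance retained:', np.cumsum(var_ratio))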
To see how the random forest separates the two classes in PCA space, we can retrain it on just the first two components and plot its decision boundary:

X_Reduced, X_Test_Reduced, Y_Reduced, Y_Test_Reduced = train_test_split(
    X_pca[:, :2],  # only the first two components, so the boundary is plottable
    Y, test_size=0.30, random_state=101)
trainedforest = RandomForestClassifier(n_estimators=700).fit(X_Reduced, Y_Reduced)

# Evaluate the forest on a grid covering the 2D component space.
x_min, x_max = X_Reduced[:, 0].min() - 1, X_Reduced[:, 0].max() + 1
y_min, y_max = X_Reduced[:, 1].min() - 1, X_Reduced[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
Z = trainedforest.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.4)
plt.scatter(X_Reduced[:, 0], X_Reduced[:, 1], c=Y_Reduced, s=20, edgecolor='k')
plt.xlabel('Principal Component 1', fontsize=12)
plt.ylabel('Principal Component 2', fontsize=12)
plt.title('Random Forest', fontsize=15)
plt.show()
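Since the forest used for plotting was retrained on only two components, it is worth checking what that costs on the held-out split. A quick sketch using sklearn's accuracy_score:

from sklearn.metrics import accuracy_score

# Hold-out accuracy of the forest trained on just the first two
# principal components (the model used for the plot above).
pred_2d = trainedforest.predict(X_Test_Reduced)
print('2D PCA hold-out accuracy:', accuracy_score(Y_Test_Reduced, pred_2d))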
Independent Component Analysis (ICA) instead decomposes the data into statistically independent, non-Gaussian components:

from sklearn.decomposition import FastICA

ica = FastICA(n_components=3)
X_ica = ica.fit_transform(X)

forest_test(X_ica, Y)
[[1263   11]
 [  44 1120]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1274
           1       0.99      0.96      0.98      1164

    accuracy                           0.98      2438
   macro avg       0.98      0.98      0.98      2438
weighted avg       0.98      0.98      0.98      2438
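ICA specifically looks for maximally non-Gaussian components, so a common sanity check is to measure the excess kurtosis of each recovered component (near zero for Gaussian signals). A sketch, assuming scipy is available:

from scipy import stats

# Excess kurtosis of each independent component; values far from 0
# indicate the strongly non-Gaussian signals ICA is designed to recover.
for i in range(X_ica.shape[1]):
    print('Component', i, 'kurtosis:', stats.kurtosis(X_ica[:, i]))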
Linear Discriminant Analysis (LDA) is supervised: it projects the data onto the directions that best separate the classes, which for a binary problem means at most one component (n_classes - 1):

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(n_components=1)

# Run an LDA and use it to transform the features
X_lda = lda.fit(X, Y).transform(X)
print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_lda.shape[1])
Reduced number of features: 1
forest_test(X_lda, Y)
[[1274    0]
 [   0 1164]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1274
           1       1.00      1.00      1.00      1164

    accuracy                           1.00      2438
   macro avg       1.00      1.00      1.00      2438
weighted avg       1.00      1.00      1.00      2438
LDA is itself a classifier, so it can also be trained directly on the one-dimensional projection:

X_Reduced, X_Test_Reduced, Y_Reduced, Y_Test_Reduced = train_test_split(X_lda, Y,
                                                                        test_size=0.30,
                                                                        random_state=101)

start = time.process_time()
lda = LinearDiscriminantAnalysis().fit(X_Reduced, Y_Reduced)
print(time.process_time() - start)

predictionlda = lda.predict(X_Test_Reduced)
print(confusion_matrix(Y_Test_Reduced, predictionlda))
print(classification_report(Y_Test_Reduced, predictionlda))
[[1274    0]
 [   2 1162]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1274
           1       1.00      1.00      1.00      1164

    accuracy                           1.00      2438
   macro avg       1.00      1.00      1.00      2438
weighted avg       1.00      1.00      1.00      2438
Locally Linear Embedding (LLE) is a manifold-learning technique: it preserves the local neighbourhood structure of the data while unrolling it into fewer dimensions:

from sklearn.manifold import LocallyLinearEmbedding

embedding = LocallyLinearEmbedding(n_components=3)
X_lle = embedding.fit_transform(X)

forest_test(X_lle, Y)
[[1273    0]
 [1143   22]]
              precision    recall  f1-score   support

           0       0.53      1.00      0.69      1273
           1       1.00      0.02      0.04      1165

   micro avg       0.53      0.53      0.53      2438
   macro avg       0.76      0.51      0.36      2438
weighted avg       0.75      0.53      0.38      2438
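LLE clearly struggles here, and its output depends heavily on the n_neighbors parameter (sklearn's default is 5). A small sketch that retries the benchmark with a few neighbourhood sizes; the k values are arbitrary choices for illustration:

# LLE reconstructs each point from its k nearest neighbours, so the
# embedding can change substantially with k. These k values are arbitrary.
for k in [5, 15, 50]:
    embedding = LocallyLinearEmbedding(n_neighbors=k, n_components=3)
    forest_test(embedding.fit_transform(X), Y)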
t-SNE is a non-linear technique that preserves local similarities and is most often used for visualization; it is also by far the slowest method here (about 144 seconds, per the timing printed below):

from sklearn.manifold import TSNE

start = time.process_time()
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=300)
X_tsne = tsne.fit_transform(X)
print(time.process_time() - start)
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 8124 samples in 0.139s...
[t-SNE] Computed neighbors for 8124 samples in 11.891s...
[t-SNE] Computed conditional probabilities for sample 1000 / 8124
[t-SNE] Computed conditional probabilities for sample 2000 / 8124
[t-SNE] Computed conditional probabilities for sample 3000 / 8124
[t-SNE] Computed conditional probabilities for sample 4000 / 8124
[t-SNE] Computed conditional probabilities for sample 5000 / 8124
[t-SNE] Computed conditional probabilities for sample 6000 / 8124
[t-SNE] Computed conditional probabilities for sample 7000 / 8124
[t-SNE] Computed conditional probabilities for sample 8000 / 8124
[t-SNE] Computed conditional probabilities for sample 8124 / 8124
[t-SNE] Mean sigma: 2.658530
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.601128
[t-SNE] KL divergence after 300 iterations: 1.909915
143.984375
forest_test(X_tsne, Y)
[[1274    0]
 [   0 1164]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1274
           1       1.00      1.00      1.00      1164

    accuracy                           1.00      2438
   macro avg       1.00      1.00      1.00      2438
weighted avg       1.00      1.00      1.00      2438
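t-SNE is above all a visualization technique, and plotting the embedding makes the perfect score above unsurprising. A plotting sketch in the same style as the earlier PCA figure, using the first two embedded dimensions:

# Scatter the first two t-SNE dimensions, coloured by class label.
figure(num=None, figsize=(8, 8), dpi=80, facecolor='w', edgecolor='k')
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=Y, s=20, edgecolor='k')
plt.xlabel('t-SNE dimension 1', fontsize=12)
plt.ylabel('t-SNE dimension 2', fontsize=12)
plt.title('2D t-SNE', fontsize=15)
plt.grid()
plt.show()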
Finally, autoencoders are neural networks trained to reconstruct their own input; a narrow bottleneck layer forces the network to learn a compressed representation, which can then serve as the extracted features:

from keras.layers import Input, Dense
from keras.models import Model

# Compress the inputs down to 3 dimensions, then reconstruct them.
input_layer = Input(shape=(X.shape[1],))
encoded = Dense(3, activation='relu')(input_layer)
decoded = Dense(X.shape[1], activation='softmax')(encoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

# The autoencoder reproduces its own input, so the features double
# as the training targets.
X1, X2, Y1, Y2 = train_test_split(X, X, test_size=0.3, random_state=101)
autoencoder.fit(X1, Y1,
                epochs=100,
                batch_size=300,
                shuffle=True,
                verbose=0,
                validation_data=(X2, Y2))

# Keep only the encoder half and use its 3D bottleneck as features.
encoder = Model(input_layer, encoded)
X_ae = encoder.predict(X)

forest_test(X_ae, Y)
[[1238   36]
 [  67 1097]]
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1274
           1       0.97      0.94      0.96      1164

   micro avg       0.96      0.96      0.96      2438
   macro avg       0.96      0.96      0.96      2438
weighted avg       0.96      0.96      0.96      2438
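The encoder above is the simplest possible design: one dense layer straight down to three units. A deeper, funnel-shaped variant often compresses high-dimensional one-hot features more gracefully; a sketch with arbitrarily chosen layer widths (the 64 and 16 are illustrative, not from the original):

# Hypothetical deeper autoencoder; layer widths are arbitrary choices.
input_layer = Input(shape=(X.shape[1],))
h = Dense(64, activation='relu')(input_layer)
h = Dense(16, activation='relu')(h)
bottleneck = Dense(3, activation='relu')(h)
h = Dense(16, activation='relu')(bottleneck)
h = Dense(64, activation='relu')(h)
output_layer = Dense(X.shape[1], activation='sigmoid')(h)

deep_autoencoder = Model(input_layer, output_layer)
deep_autoencoder.compile(optimizer='adam', loss='mse')
deep_autoencoder.fit(X1, Y1, epochs=100, batch_size=300,
                     shuffle=True, verbose=0, validation_data=(X2, Y2))

# Benchmark the 3D bottleneck features, as before.
deep_encoder = Model(input_layer, bottleneck)
forest_test(deep_encoder.predict(X), Y)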