# 特征工程与特征选择

## 1. 技术分析

### 1.1 特征工程概述

特征工程是机器学习成功的关键。

```text
特征工程流程
    特征提取: 从原始数据提取特征
    特征转换: 转换特征格式
    特征选择: 选择重要特征
    特征生成: 创建新特征

特征类型:
    数值特征: 连续值
    类别特征: 离散值
    时间特征: 时间相关
    文本特征: 文本数据
```

### 1.2 特征选择方法

```text
特征选择技术
    过滤法: 基于统计指标
    包裹法: 基于模型性能
    嵌入法: 模型内置选择

选择标准:
    相关性
    互信息
    方差阈值
```

### 1.3 特征工程重要性

| 阶段 | 重要性 | 时间占比 |
| --- | --- | --- |
| 特征工程 | 80% | 60% |
| 模型选择 | 15% | 20% |
| 参数调优 | 5% | 20% |

## 2. 核心功能实现

### 2.1 特征提取

```python
import pandas as pd
import numpy as np

class FeatureExtractor:
    def __init__(self):
        pass

    def extract_time_features(self, df, datetime_column):
        df = df.copy()
        df[datetime_column] = pd.to_datetime(df[datetime_column])

        df['year'] = df[datetime_column].dt.year
        df['month'] = df[datetime_column].dt.month
        df['day'] = df[datetime_column].dt.day
        df['hour'] = df[datetime_column].dt.hour
        df['dayofweek'] = df[datetime_column].dt.dayofweek
        df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

        return df

    def extract_text_features(self, df, text_column):
        df = df.copy()

        df['text_length'] = df[text_column].apply(len)
        df['word_count'] = df[text_column].apply(lambda x: len(x.split()))
        df['unique_words'] = df[text_column].apply(lambda x: len(set(x.split())))

        return df

    def extract_statistical_features(self, df, numeric_columns):
        df = df.copy()

        df['mean_value'] = df[numeric_columns].mean(axis=1)
        df['max_value'] = df[numeric_columns].max(axis=1)
        df['min_value'] = df[numeric_columns].min(axis=1)
        df['range_value'] = df['max_value'] - df['min_value']
        df['std_value'] = df[numeric_columns].std(axis=1)

        return df
```

### 2.2 特征转换

```python
class FeatureTransformer:
    def __init__(self):
        pass

    def log_transform(self, df, columns):
        df = df.copy()
        for col in columns:
            df[col] = np.log1p(df[col])
        return df

    def power_transform(self, df, columns, power=0.5):
        df = df.copy()
        for col in columns:
            df[col] = np.power(df[col], power)
        return df

    def binning(self, df, column, bins=5, labels=None):
        df = df.copy()
        if labels is None:
            labels = [f'bin_{i}' for i in range(bins)]
        df[f'{column}_bin'] = pd.cut(df[column], bins=bins, labels=labels)
        return df

    def interaction_features(self, df, columns):
        df = df.copy()
        for i, col1 in enumerate(columns):
            for j, col2 in enumerate(columns):
                if i < j:
                    df[f'{col1}_{col2}_interact'] = df[col1] * df[col2]
        return df
```

### 2.3 特征选择

```python
from sklearn.feature_selection import SelectKBest, mutual_info_regression, f_regression
from sklearn.ensemble import RandomForestClassifier

class FeatureSelector:
    def __init__(self):
        pass

    def select_by_variance(self, df, threshold=0.01):
        from sklearn.feature_selection import VarianceThreshold

        selector = VarianceThreshold(threshold=threshold)
        selector.fit(df)

        selected_columns = df.columns[selector.get_support()]
        return list(selected_columns)

    def select_by_correlation(self, df, target_column, threshold=0.5):
        corr_matrix = df.corr()
        target_corr = corr_matrix[target_column].abs()

        selected_columns = target_corr[target_corr > threshold].index.tolist()
        selected_columns.remove(target_column)

        return selected_columns

    def select_by_mutual_info(self, X, y, k=10):
        selector = SelectKBest(score_func=mutual_info_regression, k=k)
        selector.fit(X, y)

        return X.columns[selector.get_support()].tolist()

    def select_by_importance(self, X, y, n_features=10):
        model = RandomForestClassifier()
        model.fit(X, y)

        importances = pd.Series(model.feature_importances_, index=X.columns)
        selected = importances.nlargest(n_features).index.tolist()

        return selected

    def select_by_recursive_elimination(self, X, y, estimator, n_features_to_select=10):
        from sklearn.feature_selection import RFE

        selector = RFE(estimator, n_features_to_select=n_features_to_select)
        selector.fit(X, y)

        return X.columns[selector.get_support()].tolist()
```

### 2.4 特征生成

```python
class FeatureGenerator:
    def __init__(self):
        pass

    def generate_polynomial_features(self, df, columns, degree=2):
        from sklearn.preprocessing import PolynomialFeatures

        poly = PolynomialFeatures(degree=degree, include_bias=False)
        poly_features = poly.fit_transform(df[columns])

        feature_names = poly.get_feature_names_out(columns)
        poly_df = pd.DataFrame(poly_features, columns=feature_names)

        return pd.concat([df, poly_df], axis=1)

    def generate_clustering_features(self, df, columns, n_clusters=5):
        from sklearn.cluster import KMeans

        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        df['cluster'] = kmeans.fit_predict(df[columns])

        return df

    def generate_aggregation_features(self, df, group_column, agg_columns, agg_funcs=['mean', 'sum']):
        agg_dict = {col: agg_funcs for col in agg_columns}
        aggregated = df.groupby(group_column).agg(agg_dict)
        aggregated.columns = ['_'.join(col).strip() for col in aggregated.columns.values]

        return df.merge(aggregated, on=group_column, how='left')
```

## 3. 性能对比

### 3.1 特征选择方法对比

| 方法 | 复杂度 | 效果 | 适用场景 |
| --- | --- | --- | --- |
| 方差阈值 | 低 | 中 | 数据预处理 |
| 相关性分析 | 低 | 中 | 线性模型 |
| 互信息 | 中 | 高 | 非线性关系 |
| 随机森林 | 高 | 很高 | 通用 |

### 3.2 特征转换方法对比

| 方法 | 适用数据 | 效果 | 复杂度 |
| --- | --- | --- | --- |
| 对数变换 | 偏态数据 | 中 | 低 |
| 幂变换 | 非线性关系 | 高 | 低 |
| 分箱 | 连续数据 | 中 | 低 |

### 3.3 特征生成方法对比

| 方法 | 生成特征数 | 复杂度 | 风险 |
| --- | --- | --- | --- |
| 多项式 | 指数增长 | 中 | 过拟合 |
| 聚类 | 线性增长 | 中 | 不稳定 |
| 聚合 | 线性增长 | 低 | 数据泄露 |

## 4. 最佳实践

### 4.1 特征工程流程

```python
def feature_engineering_pipeline(df):
    # 1. 特征提取
    extractor = FeatureExtractor()
    df = extractor.extract_time_features(df, 'timestamp')
    df = extractor.extract_statistical_features(df, ['feature1', 'feature2'])

    # 2. 特征转换
    transformer = FeatureTransformer()
    df = transformer.log_transform(df, ['price'])
    df = transformer.binning(df, 'age', bins=5)

    # 3. 特征生成
    generator = FeatureGenerator()
    df = generator.generate_aggregation_features(df, 'user_id', ['amount'])

    # 4. 特征选择
    selector = FeatureSelector()
    selected_columns = selector.select_by_importance(df.drop('target', axis=1), df['target'])

    return df[selected_columns + ['target']]
```

### 4.2 特征重要性可视化

```python
def plot_feature_importance(model, feature_names, top_n=10):
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1][:top_n]

    plt.figure(figsize=(10, 6))
    plt.title('Top Feature Importances')
    plt.bar(range(top_n), importances[indices])
    plt.xticks(range(top_n), [feature_names[i] for i in indices], rotation=45)
    plt.tight_layout()
    plt.show()
```

## 5. 总结

特征工程是机器学习的核心：

- 特征提取：从原始数据提取信息
- 特征转换：优化特征分布
- 特征选择：选择最有信息量的特征
- 特征生成：创建新的特征

对比数据如下：

- 特征工程决定80%的模型性能
- 随机森林特征重要性最可靠
- 避免特征爆炸导致过拟合
- 推荐使用多种方法组合

优秀的特征工程是模型成功的关键，需要领域知识和经验。