
Titanic Survivor Prediction with XGBoost and Random Forest

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Preprocessing: save the test PassengerId for the submission file and keep
# Name for title extraction; drop unused identifiers and fill missing values
test_passenger_ids = test_data['PassengerId']
train_data.drop(['PassengerId', 'Ticket', 'Cabin'], axis=1, inplace=True)
test_data.drop(['PassengerId', 'Ticket', 'Cabin'], axis=1, inplace=True)
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(), inplace=True)
train_data['Embarked'].fillna('S', inplace=True)
test_data['Fare'].fillna(test_data['Fare'].mean(), inplace=True)

# Feature engineering on both train and test sets
combine = [train_data, test_data]
for data in combine:
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['IsAlone'] = 0
    data.loc[data['FamilySize'] == 1, 'IsAlone'] = 1
    # Extract the honorific ("Mr", "Mrs", ...) from Name, then drop the raw Name column
    data['Title'] = data['Name'].str.split(',', expand=True)[1].str.split('.', expand=True)[0].str.strip()
    data.drop('Name', axis=1, inplace=True)
    # Bin Fare and Age into integer codes so the columns stay numeric
    data['FareBin'] = pd.qcut(data['Fare'], 4, labels=False)
    data['AgeBin'] = pd.cut(data['Age'].astype(int), 5, labels=False)

# Encode the remaining categorical features as integers
for data in combine:
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)
    data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
    # Titles outside the common set fall back to the catch-all code 5
    data['Title'] = data['Title'].map({'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4}).fillna(5).astype(int)

# Hold out 30% of the labelled data for validation
X = train_data.drop('Survived', axis=1)
y = train_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Random forest baseline
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=1, max_features='sqrt', n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print('Random Forest Accuracy: {:.2%}'.format(rf.score(X_test, y_test)))

# XGBoost with early stopping on the same validation split
dtrain = xgb.DMatrix(X_train.values, label=y_train.values)
dvalid = xgb.DMatrix(X_test.values, label=y_test.values)
params = {'max_depth': 3, 'eta': 0.1, 'objective': 'binary:logistic', 'nthread': 4, 'eval_metric': 'error'}
evallist = [(dvalid, 'eval'), (dtrain, 'train')]
num_round = 100
xgb_model = xgb.train(params, dtrain, num_round, evallist, early_stopping_rounds=10)
# best_score is the classification error (eval_metric='error'), so accuracy = 1 - error
print('XGBoost Accuracy: {:.2%}'.format(1 - xgb_model.best_score))

# Predict survival for the Kaggle test set with the random forest
test_pred = rf.predict(test_data)

# Write the submission file
result = pd.DataFrame({'PassengerId': test_passenger_ids, 'Survived': test_pred})
result.to_csv('titanic_pred.csv', index=False)
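
The script above evaluates XGBoost on the validation split but only submits the random-forest predictions. As a minimal sketch, assuming the preprocessed test_data, the test_passenger_ids Series, and the trained xgb_model defined above, an XGBoost submission could be generated like this (the file name titanic_pred_xgb.csv is illustrative):

# Sketch: also produce a submission from the XGBoost model
dtest = xgb.DMatrix(test_data.values)
# binary:logistic returns probabilities; threshold at 0.5 to get 0/1 labels
xgb_pred = (xgb_model.predict(dtest) > 0.5).astype(int)
xgb_result = pd.DataFrame({'PassengerId': test_passenger_ids, 'Survived': xgb_pred})
xgb_result.to_csv('titanic_pred_xgb.csv', index=False)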
