Hola Platzinautas, les recomiendo que investiguen XGBoost, un modelo muy poderoso; en este ejemplo en particular obtuvo mejor desempeño que el recomendado, con un accuracy un 10 % superior. Les comparto el código con los dos modelos para que comparen. Por otro lado, los valores del XGBoost están por defecto; se pueden configurar y ajustar hiperparámetros como el número de estimators para mejorar aún más.
from sklearn.datasets import load_digits
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
# --- Data loading and preparation -------------------------------------------
# Load the Titanic dataset; Name and Fare are discarded as predictors.
df = pd.read_csv('titanic.csv')
df = df.drop(columns=['Name', 'Fare'])
df.columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
# One-hot encode Sex; drop_first leaves a single male indicator column,
# which is renamed back to 'Sex'.
df = pd.get_dummies(df, columns=['Sex'], drop_first=True).rename(columns={'Sex_male': 'Sex'})
# Features / target split, then a 75/25 train/test split (fixed seed).
y = df['Survived']
X = df.drop(columns='Survived')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a shallow decision tree (depth 2), fitted on the train split.
tree = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X_train, y_train)

# Predictions on both splits, used below for accuracy reporting.
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
# XGBoost comparison model.
# NOTE(review): the original used XGBRegressor on a binary (0/1) target.
# XGBClassifier is the appropriate estimator for classification; its
# predictions are hard 0/1 labels, so the "inverted MSE" (1 - MSE) computed
# below becomes exactly the accuracy instead of a loose proxy based on
# continuous regression outputs.
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

model = XGBClassifier()  # default hyperparameters; tune n_estimators etc. for more gains
model.fit(X_train, y_train)
y_train_pre = model.predict(X_train)
y_test_pre = model.predict(X_test)
from sklearn.metrics import mean_squared_error

# Evaluate the XGBoost predictions. "Inverted MSE" (1 - MSE) is reported so
# that higher is better, mirroring how an accuracy score reads.
mse_train = mean_squared_error(y_train, y_train_pre)
mse_test = mean_squared_error(y_test, y_test_pre)
inverted_mse_train = 1 - mse_train
inverted_mse_test = 1 - mse_test
for split_label, score in (("Train", inverted_mse_train), ("Test", inverted_mse_test)):
    print(f"Inverted MSE ({split_label}):", score)
# Decision-tree accuracy on both splits, for comparison with XGBoost above.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
# NOTE(review): removed a stray debug `print(X_train)` that dumped the whole
# training DataFrame to stdout and added nothing to the comparison.
print('El accuracy en train es: ', train_accuracy)
print('El accuracy en test es: ', test_accuracy)
# Visualize which features the decision tree relied on, as a bar chart of
# its feature importances (one bar per column of X).
importance_frame = pd.DataFrame([tree.feature_importances_], columns=X.columns)
sns.barplot(importance_frame, palette='bright', saturation=2.0, linewidth=2)
plt.show()
¿Quieres ver más aportes, preguntas y respuestas de la comunidad?