Randomized-Search
Sun 29 June 2025
import pyutil as pyu
# Display local Python environment info; the cell output below shows the
# conda env path and interpreter version. (pyutil is presumably a local
# helper module — not visible here.)
pyu.get_local_pyinfo()
'conda env: C:\\Users\\Afia Jahan\\anaconda3\\envs\\py312; pyv: 3.12.11 | packaged by Anaconda, Inc. | (main, Jun 5 2025, 12:58:53) [MSC v.1929 64 bit (AMD64)]'
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from scipy.stats import randint
# Load the Titanic dataset
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
data = pd.read_csv(url)

# Preprocessing
# Select features and target
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Take an explicit copy: `data[features]` is a slice of `data`, and assigning
# into a slice triggers pandas' SettingWithCopyWarning and leaves it unclear
# whether `data` itself gets modified. With a copy, the imputation below
# only ever touches X.
X = data[features].copy()
y = data['Survived']

# Handle missing values: median for Age, most frequent value for Embarked
X['Age'] = X['Age'].fillna(X['Age'].median())
X['Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0])

# Convert categorical variables to numerical (one-hot, dropping the first
# level of each to avoid a redundant column)
X = pd.get_dummies(X, columns=['Sex', 'Embarked'], drop_first=True)

# Train-test split: hold out 20% of the rows, with a fixed seed so the split
# is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Base estimator whose hyperparameters will be tuned
model = RandomForestClassifier(random_state=42)

# Search space. scipy's randint(low, high) draws integers uniformly from
# low up to high - 1 (the upper bound is exclusive).
param_distributions = dict(
    n_estimators=randint(100, 500),     # 100..499 trees
    max_depth=randint(5, 20),           # depth 5..19
    min_samples_split=randint(2, 10),   # 2..9 samples
)

# Randomized search: 50 sampled combinations, each scored by accuracy with
# 5-fold cross-validation, run in parallel on all available cores.
random_search = RandomizedSearchCV(
    model,
    param_distributions,
    scoring='accuracy',
    cv=5,
    n_iter=50,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)
RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
n_iter=50, n_jobs=-1,
param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x00000173E76B6DE0>,
'min_samples_split': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x00000173E79D85C0>,
'n_estimators': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x00000173E768D610>},
random_state=42, scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render; please try loading this page with nbviewer.org.
Parameters
| estimator | RandomForestC...ndom_state=42) | |
| param_distributions | {'max_depth': <scipy.stats....00173E76B6DE0>, 'min_samples_split': <scipy.stats....00173E79D85C0>, 'n_estimators': <scipy.stats....00173E768D610>} | |
| n_iter | 50 | |
| scoring | 'accuracy' | |
| n_jobs | -1 | |
| refit | True | |
| cv | 5 | |
| verbose | 0 | |
| pre_dispatch | '2*n_jobs' | |
| random_state | 42 | |
| error_score | nan | |
| return_train_score | False |
RandomForestClassifier(max_depth=7, n_estimators=491, random_state=42)
Parameters
| n_estimators | 491 | |
| criterion | 'gini' | |
| max_depth | 7 | |
| min_samples_split | 2 | |
| min_samples_leaf | 1 | |
| min_weight_fraction_leaf | 0.0 | |
| max_features | 'sqrt' | |
| max_leaf_nodes | None | |
| min_impurity_decrease | 0.0 | |
| bootstrap | True | |
| oob_score | False | |
| n_jobs | None | |
| random_state | 42 | |
| verbose | 0 | |
| warm_start | False | |
| class_weight | None | |
| ccp_alpha | 0.0 | |
| max_samples | None | |
| monotonic_cst | None |
# Report the winning hyperparameters and their mean cross-validated score
for label, value in (
    ("Best Parameters:", random_search.best_params_),
    ("Best Score:", random_search.best_score_),
):
    print(label, value)
Best Parameters: {'max_depth': 7, 'min_samples_split': 2, 'n_estimators': 491}
Best Score: 0.8342263370432385
# Evaluate on the test set, using the best estimator found by the search
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
Test Accuracy: 0.8156424581005587
Score: 10
Category: basics