# 1. Feature engineering: turn two-valued text columns into numeric 0/1 codes.
# Each entry is (new numeric column, source text column, value -> code lookup).
# Source column names are used exactly as written here (including
# 'Propert_Owner').
# NOTE: with pandas' Series.map, any value absent from the lookup becomes NaN
# rather than raising, so unexpected categories surface as missing codes.
_YES_NO = {'Y': 1, 'N': 0}
_ENCODINGS = (
    ('Gender_Code', 'GENDER', {'M': 1, 'F': 0}),
    ('Car_Owner_Code', 'Car_Owner', _YES_NO),
    ('Prop_Owner_Code', 'Propert_Owner', _YES_NO),
)
for _code_col, _raw_col, _lookup in _ENCODINGS:
    merged_df[_code_col] = merged_df[_raw_col].map(_lookup)
# 2. Predictor columns fed to the model, in the order they will appear as
# columns of the design matrix.
features = [
    # Numeric applicant attributes
    'Annual_income_USD',
    'Age',
    'Family_Members',
    'Years_Employed',
    'CHILDREN',         # number of children

    # Encoded 0/1 columns
    'Gender_Code',      # 1 = male, 0 = female
    'Car_Owner_Code',   # 1 = owns a car
    'Prop_Owner_Code',  # 1 = owns property

    # Contact-channel flags
    'Mobile_phone',     # 1 = has a mobile phone
    'Work_Phone',       # 1 = has a work phone
    'Phone',            # 1 = has a landline
    'EMAIL_ID',         # 1 = has an email address
]
# 3. Build the design matrix and the target vector.
# A row is kept only if every predictor AND the target are present; the
# .copy() detaches the result from merged_df so later edits to data_clean
# cannot touch the original frame.
_required_cols = [*features, 'is_approved']
_complete_rows = merged_df.dropna(subset=_required_cols)
data_clean = _complete_rows.copy()

# Predictors as a 2-D array (one column per entry in `features`, same order)
# and the target as a 1-D array aligned row-for-row with it.
y_data = data_clean['is_approved'].values
X_data = data_clean[features].values
# 4. Standardize every predictor column (subtract its mean, divide by its
# standard deviation). Without this, columns measured on large scales
# (e.g. income in the tens of thousands) would dominate columns measured on
# small scales (e.g. years employed in single digits).
X_mean, X_std = X_data.mean(axis=0), X_data.std(axis=0)

# A constant column (e.g. everyone has a mobile phone) has std == 0.
# Replace that divisor with 1 so the division below is well defined; the
# centered column is all zeros anyway, so it stays all zeros after scaling.
_constant_cols = X_std == 0
X_std[_constant_cols] = 1

_X_centered = X_data - X_mean
X_scaled = _X_centered / X_std
# Named coordinates: the "predictor" dimension is labelled with the feature
# names, in the same order as the columns of the design matrix.
# NOTE(review): presumably consumed by the model definition later in the
# file -- confirm where `coords` is used.
coords = dict(predictor=features)

# Quick sanity report: which predictors are in play and how many complete
# rows survived the missing-value filter.
print(f"Running model with named dimensions: {features}")
print(f"Ready to test {len(features)} variables on {len(data_clean)} applicants.")