Python
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

# Read the full customer table from disk.
df = pd.read_csv("data.csv")

# One frame per calendar year of interest (2018 and 2019).
data_2018 = df.loc[df['Year'] == 2018]
data_2019 = df.loc[df['Year'] == 2019]

# Preview of the 2019 rows; the result is not stored.
data_2019.head()

# 2019 rows whose CLIENTNUM also appears among the 2018 rows.
existing_customers_2019 = data_2019.loc[
    data_2019['CLIENTNUM'].isin(data_2018['CLIENTNUM'])
].copy()

# 2019 rows with Attrition_Flag == 1, reduced to the client id, the
# demographic/card category codes and the credit-related columns by
# dropping everything else.
Attrited_customers = data_2019.loc[data_2019['Attrition_Flag'] == 1].drop(
    columns=[
        'Attrition_Flag',
        'Customer_Age',
        'Dependent_count',
        'Education_Level',
        'Marital_Status',
        'Months_on_book',
        'Total_Relationship_Count',
        'Months_Inactive_12_mon',
        'Contacts_Count_12_mon',
        'Total_Trans_Ct',
        'Quarter',
        'Year',
        'Date_Leave',
    ]
)
Attrited_customers.info()
<class 'pandas.core.frame.DataFrame'>
Index: 945 entries, 22 to 19935
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   CLIENTNUM              945 non-null    int64  
 1   Gender                 945 non-null    int64  
 2   Income_Category        945 non-null    int64  
 3   Card_Category          945 non-null    int64  
 4   Credit_Limit           945 non-null    float64
 5   Total_Revolving_Bal    945 non-null    int64  
 6   Avg_Open_To_Buy        945 non-null    float64
 7   Avg_Utilization_Ratio  945 non-null    float64
dtypes: float64(3), int64(5)
memory usage: 66.4 KB
Python
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Simple linear regression on the attrited customers:
# predict Avg_Utilization_Ratio (response, y) from Total_Revolving_Bal (predictor, X).
Total_Revolving_Bal = pd.DataFrame(Attrited_customers['Total_Revolving_Bal'])       # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])   # Response (y)

# Split the Dataset into Train and Test (75% / 25%).
# random_state pins the shuffle so the metrics printed below are reproducible
# from run to run instead of changing with every execution.
X_train, X_test, y_train, y_test = train_test_split(
    Total_Revolving_Bal, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Linear Regression using Train Data
linreg = LinearRegression()         # create the linear regression object
linreg.fit(X_train, y_train)        # train the linear regression model

# Coefficients of the Linear Regression line  y = a * X + b
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Predict Avg_Utilization_Ratio for the train and test predictor values
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()

# Plot the Predictions vs the True values.
# The straight line is y = x: points on it are predicted exactly.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color = "blue")
axes[0].plot(y_train, y_train, 'b', linewidth = 1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color = "green")
axes[1].plot(y_test, y_test, 'b', linewidth = 1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()
Intercept of Regression 	: b =  [0.02162967]
Coefficients of Regression 	: a =  [[0.00023298]]

Goodness of Fit of Model 	Train Dataset
Explained Variance (R^2) 	: 0.23609549399620822
Mean Squared Error (MSE) 	: 0.024699462444787504

Goodness of Fit of Model 	Test Dataset
Explained Variance (R^2) 	: 0.26504101111698075
Mean Squared Error (MSE) 	: 0.021625155158817442
Python
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Simple linear regression on the attrited customers:
# predict Avg_Utilization_Ratio (response, y) from Avg_Open_To_Buy (predictor, X).
Avg_Open_To_Buy = pd.DataFrame(Attrited_customers['Avg_Open_To_Buy'])               # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])   # Response (y)

# Split the Dataset into Train and Test (75% / 25%).
# random_state pins the shuffle so the metrics printed below are reproducible
# from run to run instead of changing with every execution.
X_train, X_test, y_train, y_test = train_test_split(
    Avg_Open_To_Buy, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Linear Regression using Train Data
linreg = LinearRegression()         # create the linear regression object
linreg.fit(X_train, y_train)        # train the linear regression model

# Coefficients of the Linear Regression line  y = a * X + b
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Predict Avg_Utilization_Ratio for the train and test predictor values
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()

# Plot the Predictions vs the True values.
# The straight line is y = x: points on it are predicted exactly.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color = "blue")
axes[0].plot(y_train, y_train, 'b', linewidth = 1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color = "green")
axes[1].plot(y_test, y_test, 'b', linewidth = 1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()
Intercept of Regression 	: b =  [0.30571807]
Coefficients of Regression 	: a =  [[-1.25929131e-05]]

Goodness of Fit of Model 	Train Dataset
Explained Variance (R^2) 	: 0.34616225654133825
Mean Squared Error (MSE) 	: 0.02102805583561062

Goodness of Fit of Model 	Test Dataset
Explained Variance (R^2) 	: 0.31723370603164236
Mean Squared Error (MSE) 	: 0.020322085519485857
Python
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Simple linear regression on the attrited customers:
# predict Avg_Utilization_Ratio (response, y) from Credit_Limit (predictor, X).
Credit_Limit = pd.DataFrame(Attrited_customers['Credit_Limit'])                     # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])   # Response (y)

# Split the Dataset into Train and Test (75% / 25%).
# random_state pins the shuffle so the metrics printed below are reproducible
# from run to run instead of changing with every execution.
X_train, X_test, y_train, y_test = train_test_split(
    Credit_Limit, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Linear Regression using Train Data
linreg = LinearRegression()         # create the linear regression object
linreg.fit(X_train, y_train)        # train the linear regression model

# Coefficients of the Linear Regression line  y = a * X + b
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Predict Avg_Utilization_Ratio for the train and test predictor values
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()

# Plot the Predictions vs the True values.
# The straight line is y = x: points on it are predicted exactly.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color = "blue")
axes[0].plot(y_train, y_train, 'b', linewidth = 1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color = "green")
axes[1].plot(y_test, y_test, 'b', linewidth = 1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()
Intercept of Regression 	: b =  [0.31886092]
Coefficients of Regression 	: a =  [[-1.25557998e-05]]

Goodness of Fit of Model 	Train Dataset
Explained Variance (R^2) 	: 0.3660412485094191
Mean Squared Error (MSE) 	: 0.02015456433086732

Goodness of Fit of Model 	Test Dataset
Explained Variance (R^2) 	: 0.322504949330932
Mean Squared Error (MSE) 	: 0.021175292413368064
Python
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Polynomial regression on the attrited customers:
# predict Avg_Utilization_Ratio (response, y) from Avg_Open_To_Buy (predictor, X).
Avg_Open_To_Buy = pd.DataFrame(Attrited_customers['Avg_Open_To_Buy'])               # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])   # Response (y)

# Split the Dataset into Train and Test (75% / 25%).
# random_state pins the shuffle so the metrics printed below are reproducible
# from run to run instead of changing with every execution.
X_train, X_test, y_train, y_test = train_test_split(
    Avg_Open_To_Buy, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Transform the data to include polynomial terms (try a higher degree for more
# curve). The expansion is fitted on the training predictor only and then
# re-applied unchanged to the test predictor.
degree =  4  # Adjust as needed for the best fit
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Polynomial Regression using Train Data
linreg = LinearRegression()
linreg.fit(X_train_poly, y_train)

# Predict Avg_Utilization_Ratio from the expanded Avg_Open_To_Buy features
y_train_pred = linreg.predict(X_train_poly)
y_test_pred = linreg.predict(X_test_poly)

# Check the Goodness of Fit (on Train and Test Data)
print("Goodness of Fit of Model (Train)")
print("R^2:", linreg.score(X_train_poly, y_train))
print("MSE:", mean_squared_error(y_train, y_train_pred))
print()
print("Goodness of Fit of Model (Test)")
print("R^2:", linreg.score(X_test_poly, y_test))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print()

# Sort the test predictor values so the fitted curve is drawn left to right.
X_test_sorted = np.sort(X_test, axis=0)
# np.sort returns a bare array without the column name; re-wrap it with the
# original column label, otherwise PolynomialFeatures warns that
# "X does not have valid feature names".
X_test_poly_sorted = poly.transform(
    pd.DataFrame(X_test_sorted, columns=X_test.columns)
)
y_test_pred_sorted = linreg.predict(X_test_poly_sorted)

# Plot the polynomial regression line with the original data
plt.figure(figsize=(12, 6))
plt.scatter(Avg_Open_To_Buy, Avg_Utilization_Ratio, color='blue', label="Original Data")
plt.plot(X_test_sorted, y_test_pred_sorted, color='red', label=f"Polynomial Regression (degree {degree})")
plt.xlabel("Avg Open To Buy")
plt.ylabel("Avg Utilization Ratio")
plt.legend()
plt.show()
Goodness of Fit of Model (Train)
R^2: 0.5120732473508691
MSE: 0.01576471690849705

Goodness of Fit of Model (Test)
R^2: 0.5221168866301593
MSE: 0.014176062367685874
/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but PolynomialFeatures was fitted with feature names
  warnings.warn(
Python
# Polynomial regression on the attrited customers:
# predict Avg_Utilization_Ratio (response, y) from Credit_Limit (predictor, X).
Credit_Limit = pd.DataFrame(Attrited_customers['Credit_Limit'])                     # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])   # Response (y)

# Split the Dataset into Train and Test (75% / 25%).
# random_state pins the shuffle so the metrics printed below are reproducible
# from run to run instead of changing with every execution.
X_train, X_test, y_train, y_test = train_test_split(
    Credit_Limit, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Transform the data to include polynomial terms (try a higher degree for more
# curve). The expansion is fitted on the training predictor only and then
# re-applied unchanged to the test predictor.
degree = 4  # Adjust as needed for the best fit
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Polynomial Regression using Train Data
linreg = LinearRegression()
linreg.fit(X_train_poly, y_train)

# Predict Avg_Utilization_Ratio from the expanded Credit_Limit features
y_train_pred = linreg.predict(X_train_poly)
y_test_pred = linreg.predict(X_test_poly)

# Check the Goodness of Fit (on Train and Test Data)
print("Goodness of Fit of Model (Train)")
print("R^2:", linreg.score(X_train_poly, y_train))
print("MSE:", mean_squared_error(y_train, y_train_pred))
print()
print("Goodness of Fit of Model (Test)")
print("R^2:", linreg.score(X_test_poly, y_test))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print()

# Sort the test predictor values so the fitted curve is drawn left to right.
X_test_sorted = np.sort(X_test, axis=0)
# np.sort returns a bare array without the column name; re-wrap it with the
# original column label, otherwise PolynomialFeatures warns that
# "X does not have valid feature names".
X_test_poly_sorted = poly.transform(
    pd.DataFrame(X_test_sorted, columns=X_test.columns)
)
y_test_pred_sorted = linreg.predict(X_test_poly_sorted)

# Plot the polynomial regression line with the original data
plt.figure(figsize=(12, 6))
plt.scatter(Credit_Limit, Avg_Utilization_Ratio, color='blue', label="Original Data")
plt.plot(X_test_sorted, y_test_pred_sorted, color='red', label=f"Polynomial Regression (degree {degree})")
plt.xlabel("Credit_Limit")
plt.ylabel("Avg Utilization Ratio")
plt.legend()
plt.show()
Goodness of Fit of Model (Train)
R^2: 0.5664850144467057
MSE: 0.013820477786514745

Goodness of Fit of Model (Test)
R^2: 0.6381425414506026
MSE: 0.011231923059718292
/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but PolynomialFeatures was fitted with feature names
  warnings.warn(
Python
# Polynomial regression on the attrited customers:
# predict Avg_Utilization_Ratio (response, y) from Total_Revolving_Bal (predictor, X).
Total_Revolving_Bal = pd.DataFrame(Attrited_customers['Total_Revolving_Bal'])       # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])   # Response (y)

# Split the Dataset into Train and Test (75% / 25%).
# random_state pins the shuffle so the metrics printed below are reproducible
# from run to run instead of changing with every execution.
X_train, X_test, y_train, y_test = train_test_split(
    Total_Revolving_Bal, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Transform the data to include polynomial terms (try a higher degree for more
# curve). The expansion is fitted on the training predictor only and then
# re-applied unchanged to the test predictor.
degree = 3  # Adjust as needed for the best fit
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Polynomial Regression using Train Data
linreg = LinearRegression()
linreg.fit(X_train_poly, y_train)

# Predict Avg_Utilization_Ratio from the expanded Total_Revolving_Bal features
y_train_pred = linreg.predict(X_train_poly)
y_test_pred = linreg.predict(X_test_poly)

# Check the Goodness of Fit (on Train and Test Data)
print("Goodness of Fit of Model (Train)")
print("R^2:", linreg.score(X_train_poly, y_train))
print("MSE:", mean_squared_error(y_train, y_train_pred))
print()
print("Goodness of Fit of Model (Test)")
print("R^2:", linreg.score(X_test_poly, y_test))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print()

# Sort the test predictor values so the fitted curve is drawn left to right.
X_test_sorted = np.sort(X_test, axis=0)
# np.sort returns a bare array without the column name; re-wrap it with the
# original column label, otherwise PolynomialFeatures warns that
# "X does not have valid feature names".
X_test_poly_sorted = poly.transform(
    pd.DataFrame(X_test_sorted, columns=X_test.columns)
)
y_test_pred_sorted = linreg.predict(X_test_poly_sorted)

# Plot the polynomial regression line with the original data
plt.figure(figsize=(12, 6))
plt.scatter(Total_Revolving_Bal, Avg_Utilization_Ratio, color='blue', label="Original Data")
plt.plot(X_test_sorted, y_test_pred_sorted, color='red', label=f"Polynomial Regression (degree {degree})")
plt.xlabel("Total_Revolving_Bal")
plt.ylabel("Avg Utilization Ratio")
plt.legend()
plt.show()
Goodness of Fit of Model (Train)
R^2: 0.2558441943953035
MSE: 0.02441768749007164

Goodness of Fit of Model (Test)
R^2: 0.19775500829824022
MSE: 0.02222459861821506
/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but PolynomialFeatures was fitted with feature names
  warnings.warn(