import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
# Load the data
df = pd.read_csv("data.csv")

# Split the records by the value of the 'Year' column (2018 and 2019)
data_2018 = df[df['Year'] == 2018]
data_2019 = df[df['Year'] == 2019]
# Notebook-style preview: the returned frame is discarded when this file is
# run as a plain script.
data_2019.head()

# 2019 rows whose CLIENTNUM also appears in the 2018 rows. `.copy()` makes the
# result independent of `data_2019`.
# NOTE(review): not referenced again in the visible part of this file.
existing_customers_2019 = data_2019[data_2019['CLIENTNUM'].isin(data_2018['CLIENTNUM'])].copy()

# 2019 rows with Attrition_Flag == 1
Attrited_customers = data_2019[data_2019['Attrition_Flag'] == 1]

# Remove the columns that the regressions below do not need. Per the recorded
# `info()` output, what remains is CLIENTNUM, Gender, Income_Category,
# Card_Category, Credit_Limit, Total_Revolving_Bal, Avg_Open_To_Buy and
# Avg_Utilization_Ratio.
Attrited_customers = Attrited_customers.drop(
    [
        'Attrition_Flag',
        'Customer_Age',
        'Dependent_count',
        'Education_Level',
        'Marital_Status',
        'Months_on_book',
        'Total_Relationship_Count',
        'Months_Inactive_12_mon',
        'Contacts_Count_12_mon',
        'Total_Trans_Ct',
        'Quarter',
        'Year',
        'Date_Leave',
    ],
    axis=1,
)
Attrited_customers.info()
# Output recorded from an earlier run:
#   <class 'pandas.core.frame.DataFrame'>
#   Index: 945 entries, 22 to 19935
#   Data columns (total 8 columns):
#    #   Column                 Non-Null Count  Dtype
#   ---  ------                 --------------  -----
#    0   CLIENTNUM              945 non-null    int64
#    1   Gender                 945 non-null    int64
#    2   Income_Category        945 non-null    int64
#    3   Card_Category          945 non-null    int64
#    4   Credit_Limit           945 non-null    float64
#    5   Total_Revolving_Bal    945 non-null    int64
#    6   Avg_Open_To_Buy        945 non-null    float64
#    7   Avg_Utilization_Ratio  945 non-null    float64
#   dtypes: float64(3), int64(5)
#   memory usage: 66.4 KB
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Simple linear regression on the attrited customers:
#   predictor (X) = Total_Revolving_Bal, response (y) = Avg_Utilization_Ratio.
# Both are kept as single-column DataFrames, so the fitted intercept and
# coefficient are printed as arrays.
Total_Revolving_Bal = pd.DataFrame(Attrited_customers['Total_Revolving_Bal'])  # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)

# Split the Dataset into Train (75%) and Test (25%).
# A fixed random_state makes the split, and therefore the metrics, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Total_Revolving_Bal, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Linear Regression using Train Data
linreg = LinearRegression()   # create the linear regression object
linreg.fit(X_train, y_train)  # train the linear regression model

# Coefficients of the Linear Regression line (y = a * x + b)
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Predict Avg_Utilization_Ratio from Total_Revolving_Bal
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()

# Plot the Predictions vs the True values; the straight line is y = x,
# i.e. where a perfect prediction would fall.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color="blue")
axes[0].plot(y_train, y_train, 'b', linewidth=1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color="green")
axes[1].plot(y_test, y_test, 'b', linewidth=1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()
# Output recorded from an earlier run made without a fixed random_state
# (a fresh run with the seeded split will differ slightly):
#   Intercept of Regression     : b =  [0.02162967]
#   Coefficients of Regression  : a =  [[0.00023298]]
#   Goodness of Fit of Model    Train Dataset
#   Explained Variance (R^2)    : 0.23609549399620822
#   Mean Squared Error (MSE)    : 0.024699462444787504
#   Goodness of Fit of Model    Test Dataset
#   Explained Variance (R^2)    : 0.26504101111698075
#   Mean Squared Error (MSE)    : 0.021625155158817442
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Simple linear regression on the attrited customers:
#   predictor (X) = Avg_Open_To_Buy, response (y) = Avg_Utilization_Ratio.
# Both are kept as single-column DataFrames, so the fitted intercept and
# coefficient are printed as arrays.
Avg_Open_To_Buy = pd.DataFrame(Attrited_customers['Avg_Open_To_Buy'])  # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)

# Split the Dataset into Train (75%) and Test (25%).
# A fixed random_state makes the split, and therefore the metrics, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Avg_Open_To_Buy, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Linear Regression using Train Data
linreg = LinearRegression()   # create the linear regression object
linreg.fit(X_train, y_train)  # train the linear regression model

# Coefficients of the Linear Regression line (y = a * x + b)
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Predict Avg_Utilization_Ratio from Avg_Open_To_Buy
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()

# Plot the Predictions vs the True values; the straight line is y = x,
# i.e. where a perfect prediction would fall.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color="blue")
axes[0].plot(y_train, y_train, 'b', linewidth=1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color="green")
axes[1].plot(y_test, y_test, 'b', linewidth=1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()
# Output recorded from an earlier run made without a fixed random_state
# (a fresh run with the seeded split will differ slightly):
#   Intercept of Regression     : b =  [0.30571807]
#   Coefficients of Regression  : a =  [[-1.25929131e-05]]
#   Goodness of Fit of Model    Train Dataset
#   Explained Variance (R^2)    : 0.34616225654133825
#   Mean Squared Error (MSE)    : 0.02102805583561062
#   Goodness of Fit of Model    Test Dataset
#   Explained Variance (R^2)    : 0.31723370603164236
#   Mean Squared Error (MSE)    : 0.020322085519485857
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Simple linear regression on the attrited customers:
#   predictor (X) = Credit_Limit, response (y) = Avg_Utilization_Ratio.
# Both are kept as single-column DataFrames, so the fitted intercept and
# coefficient are printed as arrays.
Credit_Limit = pd.DataFrame(Attrited_customers['Credit_Limit'])  # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)

# Split the Dataset into Train (75%) and Test (25%).
# A fixed random_state makes the split, and therefore the metrics, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Credit_Limit, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Linear Regression using Train Data
linreg = LinearRegression()   # create the linear regression object
linreg.fit(X_train, y_train)  # train the linear regression model

# Coefficients of the Linear Regression line (y = a * x + b)
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Predict Avg_Utilization_Ratio from Credit_Limit
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()

# Plot the Predictions vs the True values; the straight line is y = x,
# i.e. where a perfect prediction would fall.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color="blue")
axes[0].plot(y_train, y_train, 'b', linewidth=1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color="green")
axes[1].plot(y_test, y_test, 'b', linewidth=1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()
# Output recorded from an earlier run made without a fixed random_state
# (a fresh run with the seeded split will differ slightly):
#   Intercept of Regression     : b =  [0.31886092]
#   Coefficients of Regression  : a =  [[-1.25557998e-05]]
#   Goodness of Fit of Model    Train Dataset
#   Explained Variance (R^2)    : 0.3660412485094191
#   Mean Squared Error (MSE)    : 0.02015456433086732
#   Goodness of Fit of Model    Test Dataset
#   Explained Variance (R^2)    : 0.322504949330932
#   Mean Squared Error (MSE)    : 0.021175292413368064
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Polynomial regression on the attrited customers:
#   predictor (X) = Avg_Open_To_Buy, response (y) = Avg_Utilization_Ratio.
Avg_Open_To_Buy = pd.DataFrame(Attrited_customers['Avg_Open_To_Buy'])  # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)

# Split the Dataset into Train (75%) and Test (25%).
# A fixed random_state makes the split, and therefore the metrics, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Avg_Open_To_Buy, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Transform the data to include polynomial terms (try a higher degree for more
# curve). The expansion is fitted on the training predictor only and then
# re-applied to the test predictor.
degree = 4  # Adjust as needed for the best fit
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Polynomial Regression using Train Data (a linear model on the expanded terms)
linreg = LinearRegression()
linreg.fit(X_train_poly, y_train)

# Predict Avg_Utilization_Ratio from Avg_Open_To_Buy
y_train_pred = linreg.predict(X_train_poly)
y_test_pred = linreg.predict(X_test_poly)

# Check the Goodness of Fit (on Train and Test Data)
print("Goodness of Fit of Model (Train)")
print("R^2:", linreg.score(X_train_poly, y_train))
print("MSE:", mean_squared_error(y_train, y_train_pred))
print()
print("Goodness of Fit of Model (Test)")
print("R^2:", linreg.score(X_test_poly, y_test))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print()

# Sort the test predictor so the fitted curve is drawn left to right.
# Sorting with pandas keeps the column name, so `poly.transform` no longer
# emits the "X does not have valid feature names" UserWarning that the
# previous `np.sort` version produced.
X_test_sorted = X_test.sort_values(by='Avg_Open_To_Buy')
X_test_poly_sorted = poly.transform(X_test_sorted)
y_test_pred_sorted = linreg.predict(X_test_poly_sorted)

# Plot the polynomial regression line with the original data
plt.figure(figsize=(12, 6))
plt.scatter(Avg_Open_To_Buy, Avg_Utilization_Ratio, color='blue', label="Original Data")
plt.plot(X_test_sorted.to_numpy(), y_test_pred_sorted, color='red', label=f"Polynomial Regression (degree {degree})")
plt.xlabel("Avg Open To Buy")
plt.ylabel("Avg Utilization Ratio")
plt.legend()
plt.show()
# Output recorded from an earlier run made without a fixed random_state
# (a fresh run with the seeded split will differ slightly):
#   Goodness of Fit of Model (Train)
#   R^2: 0.5120732473508691
#   MSE: 0.01576471690849705
#   Goodness of Fit of Model (Test)
#   R^2: 0.5221168866301593
#   MSE: 0.014176062367685874
# Polynomial regression on the attrited customers:
#   predictor (X) = Credit_Limit, response (y) = Avg_Utilization_Ratio.
Credit_Limit = pd.DataFrame(Attrited_customers['Credit_Limit'])  # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)

# Split the Dataset into Train (75%) and Test (25%).
# A fixed random_state makes the split, and therefore the metrics, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Credit_Limit, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Transform the data to include polynomial terms (try a higher degree for more
# curve). The expansion is fitted on the training predictor only and then
# re-applied to the test predictor.
degree = 4  # Adjust as needed for the best fit
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Polynomial Regression using Train Data (a linear model on the expanded terms)
linreg = LinearRegression()
linreg.fit(X_train_poly, y_train)

# Predict Avg_Utilization_Ratio from Credit_Limit
y_train_pred = linreg.predict(X_train_poly)
y_test_pred = linreg.predict(X_test_poly)

# Check the Goodness of Fit (on Train and Test Data)
print("Goodness of Fit of Model (Train)")
print("R^2:", linreg.score(X_train_poly, y_train))
print("MSE:", mean_squared_error(y_train, y_train_pred))
print()
print("Goodness of Fit of Model (Test)")
print("R^2:", linreg.score(X_test_poly, y_test))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print()

# Sort the test predictor so the fitted curve is drawn left to right.
# Sorting with pandas keeps the column name, so `poly.transform` no longer
# emits the "X does not have valid feature names" UserWarning that the
# previous `np.sort` version produced.
X_test_sorted = X_test.sort_values(by='Credit_Limit')
X_test_poly_sorted = poly.transform(X_test_sorted)
y_test_pred_sorted = linreg.predict(X_test_poly_sorted)

# Plot the polynomial regression line with the original data
plt.figure(figsize=(12, 6))
plt.scatter(Credit_Limit, Avg_Utilization_Ratio, color='blue', label="Original Data")
plt.plot(X_test_sorted.to_numpy(), y_test_pred_sorted, color='red', label=f"Polynomial Regression (degree {degree})")
plt.xlabel("Credit_Limit")
plt.ylabel("Avg Utilization Ratio")
plt.legend()
plt.show()
# Output recorded from an earlier run made without a fixed random_state
# (a fresh run with the seeded split will differ slightly):
#   Goodness of Fit of Model (Train)
#   R^2: 0.5664850144467057
#   MSE: 0.013820477786514745
#   Goodness of Fit of Model (Test)
#   R^2: 0.6381425414506026
#   MSE: 0.011231923059718292
# Polynomial regression on the attrited customers:
#   predictor (X) = Total_Revolving_Bal, response (y) = Avg_Utilization_Ratio.
Total_Revolving_Bal = pd.DataFrame(Attrited_customers['Total_Revolving_Bal'])  # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)

# Split the Dataset into Train (75%) and Test (25%).
# A fixed random_state makes the split, and therefore the metrics, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Total_Revolving_Bal, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Transform the data to include polynomial terms (try a higher degree for more
# curve). The expansion is fitted on the training predictor only and then
# re-applied to the test predictor.
degree = 3  # Adjust as needed for the best fit
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Polynomial Regression using Train Data (a linear model on the expanded terms)
linreg = LinearRegression()
linreg.fit(X_train_poly, y_train)

# Predict Avg_Utilization_Ratio from Total_Revolving_Bal
y_train_pred = linreg.predict(X_train_poly)
y_test_pred = linreg.predict(X_test_poly)

# Check the Goodness of Fit (on Train and Test Data)
print("Goodness of Fit of Model (Train)")
print("R^2:", linreg.score(X_train_poly, y_train))
print("MSE:", mean_squared_error(y_train, y_train_pred))
print()
print("Goodness of Fit of Model (Test)")
print("R^2:", linreg.score(X_test_poly, y_test))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print()

# Sort the test predictor so the fitted curve is drawn left to right.
# Sorting with pandas keeps the column name, so `poly.transform` no longer
# emits the "X does not have valid feature names" UserWarning that the
# previous `np.sort` version produced.
X_test_sorted = X_test.sort_values(by='Total_Revolving_Bal')
X_test_poly_sorted = poly.transform(X_test_sorted)
y_test_pred_sorted = linreg.predict(X_test_poly_sorted)

# Plot the polynomial regression line with the original data
plt.figure(figsize=(12, 6))
plt.scatter(Total_Revolving_Bal, Avg_Utilization_Ratio, color='blue', label="Original Data")
plt.plot(X_test_sorted.to_numpy(), y_test_pred_sorted, color='red', label=f"Polynomial Regression (degree {degree})")
plt.xlabel("Total_Revolving_Bal")
plt.ylabel("Avg Utilization Ratio")
plt.legend()
plt.show()
# Output recorded from an earlier run made without a fixed random_state
# (a fresh run with the seeded split will differ slightly):
#   Goodness of Fit of Model (Train)
#   R^2: 0.2558441943953035
#   MSE: 0.02441768749007164
#   Goodness of Fit of Model (Test)
#   R^2: 0.19775500829824022
#   MSE: 0.02222459861821506