import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

# Read the raw customer records from disk.
df = pd.read_csv("data.csv")

# Split out the rows belonging to 2018 and to 2019.
data_2018 = df[df['Year'] == 2018]
data_2019 = df[df['Year'] == 2019]
data_2019.head()

# 2019 rows whose CLIENTNUM also appears among the 2018 rows
# (copied so later edits do not touch data_2019).
seen_in_2018 = data_2019['CLIENTNUM'].isin(data_2018['CLIENTNUM'])
existing_customers_2019 = data_2019[seen_in_2018].copy()

# Columns removed before modelling the attrited customers.
columns_to_discard = [
    'Attrition_Flag',
    'Customer_Age',
    'Dependent_count',
    'Education_Level',
    'Marital_Status',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Total_Trans_Ct',
    'Quarter',
    'Year',
    'Date_Leave',
]

# 2019 rows flagged with Attrition_Flag == 1, minus the columns listed above.
# NOTE(review): this selects from all of data_2019, not from
# existing_customers_2019 -- confirm that is intended.
Attrited_customers = (
    data_2019[data_2019['Attrition_Flag'] == 1]
    .drop(columns=columns_to_discard)
)
Attrited_customers.info()
<class 'pandas.core.frame.DataFrame'> Index: 945 entries, 22 to 19935 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 945 non-null int64 1 Gender 945 non-null int64 2 Income_Category 945 non-null int64 3 Card_Category 945 non-null int64 4 Credit_Limit 945 non-null float64 5 Total_Revolving_Bal 945 non-null int64 6 Avg_Open_To_Buy 945 non-null float64 7 Avg_Utilization_Ratio 945 non-null float64 dtypes: float64(3), int64(5) memory usage: 66.4 KB
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Simple linear regression: Avg_Utilization_Ratio ~ Total_Revolving_Bal.
# Both variables are kept as single-column DataFrames.
Total_Revolving_Bal = pd.DataFrame(Attrited_customers['Total_Revolving_Bal'])      # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)

# Split the Dataset into Train and Test.
# A fixed random_state makes the split (and therefore the printed metrics)
# repeatable from run to run and comparable with other models that use the
# same seed on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    Total_Revolving_Bal, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Linear Regression using Train Data
linreg = LinearRegression()      # create the linear regression object
linreg.fit(X_train, y_train)     # train the linear regression model

# Coefficients of the Linear Regression line
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Predict the response for both the train and the test predictors
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()

# Plot the Predictions vs the True values.
# The straight line drawn through (y, y) is the "perfect prediction" reference.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color="blue")
axes[0].plot(y_train, y_train, 'b', linewidth=1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color="green")
axes[1].plot(y_test, y_test, 'b', linewidth=1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()
Intercept of Regression : b = [0.02162967] Coefficients of Regression : a = [[0.00023298]] Goodness of Fit of Model Train Dataset Explained Variance (R^2) : 0.23609549399620822 Mean Squared Error (MSE) : 0.024699462444787504 Goodness of Fit of Model Test Dataset Explained Variance (R^2) : 0.26504101111698075 Mean Squared Error (MSE) : 0.021625155158817442
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Simple linear regression: Avg_Utilization_Ratio ~ Avg_Open_To_Buy.
# Both variables are kept as single-column DataFrames.
Avg_Open_To_Buy = pd.DataFrame(Attrited_customers['Avg_Open_To_Buy'])              # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)

# Split the Dataset into Train and Test.
# A fixed random_state makes the split (and therefore the printed metrics)
# repeatable from run to run and comparable with other models that use the
# same seed on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    Avg_Open_To_Buy, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Linear Regression using Train Data
linreg = LinearRegression()      # create the linear regression object
linreg.fit(X_train, y_train)     # train the linear regression model

# Coefficients of the Linear Regression line
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Predict the response for both the train and the test predictors
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()

# Plot the Predictions vs the True values.
# The straight line drawn through (y, y) is the "perfect prediction" reference.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color="blue")
axes[0].plot(y_train, y_train, 'b', linewidth=1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color="green")
axes[1].plot(y_test, y_test, 'b', linewidth=1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()
Intercept of Regression : b = [0.30571807] Coefficients of Regression : a = [[-1.25929131e-05]] Goodness of Fit of Model Train Dataset Explained Variance (R^2) : 0.34616225654133825 Mean Squared Error (MSE) : 0.02102805583561062 Goodness of Fit of Model Test Dataset Explained Variance (R^2) : 0.31723370603164236 Mean Squared Error (MSE) : 0.020322085519485857
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Simple linear regression: Avg_Utilization_Ratio ~ Credit_Limit.
# Both variables are kept as single-column DataFrames.
Credit_Limit = pd.DataFrame(Attrited_customers['Credit_Limit'])                    # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)

# Split the Dataset into Train and Test.
# A fixed random_state makes the split (and therefore the printed metrics)
# repeatable from run to run and comparable with other models that use the
# same seed on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    Credit_Limit, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Linear Regression using Train Data
linreg = LinearRegression()      # create the linear regression object
linreg.fit(X_train, y_train)     # train the linear regression model

# Coefficients of the Linear Regression line
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Predict the response for both the train and the test predictors
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()

# Plot the Predictions vs the True values.
# The straight line drawn through (y, y) is the "perfect prediction" reference.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color="blue")
axes[0].plot(y_train, y_train, 'b', linewidth=1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color="green")
axes[1].plot(y_test, y_test, 'b', linewidth=1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()
Intercept of Regression : b = [0.31886092] Coefficients of Regression : a = [[-1.25557998e-05]] Goodness of Fit of Model Train Dataset Explained Variance (R^2) : 0.3660412485094191 Mean Squared Error (MSE) : 0.02015456433086732 Goodness of Fit of Model Test Dataset Explained Variance (R^2) : 0.322504949330932 Mean Squared Error (MSE) : 0.021175292413368064
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Polynomial regression: Avg_Utilization_Ratio ~ poly(Avg_Open_To_Buy).
# Both variables are kept as single-column DataFrames.
Avg_Open_To_Buy = pd.DataFrame(Attrited_customers['Avg_Open_To_Buy'])              # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)

# Split the Dataset into Train and Test.
# A fixed random_state makes the split (and therefore the printed metrics)
# repeatable from run to run and comparable with other models that use the
# same seed on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    Avg_Open_To_Buy, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Transform the data to include polynomial terms (try a higher degree for
# more curve)
degree = 4  # Adjust as needed for the best fit
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Polynomial Regression using Train Data
linreg = LinearRegression()
linreg.fit(X_train_poly, y_train)

# Predict the response from the expanded Avg_Open_To_Buy features
y_train_pred = linreg.predict(X_train_poly)
y_test_pred = linreg.predict(X_test_poly)

# Check the Goodness of Fit (on Train and Test Data)
print("Goodness of Fit of Model (Train)")
print("R^2:", linreg.score(X_train_poly, y_train))
print("MSE:", mean_squared_error(y_train, y_train_pred))
print()
print("Goodness of Fit of Model (Test)")
print("R^2:", linreg.score(X_test_poly, y_test))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print()

# Sort the test predictor so the fitted curve is drawn left to right.
# Sorting the DataFrame (instead of np.sort, which returns a bare ndarray)
# keeps the column name, so poly.transform does not emit the
# "X does not have valid feature names" UserWarning.
X_test_sorted = X_test.sort_values(by='Avg_Open_To_Buy')
X_test_poly_sorted = poly.transform(X_test_sorted)
y_test_pred_sorted = linreg.predict(X_test_poly_sorted)

# Plot the polynomial regression line with the original data
plt.figure(figsize=(12, 6))
plt.scatter(Avg_Open_To_Buy, Avg_Utilization_Ratio, color='blue', label="Original Data")
plt.plot(X_test_sorted.to_numpy(), y_test_pred_sorted, color='red',
         label=f"Polynomial Regression (degree {degree})")
plt.xlabel("Avg Open To Buy")
plt.ylabel("Avg Utilization Ratio")
plt.legend()
plt.show()
Goodness of Fit of Model (Train) R^2: 0.5120732473508691 MSE: 0.01576471690849705 Goodness of Fit of Model (Test) R^2: 0.5221168866301593 MSE: 0.014176062367685874
/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but PolynomialFeatures was fitted with feature names warnings.warn(
# Polynomial regression: Avg_Utilization_Ratio ~ poly(Credit_Limit).
# Both variables are kept as single-column DataFrames.
Credit_Limit = pd.DataFrame(Attrited_customers['Credit_Limit'])                    # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)

# Split the Dataset into Train and Test.
# A fixed random_state makes the split (and therefore the printed metrics)
# repeatable from run to run and comparable with other models that use the
# same seed on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    Credit_Limit, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Transform the data to include polynomial terms (try a higher degree for
# more curve)
degree = 4  # Adjust as needed for the best fit
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Polynomial Regression using Train Data
linreg = LinearRegression()
linreg.fit(X_train_poly, y_train)

# Predict the response from the expanded Credit_Limit features
y_train_pred = linreg.predict(X_train_poly)
y_test_pred = linreg.predict(X_test_poly)

# Check the Goodness of Fit (on Train and Test Data)
print("Goodness of Fit of Model (Train)")
print("R^2:", linreg.score(X_train_poly, y_train))
print("MSE:", mean_squared_error(y_train, y_train_pred))
print()
print("Goodness of Fit of Model (Test)")
print("R^2:", linreg.score(X_test_poly, y_test))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print()

# Sort the test predictor so the fitted curve is drawn left to right.
# Sorting the DataFrame (instead of np.sort, which returns a bare ndarray)
# keeps the column name, so poly.transform does not emit the
# "X does not have valid feature names" UserWarning.
X_test_sorted = X_test.sort_values(by='Credit_Limit')
X_test_poly_sorted = poly.transform(X_test_sorted)
y_test_pred_sorted = linreg.predict(X_test_poly_sorted)

# Plot the polynomial regression line with the original data
plt.figure(figsize=(12, 6))
plt.scatter(Credit_Limit, Avg_Utilization_Ratio, color='blue', label="Original Data")
plt.plot(X_test_sorted.to_numpy(), y_test_pred_sorted, color='red',
         label=f"Polynomial Regression (degree {degree})")
plt.xlabel("Credit_Limit")
plt.ylabel("Avg Utilization Ratio")
plt.legend()
plt.show()
Goodness of Fit of Model (Train) R^2: 0.5664850144467057 MSE: 0.013820477786514745 Goodness of Fit of Model (Test) R^2: 0.6381425414506026 MSE: 0.011231923059718292
/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but PolynomialFeatures was fitted with feature names warnings.warn(
# Polynomial regression: Avg_Utilization_Ratio ~ poly(Total_Revolving_Bal).
# Both variables are kept as single-column DataFrames.
Total_Revolving_Bal = pd.DataFrame(Attrited_customers['Total_Revolving_Bal'])      # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)

# Split the Dataset into Train and Test.
# A fixed random_state makes the split (and therefore the printed metrics)
# repeatable from run to run and comparable with other models that use the
# same seed on the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    Total_Revolving_Bal, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)

# Transform the data to include polynomial terms (try a higher degree for
# more curve)
degree = 3  # Adjust as needed for the best fit
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Polynomial Regression using Train Data
linreg = LinearRegression()
linreg.fit(X_train_poly, y_train)

# Predict the response from the expanded Total_Revolving_Bal features
y_train_pred = linreg.predict(X_train_poly)
y_test_pred = linreg.predict(X_test_poly)

# Check the Goodness of Fit (on Train and Test Data)
print("Goodness of Fit of Model (Train)")
print("R^2:", linreg.score(X_train_poly, y_train))
print("MSE:", mean_squared_error(y_train, y_train_pred))
print()
print("Goodness of Fit of Model (Test)")
print("R^2:", linreg.score(X_test_poly, y_test))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print()

# Sort the test predictor so the fitted curve is drawn left to right.
# Sorting the DataFrame (instead of np.sort, which returns a bare ndarray)
# keeps the column name, so poly.transform does not emit the
# "X does not have valid feature names" UserWarning.
X_test_sorted = X_test.sort_values(by='Total_Revolving_Bal')
X_test_poly_sorted = poly.transform(X_test_sorted)
y_test_pred_sorted = linreg.predict(X_test_poly_sorted)

# Plot the polynomial regression line with the original data
plt.figure(figsize=(12, 6))
plt.scatter(Total_Revolving_Bal, Avg_Utilization_Ratio, color='blue', label="Original Data")
plt.plot(X_test_sorted.to_numpy(), y_test_pred_sorted, color='red',
         label=f"Polynomial Regression (degree {degree})")
plt.xlabel("Total_Revolving_Bal")
plt.ylabel("Avg Utilization Ratio")
plt.legend()
plt.show()
Goodness of Fit of Model (Train) R^2: 0.2558441943953035 MSE: 0.02441768749007164 Goodness of Fit of Model (Test) R^2: 0.19775500829824022 MSE: 0.02222459861821506
/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but PolynomialFeatures was fitted with feature names warnings.warn(