import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
# Read the raw customer records from the CSV file.
df = pd.read_csv("data.csv")

# Keep one frame per year of interest (2018 and 2019).
data_2018 = df.loc[df['Year'] == 2018]
data_2019 = df.loc[df['Year'] == 2019]
# The result of head() is not assigned or printed here; it is only a quick
# preview when the line is evaluated interactively.
data_2019.head()

# 2019 rows whose CLIENTNUM also appears among the 2018 rows.
existing_customers_2019 = data_2019.loc[
    data_2019['CLIENTNUM'].isin(data_2018['CLIENTNUM'])
].copy()

# 2019 rows flagged with Attrition_Flag == 1, minus the columns that are not
# used by the regressions further down.
Attrited_customers = data_2019.loc[data_2019['Attrition_Flag'] == 1]
Attrited_customers = Attrited_customers.drop(
    columns=[
        'Attrition_Flag',
        'Customer_Age',
        'Dependent_count',
        'Education_Level',
        'Marital_Status',
        'Months_on_book',
        'Total_Relationship_Count',
        'Months_Inactive_12_mon',
        'Contacts_Count_12_mon',
        'Total_Trans_Ct',
        'Quarter',
        'Year',
        'Date_Leave',
    ]
)
# Print a summary (columns, dtypes, non-null counts) of what is left.
Attrited_customers.info()
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Simple linear regression on the attrited customers.
# train_test_split receives (X, y) in that order, so Total_Revolving_Bal is the
# predictor and Avg_Utilization_Ratio is the response being modelled.
Total_Revolving_Bal = pd.DataFrame(Attrited_customers['Total_Revolving_Bal'])  # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)
# Split the Dataset into Train and Test.
# A fixed random_state makes the shuffle, and therefore the printed metrics and
# the plots below, identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    Total_Revolving_Bal, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)
# Linear Regression using Train Data
linreg = LinearRegression()   # create the linear regression object
linreg.fit(X_train, y_train)  # train the linear regression model
# Coefficients of the fitted line: y = a * X + b
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()
# Predict Avg_Utilization_Ratio from Total_Revolving_Bal on both splits
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)
# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()
# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()
# Plot the Predictions vs the True values.
# The line drawn through (y, y) is the diagonal a perfect model would sit on.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color="blue")
axes[0].plot(y_train, y_train, 'b', linewidth=1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color="green")
axes[1].plot(y_test, y_test, 'b', linewidth=1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Simple linear regression on the attrited customers.
# train_test_split receives (X, y) in that order, so Avg_Open_To_Buy is the
# predictor and Avg_Utilization_Ratio is the response being modelled.
Avg_Open_To_Buy = pd.DataFrame(Attrited_customers['Avg_Open_To_Buy'])  # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)
# Split the Dataset into Train and Test.
# A fixed random_state makes the shuffle, and therefore the printed metrics and
# the plots below, identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    Avg_Open_To_Buy, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)
# Linear Regression using Train Data
linreg = LinearRegression()   # create the linear regression object
linreg.fit(X_train, y_train)  # train the linear regression model
# Coefficients of the fitted line: y = a * X + b
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()
# Predict Avg_Utilization_Ratio from Avg_Open_To_Buy on both splits
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)
# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()
# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()
# Plot the Predictions vs the True values.
# The line drawn through (y, y) is the diagonal a perfect model would sit on.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color="blue")
axes[0].plot(y_train, y_train, 'b', linewidth=1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color="green")
axes[1].plot(y_test, y_test, 'b', linewidth=1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Simple linear regression on the attrited customers.
# train_test_split receives (X, y) in that order, so Credit_Limit is the
# predictor and Avg_Utilization_Ratio is the response being modelled.
Credit_Limit = pd.DataFrame(Attrited_customers['Credit_Limit'])  # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)
# Split the Dataset into Train and Test.
# A fixed random_state makes the shuffle, and therefore the printed metrics and
# the plots below, identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    Credit_Limit, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)
# Linear Regression using Train Data
linreg = LinearRegression()   # create the linear regression object
linreg.fit(X_train, y_train)  # train the linear regression model
# Coefficients of the fitted line: y = a * X + b
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()
# Predict Avg_Utilization_Ratio from Credit_Limit on both splits
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)
# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()
# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()
# Plot the Predictions vs the True values.
# The line drawn through (y, y) is the diagonal a perfect model would sit on.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color="blue")
axes[0].plot(y_train, y_train, 'b', linewidth=1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color="green")
axes[1].plot(y_test, y_test, 'b', linewidth=1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()
# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Polynomial regression on the attrited customers.
# train_test_split receives (X, y) in that order, so Avg_Open_To_Buy is the
# predictor and Avg_Utilization_Ratio is the response being modelled.
Avg_Open_To_Buy = pd.DataFrame(Attrited_customers['Avg_Open_To_Buy'])  # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)
# Split the Dataset into Train and Test.
# A fixed random_state makes the shuffle, and therefore the printed metrics and
# the plot below, identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    Avg_Open_To_Buy, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)
# Transform the data to include polynomial terms (try a higher degree for more
# curve)
degree = 4  # Adjust as needed for the best fit
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)  # learn the expansion on train only
X_test_poly = poly.transform(X_test)
# Polynomial Regression using Train Data
linreg = LinearRegression()
linreg.fit(X_train_poly, y_train)
# Predict Avg_Utilization_Ratio from the expanded Avg_Open_To_Buy features
y_train_pred = linreg.predict(X_train_poly)
y_test_pred = linreg.predict(X_test_poly)
# Check the Goodness of Fit (on Train and Test Data)
print("Goodness of Fit of Model (Train)")
print("R^2:", linreg.score(X_train_poly, y_train))
print("MSE:", mean_squared_error(y_train, y_train_pred))
print()
print("Goodness of Fit of Model (Test)")
print("R^2:", linreg.score(X_test_poly, y_test))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print()
# Sort the test predictor values so the fitted curve is drawn left to right.
# Sorting with pandas (instead of np.sort) keeps a DataFrame with its column
# name, so poly.transform receives the same feature names it was fitted on and
# scikit-learn does not emit its "X does not have valid feature names" warning.
X_test_sorted = X_test.sort_values(by=X_test.columns[0])
X_test_poly_sorted = poly.transform(X_test_sorted)
y_test_pred_sorted = linreg.predict(X_test_poly_sorted)
# Plot the polynomial regression line with the original data
plt.figure(figsize=(12, 6))
plt.scatter(Avg_Open_To_Buy, Avg_Utilization_Ratio, color='blue', label="Original Data")
plt.plot(X_test_sorted.to_numpy(), y_test_pred_sorted, color='red', label=f"Polynomial Regression (degree {degree})")
plt.xlabel("Avg Open To Buy")
plt.ylabel("Avg Utilization Ratio")
plt.legend()
plt.show()
# Polynomial regression on the attrited customers.
# train_test_split receives (X, y) in that order, so Credit_Limit is the
# predictor and Avg_Utilization_Ratio is the response being modelled.
Credit_Limit = pd.DataFrame(Attrited_customers['Credit_Limit'])  # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)
# Split the Dataset into Train and Test.
# A fixed random_state makes the shuffle, and therefore the printed metrics and
# the plot below, identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    Credit_Limit, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)
# Transform the data to include polynomial terms (try a higher degree for more
# curve)
degree = 4  # Adjust as needed for the best fit
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)  # learn the expansion on train only
X_test_poly = poly.transform(X_test)
# Polynomial Regression using Train Data
linreg = LinearRegression()
linreg.fit(X_train_poly, y_train)
# Predict Avg_Utilization_Ratio from the expanded Credit_Limit features
y_train_pred = linreg.predict(X_train_poly)
y_test_pred = linreg.predict(X_test_poly)
# Check the Goodness of Fit (on Train and Test Data)
print("Goodness of Fit of Model (Train)")
print("R^2:", linreg.score(X_train_poly, y_train))
print("MSE:", mean_squared_error(y_train, y_train_pred))
print()
print("Goodness of Fit of Model (Test)")
print("R^2:", linreg.score(X_test_poly, y_test))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print()
# Sort the test predictor values so the fitted curve is drawn left to right.
# Sorting with pandas (instead of np.sort) keeps a DataFrame with its column
# name, so poly.transform receives the same feature names it was fitted on and
# scikit-learn does not emit its "X does not have valid feature names" warning.
X_test_sorted = X_test.sort_values(by=X_test.columns[0])
X_test_poly_sorted = poly.transform(X_test_sorted)
y_test_pred_sorted = linreg.predict(X_test_poly_sorted)
# Plot the polynomial regression line with the original data
plt.figure(figsize=(12, 6))
plt.scatter(Credit_Limit, Avg_Utilization_Ratio, color='blue', label="Original Data")
plt.plot(X_test_sorted.to_numpy(), y_test_pred_sorted, color='red', label=f"Polynomial Regression (degree {degree})")
plt.xlabel("Credit_Limit")
plt.ylabel("Avg Utilization Ratio")
plt.legend()
plt.show()
# Polynomial regression on the attrited customers.
# train_test_split receives (X, y) in that order, so Total_Revolving_Bal is the
# predictor and Avg_Utilization_Ratio is the response being modelled.
Total_Revolving_Bal = pd.DataFrame(Attrited_customers['Total_Revolving_Bal'])  # Predictor (X)
Avg_Utilization_Ratio = pd.DataFrame(Attrited_customers['Avg_Utilization_Ratio'])  # Response (y)
# Split the Dataset into Train and Test.
# A fixed random_state makes the shuffle, and therefore the printed metrics and
# the plot below, identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    Total_Revolving_Bal, Avg_Utilization_Ratio, test_size=0.25, random_state=42
)
# Transform the data to include polynomial terms (try a higher degree for more
# curve)
degree = 3  # Adjust as needed for the best fit
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)  # learn the expansion on train only
X_test_poly = poly.transform(X_test)
# Polynomial Regression using Train Data
linreg = LinearRegression()
linreg.fit(X_train_poly, y_train)
# Predict Avg_Utilization_Ratio from the expanded Total_Revolving_Bal features
y_train_pred = linreg.predict(X_train_poly)
y_test_pred = linreg.predict(X_test_poly)
# Check the Goodness of Fit (on Train and Test Data)
print("Goodness of Fit of Model (Train)")
print("R^2:", linreg.score(X_train_poly, y_train))
print("MSE:", mean_squared_error(y_train, y_train_pred))
print()
print("Goodness of Fit of Model (Test)")
print("R^2:", linreg.score(X_test_poly, y_test))
print("MSE:", mean_squared_error(y_test, y_test_pred))
print()
# Sort the test predictor values so the fitted curve is drawn left to right.
# Sorting with pandas (instead of np.sort) keeps a DataFrame with its column
# name, so poly.transform receives the same feature names it was fitted on and
# scikit-learn does not emit its "X does not have valid feature names" warning.
X_test_sorted = X_test.sort_values(by=X_test.columns[0])
X_test_poly_sorted = poly.transform(X_test_sorted)
y_test_pred_sorted = linreg.predict(X_test_poly_sorted)
# Plot the polynomial regression line with the original data
plt.figure(figsize=(12, 6))
plt.scatter(Total_Revolving_Bal, Avg_Utilization_Ratio, color='blue', label="Original Data")
plt.plot(X_test_sorted.to_numpy(), y_test_pred_sorted, color='red', label=f"Polynomial Regression (degree {degree})")
plt.xlabel("Total_Revolving_Bal")
plt.ylabel("Avg Utilization Ratio")
plt.legend()
plt.show()