import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
from statistics import mean, stdev, median
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import plot_tree
from scipy.stats import ttest_ind
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
# Einlesen der Datei
file = pd.read_csv('carbon_sink_and_costs_for_dryad.csv', sep=',', decimal='.' , header = 0, names=['Year', 'Shrub planting area', 'Carbon sequestration', 'Planting costs per ha','NPV planting costs per ha','Costs of planting'])
print(file.head(8))
print()
# Datenvorverarbeitung
# Überprüfung auf Leerzellen
print(file.isnull().sum())
# Identifizieren von Ausreißern mit Boxplot
# Ausreißer der Spalte 'Shrub Planting area'
col_data = file['Shrub planting area']
plt.boxplot(col_data, showfliers=True)
plt.title("Boxplot für 'Shrub planting area'")
plt.ylabel('Shrub planting area')
plt.show()
# IQR-Methode
q1 = np.percentile(col_data, 25)
q3 = np.percentile(col_data, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
print(f"Ausreißer in Spalte 'Shrub planting area':\n{outliers}\n")
# Ausreißer Spalte 'Carbon sequestration'
col_data = file['Carbon sequestration']
plt.boxplot(col_data, showfliers=True)
plt.title("Boxplot für 'Carbon sequestration'")
plt.ylabel('Carbon sequestration')
plt.show()
# IQR-Methode
q1 = np.percentile(col_data, 25)
q3 = np.percentile(col_data, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
print(f"Ausreißer in Spalte 'Carbon sequestration':\n{outliers}\n")
# Ausreißer der Spalte 'Planting costs per ha'
col_data = file['Planting costs per ha']
plt.boxplot(col_data, showfliers=True)
plt.title("Boxplot für 'Planting costs per ha'")
plt.ylabel('Planting costs per ha')
plt.show()
# IQR-Methode
q1 = np.percentile(col_data, 25)
q3 = np.percentile(col_data, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
print(f"Ausreißer in Spalte 'Planting costs per ha':\n{outliers}\n")
# Ausreißer der Spalte 'NPV planting costs per year'
col_data = file['NPV planting costs per ha']
plt.boxplot(col_data, showfliers=True)
plt.title("Boxplot für 'NPV planting costs per ha'")
plt.ylabel('NPV planting costs per ha')
plt.show()
# IQR-Methode
q1 = np.percentile(col_data, 25)
q3 = np.percentile(col_data, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
print(f"Ausreißer in Spalte 'NPV planting costs per ha':\n{outliers}\n")
# Ausreißer der Spalte 'Costs of planting'
col_data = file['Costs of planting']
plt.boxplot(col_data, showfliers=True)
plt.title("Boxplot für 'Costs of planting'")
plt.ylabel('Costs of planting')
plt.show()
q1 = np.percentile(col_data, 25)
q3 = np.percentile(col_data, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
print(f"Ausreißer in Spalte 'Costs of planting':\n{outliers}\n")
# Histogramm zur Kohlenstoffspeicherung
plt.title("Histogramm Kohlenstoffspeicherung")
plt.xlabel("Gespeicherter Kohlenstoff")
plt.hist(file["Carbon sequestration"],bins=[0, 10000, 30000, 70000, 120000,180000],
color="blue", edgecolor ="black")
plt.show()
# Standardabweichungen der Spalten bestimmen
# Standardabweichung der Spalte 'Shrub Planting area'
mylist =[5441.85, 2146.43,603.80,433.33,166.66,210.33,273.33,1192.00 ]
print("Standardabweichung Shrub Planting Area:", stdev(mylist))
# Standardabweichung der Spalte 'Carbon sequestration'
mylist1=[170658.48,66830.65,12227.54,7242.36,2473.39,4898.12,2221.76,10010.91]
print("Standardabweichung Carbon Sequestration:", stdev(mylist1))
# Standardabweichung der Spalte 'Planting costs per ha'
mylist2=[1500.00,1500.00,150.00,150.00,150.00,2040.00,1800.00,1800.00]
print("Standardabweichung Planting costs per ha:",stdev(mylist2))
# Standardabweichung der Spalte 'NPV planting costs per ha'
mylist3=[4079.44,3777.26,3497.46,3238.39,2998.51,3775.90,2856.37,2644.79]
print("Standardabweichung NPV planting costs per ha:", stdev(mylist3))
# Standardabweichung der Spalte 'Costs of Planting'
mylist4=[22199676.56,8107613.83,2111765.44,1403290.45,499731.17,794184.55,780732.65,3152590.32]
print("Standardabweichung Costs of Planting:", stdev(mylist4))
# Mittelwerte der Spalten bestimmen
# Mittelwert der Spalte 'Shrub planting area'
print("Mittelwert Shrub planting area:",mean(mylist))
# Mittelwert der Spalte 'Carbon sequestration'
print("Mittelwert Carbon sequestration:", mean(mylist1))
# Mittelwert der Spalte 'Planting costs per ha'
print("Mittelwert Planting costs per ha:", mean(mylist2))
# Mittelwert der Spalte 'NPV planting costs per ha'
print("Mittelwert NPV planting costs per ha:", mean(mylist3))
# Mittelwert der Spalte 'Costs of planting'
print("Mittelwert Costs of planting:", mean(mylist4))
# Skalieren der Merkmale
x= file[['Year','Shrub planting area','Carbon sequestration','Planting costs per ha','NPV planting costs per ha','Costs of planting']].values
scaled= preprocessing.MinMaxScaler()
x_scaled= scaled.fit_transform(x)
file1=pd.DataFrame(x_scaled,columns=['Year','Shrub planting area','Carbon sequestration','Planting costs per ha','NPV planting costs per ha','Costs of planting'])
print(file1.head(8))
# Korrelationsanalyse nach Pearson
print(file.corr().round(2))
# Visualisierung in Punktwolken
sns.pairplot(file)
plt.show()
print()
# Visualisierung in einer Heatmap
plt.title("Korrelation der Merkmale - Heatmap")
sns.heatmap(file.corr(), annot = True, cmap="coolwarm")
plt.show()
# Lineare Regression
y = file['Costs of planting'] # Unabhängige Variable
X = file.drop('Carbon sequestration', axis=1) # Zielgröße
x=file[["Shrub planting area"]].values
y=file[["Costs of planting"]].values
print(X.head(6))
print()
print(y.head(6))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
random_state=0)
print(X_train.head(6))
model = LinearRegression().fit(X_train,y_train)
print("Y-Achsenabschnitt:", model.intercept_)
print("Steigung:", model.coef_[0])
# R² bestimmen
# Datensatz in Trainings- und Testdaten aufteilen
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Lineares Regressionsmodell erstellen und trainieren
model = LinearRegression().fit(X_train, y_train)
# Vorhersagen für Training- und Testdaten
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
# Bestimmtheitsmaß (R^2) berechnen
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
# Ausgabe der R²-Werte
print(f"R² (Trainingsdaten): {r2_train}")
print(f"R² (Testdaten): {r2_test}")
# Scatterplot
plt.scatter(file['Shrub planting area'], file['Costs of planting'])
plt.ylabel('Costs of planting') #Abhängige Variable
plt.xlabel('Shrub planting area') #Unabhängige Variable
plt.title('Scatterplot zu Shrub planting area & Costs of planting')
plt.show()
# Scatterplot mit Regressionsgerade
x=file[["Shrub planting area"]].values
y=file[["Costs of planting"]].values
model=LinearRegression().fit(x,y)
plt.plot(x,y,'go') #grüner Punkt
plt.title("Scatterplot mit Regressionsgerade")
plt.xlabel("Shrub planting area")
plt.ylabel("Costs of planting")
b=model.intercept_[0]
m=model.coef_[0]
print(b)
a=np.linspace(0, 6000)
plt.plot(a, m*a+b)
plt.show()
# Lineares Regressionsmodell erstellen und trainieren
model = LinearRegression().fit(x, y)
b = model.intercept_[0] # Achsenabschnitt
m = model.coef_[0][0] # Steigung
# Werte ausgeben
print(f"Achsenabschnitt (b): {b}")
print(f"Steigung (m): {m}")
# Regression mit K-Nearest Neighbor
# Durchführung mit k=3
# Definieren der Features (X) und Zielvariable (y)
X = file[["Shrub planting area", "Planting costs per ha", "NPV planting costs
per ha", "Costs of planting"]]
y = file["Carbon sequestration"]
# Aufteilen der Daten in Trainings- und Test-Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Skalieren der Merkmale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
k-Nearest Neighbor Modell initialisieren mit k=3
knn = KNeighborsRegressor(n_neighbors=3)
# Modell trainieren
knn.fit(X_train_scaled, y_train)
# Vorhersage auf dem Testdatensatz
y_pred = knn.predict(X_test_scaled)
# Vorhersagen und tatsächliche Werte in einem DataFrame anzeigen
predictions_df = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
print(predictions_df)
# Entscheidungsbaum
# Features (X) und Zielvariable (y)
X = file[["Shrub planting area", "Planting costs per ha", "NPV planting costs per ha", "Costs of planting"]]
y = file["Carbon sequestration"]
# Daten in Trainings- und Testsets aufteilen
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
random_state=42)
# Entscheidungsbaum für Regression erstellen
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)
# Vorhersagen machen
y_pred = tree.predict(X_test)
# Modell bewerten
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R²-Wert: {r2:.2f}")
# Darstellung des Baums
plt.figure(figsize=(12, 8))
plot_tree(tree, feature_names=X.columns, filled=True, rounded=True)
plt.title("Entscheidungsbaum für 'Carbon sequestration'")
plt.show()
# T-test zur Optimierung der Hypothesen
# FÜR HYPOTHESE 1
# Gruppen für Pflanzenfläche machen
small_plants = file[file['Shrub planting area'] <= 1000]['Carbon sequestration']
large_plants = file[file['Shrub planting area'] > 1000]['Carbon sequestration']
# t-Test durchführen
t_stat, p_value = ttest_ind(small_plants, large_plants)
# Hypothesen testen
alpha = 0.05 # Signifikanzniveau
# Ergebnisse ausgeben
print(f"T-Statistik: {t_stat}")
print(f"P-Wert: {p_value}")
if p_value < alpha:
print("Ergebnis: Die Nullhypothese wird verworfen. Es gibt einen signifikanten Unterschied.")
else:
print("Ergebnis: Die Nullhypothese wird beibehalten. Kein signifikanter Unterschied.")
# FÜR HYPOTHESE 2
# Gruppen für Pflanzenflächen vorgeben
small_area = file[file['Shrub planting area'] <= 1000]['Costs of planting']
large_area = file[file['Shrub planting area'] > 1000]['Costs of planting']
# t-Test durchführen
t_stat, p_value = ttest_ind(small_area, large_area)
# Ergebnisse ausgeben
print(f"T-Statistik: {t_stat}")
print(f"P-Wert: {p_value}")
# Hypothesen testen
alpha = 0.05 # Signifikanzniveau
if p_value < alpha:
print("Ergebnis: Die Nullhypothese wird verworfen. Es gibt einen signifikanten Unterschied.")
else:
print("Ergebnis: Die Nullhypothese wird beibehalten. Es gibt keinen signifikanten Unterschied.")
# Kreuzvalidierung
# Zielvariable (y) und Features (X) definieren
X = file.drop(columns=["Carbon sequestration", "Year"])
y = file["Carbon sequestration"]
# Standardisierung und Lineare Regression in Pipeline
model = make_pipeline(StandardScaler(), LinearRegression())
# Validierung mit 2-Fold
cv_scores = cross_val_score(model, X, y, cv=2, scoring='r2')
# Ergebnisse anzeigen
print("Kreuzvalidierungs-R²-Werte:", cv_scores)
print("Mittleres R²:", cv_scores.mean())