Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
from statistics import mean, stdev, median
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import plot_tree
from scipy.stats import ttest_ind
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Einlesen der Datei
file = pd.read_csv('carbon_sink_and_costs_for_dryad.csv', sep=',', decimal='.' , header = 0, names=['Year', 'Shrub planting area', 'Carbon sequestration', 'Planting costs per ha','NPV planting costs per ha','Costs of planting'])
print(file.head(8))
print()

# Datenvorverarbeitung

# Überprüfung auf Leerzellen
print(file.isnull().sum())

# Identifizieren von Ausreißern mit Boxplot

# Ausreißer der Spalte 'Shrub Planting area'

col_data = file['Shrub planting area']

plt.boxplot(col_data, showfliers=True)
plt.title("Boxplot für 'Shrub planting area'")
plt.ylabel('Shrub planting area')
plt.show()

# IQR-Methode
q1 = np.percentile(col_data, 25)
q3 = np.percentile(col_data, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
print(f"Ausreißer in Spalte 'Shrub planting area':\n{outliers}\n")

# Ausreißer Spalte 'Carbon sequestration'

col_data = file['Carbon sequestration']

plt.boxplot(col_data, showfliers=True)
plt.title("Boxplot für 'Carbon sequestration'")
plt.ylabel('Carbon sequestration')
plt.show()

# IQR-Methode
q1 = np.percentile(col_data, 25)
q3 = np.percentile(col_data, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
print(f"Ausreißer in Spalte 'Carbon sequestration':\n{outliers}\n")

# Ausreißer der Spalte 'Planting costs per ha'

col_data = file['Planting costs per ha']

plt.boxplot(col_data, showfliers=True)
plt.title("Boxplot für 'Planting costs per ha'")
plt.ylabel('Planting costs per ha')
plt.show()

# IQR-Methode
q1 = np.percentile(col_data, 25)
q3 = np.percentile(col_data, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
print(f"Ausreißer in Spalte 'Planting costs per ha':\n{outliers}\n")

# Ausreißer der Spalte 'NPV planting costs per year'

col_data = file['NPV planting costs per ha']

plt.boxplot(col_data, showfliers=True)
plt.title("Boxplot für 'NPV planting costs per ha'")
plt.ylabel('NPV planting costs per ha')
plt.show()

# IQR-Methode
q1 = np.percentile(col_data, 25)
q3 = np.percentile(col_data, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
print(f"Ausreißer in Spalte 'NPV planting costs per ha':\n{outliers}\n")


# Ausreißer der Spalte 'Costs of planting'

col_data = file['Costs of planting']

plt.boxplot(col_data, showfliers=True)
plt.title("Boxplot für 'Costs of planting'")
plt.ylabel('Costs of planting')
plt.show()

q1 = np.percentile(col_data, 25)
q3 = np.percentile(col_data, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
print(f"Ausreißer in Spalte 'Costs of planting':\n{outliers}\n")

# Histogramm zur Kohlenstoffspeicherung

plt.title("Histogramm Kohlenstoffspeicherung")
plt.xlabel("Gespeicherter Kohlenstoff")
plt.hist(file["Carbon sequestration"],bins=[0, 10000, 30000, 70000, 120000,180000],
color="blue", edgecolor ="black")
plt.show()

# Standardabweichungen der Spalten bestimmen

# Standardabweichung der Spalte 'Shrub Planting area'
mylist =[5441.85, 2146.43,603.80,433.33,166.66,210.33,273.33,1192.00 ]
print("Standardabweichung Shrub Planting Area:", stdev(mylist))

# Standardabweichung der Spalte 'Carbon sequestration'
mylist1=[170658.48,66830.65,12227.54,7242.36,2473.39,4898.12,2221.76,10010.91]
print("Standardabweichung Carbon Sequestration:", stdev(mylist1))

# Standardabweichung der Spalte 'Planting costs per ha'
mylist2=[1500.00,1500.00,150.00,150.00,150.00,2040.00,1800.00,1800.00]
print("Standardabweichung Planting costs per ha:",stdev(mylist2))

# Standardabweichung der Spalte 'NPV planting costs per ha'
mylist3=[4079.44,3777.26,3497.46,3238.39,2998.51,3775.90,2856.37,2644.79]
print("Standardabweichung NPV planting costs per ha:", stdev(mylist3))

# Standardabweichung der Spalte 'Costs of Planting'
mylist4=[22199676.56,8107613.83,2111765.44,1403290.45,499731.17,794184.55,780732.65,3152590.32]
print("Standardabweichung Costs of Planting:", stdev(mylist4))

# Mittelwerte der Spalten bestimmen

# Mittelwert der Spalte 'Shrub planting area'
print("Mittelwert Shrub planting area:",mean(mylist))

# Mittelwert der Spalte 'Carbon sequestration'
print("Mittelwert Carbon sequestration:", mean(mylist1))

# Mittelwert der Spalte 'Planting costs per ha'
print("Mittelwert Planting costs per ha:", mean(mylist2))

# Mittelwert der Spalte 'NPV planting costs per ha'
print("Mittelwert NPV planting costs per ha:", mean(mylist3))

# Mittelwert der Spalte 'Costs of planting'
print("Mittelwert Costs of planting:", mean(mylist4))

# Skalieren der Merkmale

x= file[['Year','Shrub planting area','Carbon sequestration','Planting costs per ha','NPV planting costs per ha','Costs of planting']].values
scaled= preprocessing.MinMaxScaler()
x_scaled= scaled.fit_transform(x)
file1=pd.DataFrame(x_scaled,columns=['Year','Shrub planting area','Carbon sequestration','Planting costs per ha','NPV planting costs per ha','Costs of planting'])
print(file1.head(8))

# Korrelationsanalyse nach Pearson

print(file.corr().round(2))
# Visualisierung in Punktwolken
sns.pairplot(file)
plt.show()
print()

# Visualisierung in einer Heatmap
plt.title("Korrelation der Merkmale - Heatmap")
sns.heatmap(file.corr(), annot = True,  cmap="coolwarm")
plt.show()

# Lineare Regression

y = file['Costs of planting']  # Unabhängige Variable
X = file.drop('Carbon sequestration', axis=1)   # Zielgröße

x=file[["Shrub planting area"]].values
y=file[["Costs of planting"]].values

print(X.head(6))
print()
print(y.head(6))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
random_state=0)
print(X_train.head(6))

model = LinearRegression().fit(X_train,y_train)
print("Y-Achsenabschnitt:", model.intercept_)
print("Steigung:", model.coef_[0])

# R² bestimmen

# Datensatz in Trainings- und Testdaten aufteilen
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Lineares Regressionsmodell erstellen und trainieren
model = LinearRegression().fit(X_train, y_train)

# Vorhersagen für Training- und Testdaten
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Bestimmtheitsmaß (R^2) berechnen
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

# Ausgabe der R²-Werte
print(f"R² (Trainingsdaten): {r2_train}")
print(f"R² (Testdaten): {r2_test}")

# Scatterplot

plt.scatter(file['Shrub planting area'], file['Costs of planting'])
plt.ylabel('Costs of planting') #Abhängige Variable
plt.xlabel('Shrub planting area') #Unabhängige Variable
plt.title('Scatterplot zu Shrub planting area & Costs of planting')
plt.show()

# Scatterplot mit Regressionsgerade

x=file[["Shrub planting area"]].values
y=file[["Costs of planting"]].values
model=LinearRegression().fit(x,y)
plt.plot(x,y,'go') #grüner Punkt
plt.title("Scatterplot mit Regressionsgerade")
plt.xlabel("Shrub planting area")
plt.ylabel("Costs of planting")
b=model.intercept_[0]
m=model.coef_[0]
print(b)
a=np.linspace(0, 6000)
plt.plot(a, m*a+b)
plt.show()

# Lineares Regressionsmodell erstellen und trainieren
model = LinearRegression().fit(x, y)

b = model.intercept_[0]  # Achsenabschnitt
m = model.coef_[0][0]    # Steigung

# Werte ausgeben
print(f"Achsenabschnitt (b): {b}")
print(f"Steigung (m): {m}")

# Regression mit K-Nearest Neighbor

# Durchführung mit k=3

# Definieren der Features (X) und Zielvariable (y)
X = file[["Shrub planting area", "Planting costs per ha", "NPV planting costs
per ha", "Costs of planting"]]
y = file["Carbon sequestration"]

# Aufteilen der Daten in Trainings- und Test-Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Skalieren der Merkmale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

k-Nearest Neighbor Modell initialisieren mit k=3
knn = KNeighborsRegressor(n_neighbors=3)

# Modell trainieren
knn.fit(X_train_scaled, y_train)

# Vorhersage auf dem Testdatensatz
y_pred = knn.predict(X_test_scaled)

# Vorhersagen und tatsächliche Werte in einem DataFrame anzeigen
predictions_df = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
print(predictions_df)

# Entscheidungsbaum

# Features (X) und Zielvariable (y)
X = file[["Shrub planting area", "Planting costs per ha", "NPV planting costs per ha", "Costs of planting"]]
y = file["Carbon sequestration"]

# Daten in Trainings- und Testsets aufteilen
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
random_state=42)

# Entscheidungsbaum für Regression erstellen
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, y_train)

# Vorhersagen machen
y_pred = tree.predict(X_test)

# Modell bewerten
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R²-Wert: {r2:.2f}")

# Darstellung des Baums
plt.figure(figsize=(12, 8))
plot_tree(tree, feature_names=X.columns, filled=True, rounded=True)
plt.title("Entscheidungsbaum für 'Carbon sequestration'")
plt.show()

# T-test zur Optimierung der Hypothesen

# FÜR HYPOTHESE 1
# Gruppen für Pflanzenfläche machen
small_plants = file[file['Shrub planting area'] <= 1000]['Carbon sequestration']
large_plants = file[file['Shrub planting area'] > 1000]['Carbon sequestration']

# t-Test durchführen
t_stat, p_value = ttest_ind(small_plants, large_plants)

# Hypothesen testen
alpha = 0.05  # Signifikanzniveau

# Ergebnisse ausgeben
print(f"T-Statistik: {t_stat}")
print(f"P-Wert: {p_value}")

if p_value < alpha:
    print("Ergebnis: Die Nullhypothese wird verworfen. Es gibt einen signifikanten Unterschied.")
 else:
    print("Ergebnis: Die Nullhypothese wird beibehalten. Kein signifikanter Unterschied.")
    
    
# FÜR HYPOTHESE 2
# Gruppen für Pflanzenflächen vorgeben
small_area = file[file['Shrub planting area'] <= 1000]['Costs of planting']
large_area = file[file['Shrub planting area'] > 1000]['Costs of planting']

# t-Test durchführen
 t_stat, p_value = ttest_ind(small_area, large_area)

# Ergebnisse ausgeben
print(f"T-Statistik: {t_stat}")
print(f"P-Wert: {p_value}")

# Hypothesen testen
 alpha = 0.05  # Signifikanzniveau
 if p_value < alpha:
    print("Ergebnis: Die Nullhypothese wird verworfen. Es gibt einen signifikanten Unterschied.")
 else:
    print("Ergebnis: Die Nullhypothese wird beibehalten. Es gibt keinen signifikanten Unterschied.")
    
# Kreuzvalidierung

# Zielvariable (y) und Features (X) definieren
X = file.drop(columns=["Carbon sequestration", "Year"])
y = file["Carbon sequestration"]

# Standardisierung und Lineare Regression in Pipeline
model = make_pipeline(StandardScaler(), LinearRegression())

# Validierung mit 2-Fold
cv_scores = cross_val_score(model, X, y, cv=2, scoring='r2')

# Ergebnisse anzeigen
print("Kreuzvalidierungs-R²-Werte:", cv_scores)
print("Mittleres R²:", cv_scores.mean())
Ausreißer in Spalte 'Planting costs per ha':
Series([], Name: Planting costs per ha, dtype: float64)

Ausreißer in Spalte 'NPV planting costs per ha':
Series([], Name: NPV planting costs per ha, dtype: float64)

Standardabweichung Shrub Planting Area: 1798.577161083072
Standardabweichung Carbon Sequestration: 58999.97544989597
Standardabweichung Planting costs per ha: 835.0010692893068
Standardabweichung NPV planting costs per ha: 506.2724808694566
Standardabweichung Costs of Planting: 7423975.518478396
Mittelwert Shrub planting area: 1308.46625
Mittelwert Carbon sequestration: 34570.40125
Mittelwert Planting costs per ha: 1136.25
Mittelwert NPV planting costs per ha: 3358.515
Mittelwert Costs of planting: 4881198.12125
R² (Trainingsdaten): 0.9950268862338233
R² (Testdaten): 0.9887217391586005
Kreuzvalidierungs-R²-Werte: [  0.34542497 -40.13961866]
Mittleres R²: -19.897096842870184
show me the code for finding outliers
To find outliers in the dataset, you can use the Interquartile Range (IQR) method. Here's the code to identify outliers for the 'Shrub planting area' column:
why are you not showing the boxplots for my code?
The boxplots are displayed using `plt.show()` in your code, which should render them in a graphical window. If they are not appearing, ensure that your environment supports graphical output (e.g., Jupyter Notebook, Python script with GUI support). If you're running in a non-GUI environment, consider saving the plots to files instead. Here's how to save the boxplots:
Python
# Save boxplot for 'Shrub planting area'
plt.figure()
plt.boxplot(file['Shrub planting area'], showfliers=True)
plt.title("Boxplot für 'Shrub planting area'")
plt.ylabel('Shrub planting area')
plt.savefig('boxplot_shrub_planting_area.png')
plt.close()

# Repeat for other columns as needed
line 3, in <module>
    plt.boxplot(file['Shrub planting area'], showfliers=True)
                ~~~~^^^^^^^^^^^^^^^^^^^^^^^
TypeError: '_io.BufferedWriter' object is not subscriptable
This code saves the boxplot as an image file instead of displaying it. Adjust the filename for each plot accordingly.