# Parallel computing and large dataset handling using Dask
import dask.dataframe as dd
import pandas as pd
import numpy as np
# Generate sample data: 10,000 hourly timestamps with random temperature
# and precipitation readings (unseeded, so values differ on every run).
dates = pd.date_range(start="2023-01-01", periods=10000, freq="h")
temperature = 20 + 10 * np.random.randn(len(dates))
precipitation = np.random.rand(len(dates))

# Create a Pandas DataFrame
df = pd.DataFrame(
    {
        "date": dates,
        "temperature": temperature,
        "precipitation": precipitation,
    }
)

# Convert the Pandas DataFrame to a Dask DataFrame split into 10 partitions
ddf = dd.from_pandas(df, npartitions=10)

# Calculate the mean temperature per day.
# Dropping the time component makes every hourly reading of a given day
# share the same group key.
ddf["date"] = ddf["date"].dt.date  # Convert to date only (without time)

# .compute() runs the lazy Dask graph and returns the concrete result.
mean_temp_per_day = ddf.groupby("date").temperature.mean().compute()

print("Mean Temperature Per Day:")
# A bare `mean_temp_per_day.head()` only displays in a notebook cell; when run
# as a script its value is discarded, so print it explicitly.
print(mean_temp_per_day.head())
# Click Run or press Shift + Enter to run the code.