Dask

Parallel computing and large-dataset handling using Dask.

Python example:
# Parallel computing and large dataset handling using Dask

import dask.dataframe as dd
import numpy as np
import pandas as pd

# Generate reproducible sample data: 10,000 hourly readings (~417 days).
# A seeded Generator makes the demo deterministic across runs.
rng = np.random.default_rng(42)
dates = pd.date_range(start="2023-01-01", periods=10000, freq="h")
temperature = 20 + 10 * rng.standard_normal(len(dates))  # mean 20, std 10
precipitation = rng.random(len(dates))                   # uniform [0, 1)

# Build an in-memory pandas DataFrame first.
df = pd.DataFrame({
    "date": dates,
    "temperature": temperature,
    "precipitation": precipitation,
})

# Partition it for Dask; 10 partitions lets the scheduler parallelize
# the groupby across threads/workers.
ddf = dd.from_pandas(df, npartitions=10)

# Truncate timestamps to calendar dates so the groupby buckets by day.
ddf["date"] = ddf["date"].dt.date

# .compute() materializes the lazy task graph into a pandas Series.
mean_temp_per_day = ddf.groupby("date").temperature.mean().compute()
print("Mean Temperature Per Day:")
# Fix: the original left `.head()` as a bare expression, which only
# displays output in a REPL/notebook — a script must print it explicitly.
print(mean_temp_per_day.head())
Example output (exact values depend on the randomly generated data):

Mean Temperature Per Day:
date
2023-01-01    19.288843
2023-01-02    16.317349
2023-01-03    18.172259
2023-01-04    23.717603
2023-01-05    17.301904
Name: temperature, dtype: float64