feat(add-data): Add script to generate customers from base data
Add `data_preparation/generate_customers.py`, a script that takes the `base_data.json` file generated by `get_base_data.py` and randomly samples a given number of customers. To simplify things, each customer is assigned exactly one gas and one electricity meter and each of them is read between 1 and 10 times. The full data including meters, meter readings and dates as well as customers and addresses is stored in a final JSON file named `customers.json`.
This commit is contained in:
222
data_preparation/generate_customers.py
Normal file
222
data_preparation/generate_customers.py
Normal file
@@ -0,0 +1,222 @@
|
||||
"""Module to randomly generate a JSON file with customer data for the Avacon app."""
|
||||
|
||||
import json
|
||||
import math
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
from config import DATA_DIR
|
||||
from data_utils import DateTimeEncoder
|
||||
|
||||
|
||||
def generate_readings(num_customers: int) -> List[Dict[str, Tuple[str, List[datetime], List[int]]]]:
    """Simulate gas and electricity meter readings for a number of customers.

    Every customer gets one gas and one electricity meter. Each meter starts at
    a random counter value and is then read between 1 and 10 times, roughly one
    year apart. Each reading adds a yearly consumption drawn from a Gaussian
    distribution (standard deviation of 10% of the mean) to the counter.

    Parameters
    ----------
    num_customers : int
        The number of customers for which to generate meter readings.

    Returns
    -------
    List[Dict[str, Tuple[str, List[datetime], List[int]]]]
        One dictionary per customer with the keys:
        - 'electricity': tuple of (meter number, reading dates, readings)
        - 'gas': tuple of (meter number, reading dates, readings)

    Notes
    -----
    Because every interval between readings carries up to +/-50 days of
    jitter, the latest reading dates are not guaranteed to lie in the past.
    """
    # NOTE: Real consumption depends on household size, building age, insulation, etc. Here we
    # only sample around a known average. For gas we assume an average flat (90 m²) consuming
    # 140 kWh/m² per year, i.e. 12600 kWh. Meters count volume, so the energy is converted to m³
    # via the calorific value of natural gas and the conversion factor ("Zustandszahl") that
    # accounts for slight pressure differences.
    yearly_gas_kwh = 12600  # in kWh
    kwh_per_cubic_metre = 11.215  # in kWh/m³
    state_number = 0.9692
    yearly_gas_volume = yearly_gas_kwh / (kwh_per_cubic_metre * state_number)

    # Electricity: average consumption of a 2-person household in Germany.
    yearly_electricity_kwh = 3500  # in kWh

    all_customers = []
    for _ in range(num_customers):
        # Starting counter values of the customer's two meters
        gas_counter = random.randint(1000, 60000)
        elt_counter = random.randint(1000, 600_000)

        # Avacon-style meter numbers (gas first, then electricity)
        gas_meter_id = generate_meter_number()
        elt_meter_id = generate_meter_number()

        reading_count = random.randint(1, 10)

        # One reading per year is assumed, so the first reading is placed as many years in the
        # past as there are readings.
        first_date = generate_past_date_with_variance(reading_count)

        gas_dates: List[datetime] = []
        elt_dates: List[datetime] = []
        gas_values = []
        elt_values = []

        for idx in range(reading_count):
            if idx == 0:
                current_gas_date = first_date
            else:
                # Roughly one year after the previous gas reading
                gap_days = 365 + random.randint(-50, 50)
                current_gas_date = gas_dates[-1] + timedelta(days=gap_days)

            # The electricity meter is read within a few days of the gas meter
            elt_offset = timedelta(days=random.randint(-10, 10))
            current_elt_date = current_gas_date + elt_offset

            # Advance both counters by one simulated year of consumption
            gas_counter += int(random.gauss(yearly_gas_volume, yearly_gas_volume * 0.1))
            elt_counter += int(random.gauss(yearly_electricity_kwh, yearly_electricity_kwh * 0.1))

            gas_dates.append(current_gas_date)
            elt_dates.append(current_elt_date)
            gas_values.append(gas_counter)
            elt_values.append(elt_counter)

        all_customers.append(
            {
                "electricity": (elt_meter_id, elt_dates, elt_values),
                "gas": (gas_meter_id, gas_dates, gas_values),
            }
        )

    return all_customers
|
||||
|
||||
|
||||
def generate_past_date_with_variance(
    years_ago: int, reference_date: "datetime | None" = None
) -> datetime:
    """Return a date roughly ``years_ago`` years before a reference date.

    The reference date is truncated to midnight, moved back by ``years_ago``
    calendar years and then shifted by a random number of days between -50
    and 50 (inclusive).

    Parameters
    ----------
    years_ago : int
        Number of calendar years to subtract from the reference date.
    reference_date : datetime, optional
        The date to count back from. Defaults to the current local date.

    Returns
    -------
    datetime
        The jittered past date, with the time set to midnight.

    Raises
    ------
    ValueError
        If the resulting year is outside the range supported by ``datetime``.
    """
    if reference_date is None:
        reference_date = datetime.now()

    # Ignore the time of day
    current_date = reference_date.replace(hour=0, minute=0, second=0, microsecond=0)

    # Subtract the specified number of years
    target_year = current_date.year - years_ago
    try:
        past_date = current_date.replace(year=target_year)
    except ValueError:
        # 29 February has no counterpart in a non-leap target year, so fall back to
        # 28 February. A genuinely invalid year raises again here.
        past_date = current_date.replace(year=target_year, day=28)

    # Shift by a random number of days between -50 and 50
    days_variance = random.randint(-50, 50)

    return past_date + timedelta(days=days_variance)
|
||||
|
||||
|
||||
def weighted_random_int(min_value: int = 1, max_value: int = 120) -> int:
    """Generate a random integer with a logarithmic distribution.

    This function produces random integers between min_value and max_value
    (both inclusive), with a distribution skewed towards lower numbers. A
    uniform sample is mapped through an exponential, so each successive
    integer is less likely than the one before it.

    Parameters
    ----------
    min_value : int, optional
        The minimum value of the range (inclusive). Default is 1.
    max_value : int, optional
        The maximum value of the range (inclusive). Default is 120.

    Returns
    -------
    int
        A random integer between min_value and max_value, with a distribution
        skewed towards lower numbers.

    Raises
    ------
    ValueError
        If max_value is smaller than min_value.
    """
    if max_value < min_value:
        raise ValueError(
            f"max_value ({max_value}) must not be smaller than min_value ({min_value})"
        )

    span = max_value - min_value + 1
    r = random.random()

    # random.random() is in [0, 1), so exp(r * log(span + 1)) is in [1, span + 1). Using
    # span + 1 (rather than span) is what makes max_value reachable after flooring.
    value = math.exp(r * math.log(span + 1)) + min_value - 1

    # Round down; the clamp guards against floating-point rounding for r very close to 1.
    return min(int(math.floor(value)), max_value)
|
||||
|
||||
|
||||
def generate_meter_number() -> str:
    """Build a random meter number of the form "X.YYY.ZZZ.Q".

    X and Q are single digits from 1 to 9, while YYY and ZZZ are three-digit
    numbers from 100 to 999. The four parts are joined by dots.

    Returns
    -------
    str
        A randomly generated meter number string in the format "X.YYY.ZZZ.Q".
    """
    # Draw the parts left to right so the random sequence is consumed in output order.
    leading_digit = random.randint(1, 9)
    first_group = random.randint(100, 999)
    second_group = random.randint(100, 999)
    trailing_digit = random.randint(1, 9)

    parts = (leading_digit, first_group, second_group, trailing_digit)
    return ".".join(str(part) for part in parts)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Generate data for app: sample customers from the base data, attach simulated meter
    # readings and write everything to `customers.json`.

    # Number of customers to generate
    num_customers = 1000

    # Load base data file
    # NOTE: All of this information is publicly available, see readme for sources of information
    # that were used
    # The encoding is given explicitly so that non-ASCII characters do not depend on the
    # platform's locale encoding.
    with open(DATA_DIR / "base_data.json", "r", encoding="utf-8") as file:
        base_data = json.load(file)

    zip_results = base_data["zips"]

    # Create a population-weighted sample of zip code entries (with replacement)
    df = pd.DataFrame(zip_results)
    df["weight"] = df["population"] / df["population"].sum()
    selected_zips = df.sample(n=num_customers, weights="weight", replace=True)

    # Generate customer names
    print("Generating customers data...")
    c_given_names = random.choices(base_data["given_names"], k=num_customers)
    c_surnames = random.choices(base_data["surnames"], k=num_customers)

    # Generate addresses
    print("Generating address data...")
    c_streets = random.choices(base_data["streets"], k=num_customers)

    # For street numbers, we just generate a random number between 1 and 120, weighted such that the
    # lower numbers are more likely to occur
    house_numbers = [weighted_random_int(1, 120) for _ in range(num_customers)]

    # Finally, create meter readings
    print("Generating meter readings...")
    readings = generate_readings(num_customers)

    # Create a final list of customers and store as JSON
    print("Creating final JSON file...")
    customers = []
    per_customer = zip(c_given_names, c_surnames, c_streets, house_numbers, readings)
    for i, (given_name, surname, street, house_number, reading) in enumerate(per_customer):
        # Look the sampled row up once instead of once per field
        zip_row = selected_zips.iloc[i]
        customers.append(
            {
                "given_name": given_name,
                "surname": surname,
                "street": street,
                "house_number": house_number,
                "city": zip_row["city"],
                "zip_code": zip_row["zip_codes"],
                "longitude": float(zip_row["longitude"]),
                "latitude": float(zip_row["latitude"]),
                "readings_elt": reading["electricity"],
                "readings_gas": reading["gas"],
            }
        )

    # ensure_ascii=False writes non-ASCII characters verbatim, so the file must be opened with
    # an encoding that can represent them.
    with open(DATA_DIR / "customers.json", "w", encoding="utf-8") as file:
        json.dump(customers, file, indent=4, ensure_ascii=False, cls=DateTimeEncoder)
|
||||
Reference in New Issue
Block a user