"""Module to randomly generate a JSON file with customer data for the Avacon app."""

import json
import math
import random
from datetime import datetime, timedelta
from typing import Dict, List, Tuple

import pandas as pd

# NOTE: Natural gas and electricity consumption depend on various factors, such as the size of
# the household, the age of the building, insulation, etc. For the sake of this example, we
# simply take a known average value and sample from a Gaussian distribution around it.

# Mean yearly natural gas consumption in kWh: average size of a flat in Germany (90 m²) times an
# average consumption of 140 kWh/m².
MEAN_NATURAL_GAS_KWH = 12600
# Calorific value of natural gas in kWh/m³, used to convert energy into the metered volume.
CALORIFIC_VALUE = 11.215
# Conversion factor ("Zustandszahl") accounting for slight pressure differences.
CONVERSION_FACTOR = 0.9692
# Mean yearly electricity consumption in kWh of a 2-person household in Germany.
MEAN_ELECTRICITY_KWH = 3500
# Standard deviation of the yearly consumption, as a fraction of the mean.
RELATIVE_STD_DEV = 0.1


def generate_readings(num_customers: int) -> List[Dict[str, Tuple[str, List[datetime], List[int]]]]:
    """Generate simulated meter readings for a specified number of customers.

    This function creates synthetic data for both natural gas and electricity
    meter readings. Each customer gets between 1 and 10 readings, roughly one
    per year, and gas and electricity are assumed to be read around the same
    date. The yearly consumption is drawn from a Gaussian distribution around
    the configured mean with a standard deviation of 10% of the mean.

    Parameters
    ----------
    num_customers : int
        The number of customers for which to generate meter readings.

    Returns
    -------
    List[Dict[str, Tuple[str, List[datetime], List[int]]]]
        A list of dictionaries, where each dictionary represents a customer and
        contains:
        - 'electricity': A tuple of (meter number, list of reading dates, list of readings)
        - 'gas': A tuple of (meter number, list of reading dates, list of readings)
    """
    # Gas meters count volume, so convert the mean energy consumption from kWh to m³.
    mean_cubic = MEAN_NATURAL_GAS_KWH / (CALORIFIC_VALUE * CONVERSION_FACTOR)

    readings = []
    for _ in range(num_customers):
        # The initial reading of the customer's meters
        gas_reading = random.randint(1000, 60000)
        elt_reading = random.randint(1000, 600_000)

        # Create an avacon-style meter number for each meter
        gas_meter_number = generate_meter_number()
        elt_meter_number = generate_meter_number()

        num_readings = random.randint(1, 10)

        # Initial timestamp: assuming that each reading takes place once a year around a similar
        # date, take today's date and go back as many years as there are readings.
        gas_date = generate_past_date_with_variance(num_readings)

        gas_dates: List[datetime] = []
        elt_dates: List[datetime] = []
        gas_readings: List[int] = []
        elt_readings: List[int] = []
        for j in range(num_readings):
            # Every reading after the first one takes place roughly one year after the previous
            if j > 0:
                gas_date = gas_date + timedelta(days=365 + random.randint(-50, 50))

            # Electricity is read around a similar date as natural gas
            elt_date = gas_date + timedelta(days=random.randint(-10, 10))

            # Advance both meters by one randomly drawn yearly consumption
            gas_reading += int(random.gauss(mean_cubic, mean_cubic * RELATIVE_STD_DEV))
            elt_reading += int(
                random.gauss(MEAN_ELECTRICITY_KWH, MEAN_ELECTRICITY_KWH * RELATIVE_STD_DEV)
            )

            gas_dates.append(gas_date)
            elt_dates.append(elt_date)
            gas_readings.append(gas_reading)
            elt_readings.append(elt_reading)

        readings.append(
            {
                "electricity": (elt_meter_number, elt_dates, elt_readings),
                "gas": (gas_meter_number, gas_dates, gas_readings),
            }
        )

    return readings


def _subtract_years(date: datetime, years: int) -> datetime:
    """Return `date` moved back by `years` calendar years (Feb 29 falls back to Feb 28)."""
    try:
        return date.replace(year=date.year - years)
    except ValueError:
        # Feb 29 does not exist in a non-leap target year; any other error is a real one.
        if date.month == 2 and date.day == 29:
            return date.replace(year=date.year - years, day=28)
        raise


def generate_past_date_with_variance(years_ago: int) -> datetime:
    """Return a date roughly `years_ago` years in the past.

    The result is today's date at midnight, moved back by `years_ago` calendar
    years and then shifted by a random number of days between -50 and 50
    (inclusive).

    Parameters
    ----------
    years_ago : int
        The number of years to go back from today.

    Returns
    -------
    datetime
        The randomly shifted past date, with the time set to 00:00:00.
    """
    # Get current date (ignoring time)
    current_date = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

    past_date = _subtract_years(current_date, years_ago)

    return past_date + timedelta(days=random.randint(-50, 50))


def weighted_random_int(min_value: int = 1, max_value: int = 120) -> int:
    """Generate a random integer with a logarithmic distribution.

    This function produces random integers between min_value and max_value (inclusive),
    with a distribution skewed towards lower numbers. It uses a logarithmic
    transformation to achieve this weighted distribution.

    Parameters
    ----------
    min_value : int, optional
        The minimum value of the range (inclusive). Default is 1.
    max_value : int, optional
        The maximum value of the range (inclusive). Default is 120.

    Returns
    -------
    int
        A random integer between min_value and max_value, with a distribution
        skewed towards lower numbers.

    Raises
    ------
    ValueError
        If max_value is smaller than min_value.
    """
    if max_value < min_value:
        raise ValueError(
            f"max_value ({max_value}) must not be smaller than min_value ({min_value})"
        )

    num_values = max_value - min_value + 1
    r = random.random()

    # Map r from [0, 1) onto [1, num_values + 1) on a logarithmic scale, so that flooring yields
    # every integer in 1..num_values, with lower numbers being more likely.
    scaled = math.exp(r * math.log(num_values + 1))
    value = int(math.floor(scaled)) + min_value - 1

    # Guard against floating-point rounding pushing the result past the upper bound
    return min(value, max_value)


def generate_meter_number() -> str:
    """Generate a random meter number in a specific format.

    This function creates a meter number string in the format "X.YYY.ZZZ.Q",
    where X and Q are single digits from 1 to 9, and YYY and ZZZ are
    three-digit numbers from 100 to 999.

    Returns
    -------
    str
        A randomly generated meter number string in the format "X.YYY.ZZZ.Q".
    """
    first_digit = random.randint(1, 9)
    first_group = random.randint(100, 999)
    second_group = random.randint(100, 999)
    last_digit = random.randint(1, 9)
    return f"{first_digit}.{first_group}.{second_group}.{last_digit}"


def main(num_customers: int = 1000) -> None:
    """Generate the customer data for the app and store it as JSON.

    Reads ``base_data.json`` from the data directory, draws a
    population-weighted sample of zip code areas, assigns random names,
    addresses and meter readings, and writes the result to ``customers.json``
    in the same directory.

    Parameters
    ----------
    num_customers : int, optional
        Number of customers to generate. Default is 1000.
    """
    # Project-local modules are only needed by the script itself; importing them here keeps the
    # generator functions above importable without the project's config on the path.
    from config import DATA_DIR
    from data_utils import DateTimeEncoder

    # Load base data file
    # NOTE: All of this information is publicly available, see readme for sources of information
    # that were used
    # The encoding is explicit so that non-ASCII names do not depend on the platform locale.
    with open(DATA_DIR / "base_data.json", "r", encoding="utf-8") as file:
        base_data = json.load(file)

    zip_results = base_data["zips"]

    # Create a population-weighted sample of customers
    df = pd.DataFrame(zip_results)
    df["weight"] = df["population"] / df["population"].sum()
    selected_zips = df.sample(n=num_customers, weights="weight", replace=True)

    # Generate customer names
    print("Generating customers data...")
    c_given_names = random.choices(base_data["given_names"], k=num_customers)
    c_surnames = random.choices(base_data["surnames"], k=num_customers)

    # Generate addresses
    print("Generating address data...")
    c_streets = random.choices(base_data["streets"], k=num_customers)

    # For street numbers, we just generate a random number between 1 and 120, weighted such that
    # the lower numbers are more likely to occur
    house_numbers = [weighted_random_int(1, 120) for _ in range(num_customers)]

    # Finally, create meter readings
    print("Generating meter readings...")
    readings = generate_readings(num_customers)

    # Create a final list of customers and store as JSON
    print("Creating final JSON file...")
    customers = []
    for i in range(num_customers):
        # Look the sampled row up once instead of once per field
        zip_row = selected_zips.iloc[i]
        customers.append(
            {
                "given_name": c_given_names[i],
                "surname": c_surnames[i],
                "street": c_streets[i],
                "house_number": house_numbers[i],
                "city": zip_row["city"],
                "zip_code": zip_row["zip_codes"],
                "longitude": float(zip_row["longitude"]),
                "latitude": float(zip_row["latitude"]),
                "readings_elt": readings[i]["electricity"],
                "readings_gas": readings[i]["gas"],
            }
        )

    # ensure_ascii=False writes non-ASCII characters verbatim, so the file must be UTF-8.
    with open(DATA_DIR / "customers.json", "w", encoding="utf-8") as file:
        json.dump(customers, file, indent=4, ensure_ascii=False, cls=DateTimeEncoder)


if __name__ == "__main__":
    main()