feat(add-data): Add script for acquiring base data files

The script `get_base_data` takes the raw datafiles (such as `names.txt`)
and formats them in a common JSON file, which can be later used to
randomly generate customer and meter readings data.

Additionally, the script filters all eligible zip codes within an
approximate Avacon Netz service area and provides some additional
information for them.

An example output file, `base_data.json`, was added to the repo in
a previous commit.
This commit is contained in:
Tobias Quadfasel
2024-08-31 14:10:56 +02:00
parent 22c05e827c
commit 49ff1bbfec

View File

@@ -0,0 +1,162 @@
"""Get base data (Addresses, Names) to form the foundation of the customer generation process."""
import json
from pathlib import Path
from typing import Any, Dict, List, Union
import requests
from config import DATA_DIR
def read_file_to_list(file_path: Union[str, Path]) -> List[str]:
    """Read the contents of a file and return them as a list of strings.

    The file is opened in text mode and decoded as UTF-8, independent of the
    platform's default encoding, so non-ASCII characters (e.g. umlauts) are
    read the same way on every system. Each element of the returned list is
    one line of the file; line endings are NOT removed.

    Parameters
    ----------
    file_path : str or Path
        The path to the file to be read.

    Returns
    -------
    List[str]
        A list containing the lines of the file as strings, in file order.
        An empty file yields an empty list.
    """
    # Explicit encoding: without it, open() falls back to the locale's
    # preferred encoding, which differs between platforms.
    with open(file_path, "r", encoding="utf-8") as file:
        return file.readlines()
def get_datalist(file_path: Path) -> List[str]:
    """Load a text file as a list of whitespace-trimmed lines.

    The file is read via ``read_file_to_list`` and every line has its leading
    and trailing whitespace (including the line ending) removed. Lines are
    kept in file order; lines that are empty after stripping are kept as
    empty strings.

    Parameters
    ----------
    file_path : Path
        Location of the file to load, as a pathlib.Path object.

    Returns
    -------
    List[str]
        One stripped string per line of the file.
    """
    return [raw_line.strip() for raw_line in read_file_to_list(file_path)]
def select_zips() -> Dict[str, List]:
    """Select and retrieve information about zip codes within the Avacon Netz area.

    This function reads a list of German zip codes, queries a public API for
    each of them, and keeps those whose reported coordinates fall inside a
    latitude/longitude bounding box that roughly approximates the Avacon Netz
    coverage area. For every match, all zip codes that the API associates
    with the same result are recorded together with that result's latitude,
    longitude, population and city.

    Zip codes that were already recorded as an associated zip code of an
    earlier result are not queried again. Zip codes for which the request
    fails, the API does not answer with status 200, or the API returns no
    results are reported on stdout and skipped.

    Returns
    -------
    Dict[str, List]
        A dictionary containing the following keys, each associated with a
        list. All lists have the same length and are aligned by index:
        - 'latitude': Latitudes of selected zip codes
        - 'longitude': Longitudes of selected zip codes
        - 'population': Population counts for selected zip codes
        - 'city': City names for selected zip codes
        - 'zip_codes': Selected zip codes

    Notes
    -----
    - This function requires an internet connection to access the API.
    - The zip codes are read from 'zip_codes.txt' inside DATA_DIR, one zip
      code per line.
    - The API used is 'https://gvz.tuerantuer.org/api/administrative_divisions/'.
    - Only the first search result per zip code is considered.
    """
    api_url = "https://gvz.tuerantuer.org/api/administrative_divisions/"
    # Upper bound (seconds) for a single request so one stalled connection
    # cannot block the whole run.
    request_timeout = 30

    lat_list: List = []
    lon_list: List = []
    pop_list: List = []
    selected_zips: List = []
    city_list: List = []
    # Mirrors `selected_zips` for O(1) "already handled" checks.
    seen_zips = set()

    # Get list of all zip codes in Germany. The lines must be stripped:
    # with the trailing newline still attached, a zip code would never match
    # the clean codes returned by the API and would end up in the query.
    zip_list = get_datalist(DATA_DIR / "zip_codes.txt")

    # Geographical settings:
    # The main idea is to roughly describe the Avacon Netz area, then use an API to go through all
    # zip codes in Germany and only keep those that are inside the longitude and latitude range.
    # We use the same API to get the actual latitude and longitude for each zip code and the total
    # population so that we can use this information to weight the sampling of customers.

    # Approximate Northernmost point of Avacon Netz coverage near Geesthacht to southernmost point
    # in south Harz mountains close to Ilfeld
    min_lat, max_lat = 51.618147, 53.432608
    # Approximate Westernmost point of Avacon Netz coverage near Vechta to easternmost point near
    # Reuden/Anhalt
    min_lon, max_lon = 8.476919, 12.270417

    # One session for all requests so the connection to the API host is reused.
    with requests.Session() as session:
        for zip_code in zip_list:
            # Skip blank lines (an empty search term is not a zip code) and
            # zip codes already collected through an earlier result.
            if not zip_code or zip_code in seen_zips:
                continue

            # Get additional data by using this public API
            try:
                response = session.get(
                    api_url, params={"search": zip_code}, timeout=request_timeout
                )
            except requests.RequestException as err:
                # A single failed request should not discard everything
                # collected so far.
                print(f"Request failed for {zip_code}: {err}")
                continue

            if response.status_code != 200:
                print(f"No data found for {zip_code}")
                continue

            response_json = response.json()
            if response_json["count"] <= 0:
                print(f"No data found for {zip_code}")
                continue

            zip_data = response_json["results"][0]
            lat = zip_data["latitude"]
            lon = zip_data["longitude"]

            # Check if the zip code is within the Avacon Netz area
            if not ((min_lat <= lat <= max_lat) and (min_lon <= lon <= max_lon)):
                continue

            # Add any associated zip-codes; every one of them gets the same
            # coordinates, population and city so the lists stay aligned.
            # NOTE(review): assumes the API returns zip codes as strings in
            # the same format as 'zip_codes.txt' - confirm.
            associated_zips = zip_data["zip_codes"]
            num_associated_zips = len(associated_zips)
            selected_zips += associated_zips
            seen_zips.update(associated_zips)
            lat_list += [lat] * num_associated_zips
            lon_list += [lon] * num_associated_zips
            pop_list += [zip_data["citizens_total"]] * num_associated_zips
            city_list += [zip_data["office_city"]] * num_associated_zips

    result = {
        "latitude": lat_list,
        "longitude": lon_list,
        "population": pop_list,
        "city": city_list,
        "zip_codes": selected_zips,
    }
    return result
def main() -> None:
    """Collect all base data and write it to 'base_data.json' in DATA_DIR.

    The resulting JSON object has the keys 'surnames', 'given_names',
    'streets' (each a list of strings read from the corresponding text file
    in DATA_DIR) and 'zips' (the dictionary returned by ``select_zips``).
    """
    data: Dict[str, Union[List[Any], Dict[str, List[Any]]]] = {}

    # Get given names and surnames
    data["surnames"] = get_datalist(DATA_DIR / "names.txt")
    data["given_names"] = get_datalist(DATA_DIR / "female_gnames.txt") + get_datalist(
        DATA_DIR / "male_gnames.txt"
    )

    # Get list of street names
    data["streets"] = get_datalist(DATA_DIR / "street_names.txt")

    # Get list of all zip codes and select those in the Avacon Netz area.
    # Also include geographical and population data, which is needed for sampling later.
    data["zips"] = select_zips()

    # Save data to file. ensure_ascii=False writes non-ASCII characters
    # verbatim, so the file encoding is fixed to UTF-8 instead of relying on
    # the platform default.
    with open(DATA_DIR / "base_data.json", "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    main()