The script `get_base_data` takes the raw data files (such as `names.txt`) and combines them into a common JSON file, which can later be used to randomly generate customer and meter-reading data. Additionally, the script filters all zip codes down to those that lie within an approximate Avacon Netz service area and adds some additional information (latitude, longitude, population and city) for them. An example output file, `base_data.json`, has been added to the repo in a previous commit.
163 lines
5.7 KiB
Python
163 lines
5.7 KiB
Python
"""Get base data (Addresses, Names) to form the foundation of the customer generation process."""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Union
|
|
|
|
import requests
|
|
from config import DATA_DIR
|
|
|
|
|
|
def read_file_to_list(file_path: Union[str, Path]) -> List[str]:
    """Read a text file and return its lines as a list of strings.

    The file is decoded as UTF-8 explicitly, so non-ASCII characters (for
    example umlauts in names and street names) are read the same way on every
    platform instead of depending on the locale's default encoding.

    Parameters
    ----------
    file_path : str or Path
        The path to the file to be read. The file is expected to be UTF-8
        encoded text.

    Returns
    -------
    List[str]
        The lines of the file in order. Line endings are kept on each entry;
        an empty file yields an empty list.

    Raises
    ------
    OSError
        If the file cannot be opened (for example, it does not exist).
    UnicodeDecodeError
        If the file content is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return file.readlines()
|
|
|
|
|
|
def get_datalist(file_path: Path) -> List[str]:
    """Read a file and return its lines with surrounding whitespace removed.

    The file is read via ``read_file_to_list`` and every line is passed
    through ``str.strip``, which removes the trailing newline as well as any
    other leading or trailing whitespace.

    Parameters
    ----------
    file_path : Path
        The path to the file to be read.

    Returns
    -------
    List[str]
        One stripped string per line of the file, in file order. Blank lines
        are kept and appear as empty strings.
    """
    # Build a new list instead of mutating the freshly read one by index.
    return [line.strip() for line in read_file_to_list(file_path)]
|
|
|
|
|
|
def select_zips(*, timeout: float = 10.0) -> Dict[str, List]:
    """Select and retrieve information about zip codes within the Avacon Netz area.

    This function reads a list of German zip codes, looks each one up in a
    public API, and keeps those whose coordinates fall inside a latitude /
    longitude bounding box that roughly approximates the Avacon Netz coverage
    area. For every match, all zip codes the API associates with the same
    result are added together with that result's coordinates, population and
    city name.

    Parameters
    ----------
    timeout : float, optional
        Number of seconds to wait for each API request before giving up on
        that zip code. Defaults to 10 seconds.

    Returns
    -------
    Dict[str, List]
        A dictionary containing the following keys, each associated with a list
        (all lists have the same length and are aligned by index):
        - 'latitude': Latitudes of selected zip codes
        - 'longitude': Longitudes of selected zip codes
        - 'population': Population counts for selected zip codes
        - 'city': City names for selected zip codes
        - 'zip_codes': Selected zip codes

    Notes
    -----
    - This function requires an internet connection to access the API.
    - The function assumes the existence of a global constant DATA_DIR
      pointing to the directory containing the 'zip_codes.txt' file.
    - The API used is 'https://gvz.tuerantuer.org/api/administrative_divisions/'.
    - Zip codes for which the request fails, the API answers with a status
      other than 200, or no result is returned are reported on stdout and
      skipped; they do not abort the run.
    """
    lat_list = []
    lon_list = []
    pop_list = []
    selected_zips = []
    city_list = []
    # Mirror of ``selected_zips`` used for constant-time "already handled" checks.
    # Assumes the API returns zip codes as strings, like the entries of the
    # input file -- TODO confirm against the API response.
    seen_zips = set()

    # Get list of all zip codes in Germany. The stripped variant is required:
    # with the raw lines, the trailing newline would end up in the request URL
    # and the duplicate check below could never match.
    zip_list = get_datalist(DATA_DIR / "zip_codes.txt")

    # Geographical settings:
    # The main idea is to roughly describe the Avacon Netz area, then use an API to go through all
    # zip codes in Germany and only keep those that are inside the longitude and latitude range.
    # We use the same API to get the actual latitude and longitude for each zip code and the total
    # population so that we can use this information to weight the sampling of customers.

    # Approximate Northernmost point of Avacon Netz coverage near Geesthacht to southernmost point
    # in south Harz mountains close to Ilfeld
    min_lat, max_lat = 51.618147, 53.432608
    # Approximate Westernmost point of Avacon Netz coverage near Vechta to easternmost point near
    # Reuden/Anhalt
    min_lon, max_lon = 8.476919, 12.270417

    for zip_code in zip_list:
        # Skip blank lines and zip codes that were already added as an
        # "associated" zip code of an earlier result.
        if not zip_code or zip_code in seen_zips:
            continue

        # Get additional data by using this public API
        try:
            response = requests.get(
                f"https://gvz.tuerantuer.org/api/administrative_divisions/?search={zip_code}",
                timeout=timeout,
            )
        except requests.RequestException as err:
            # One unreachable lookup should not discard all results so far.
            print(f"Request failed for {zip_code}: {err}")
            continue

        if response.status_code != 200:
            print(f"No data found for {zip_code}")
            continue

        response_json = response.json()
        if response_json["count"] <= 0:
            print(f"No data found for {zip_code}")
            continue

        # Only the first search result is considered.
        zip_data = response_json["results"][0]
        lat = zip_data["latitude"]
        lon = zip_data["longitude"]

        # Check if the zip code is within the Avacon Netz area
        if not ((min_lat <= lat <= max_lat) and (min_lon <= lon <= max_lon)):
            continue

        associated_zips = zip_data["zip_codes"]
        num_associated_zips = len(associated_zips)

        # Add any associated zip-codes; every one of them gets the same
        # coordinates, population and city so the lists stay index-aligned.
        selected_zips += associated_zips
        seen_zips.update(associated_zips)
        lat_list += [lat] * num_associated_zips
        lon_list += [lon] * num_associated_zips
        pop_list += [zip_data["citizens_total"]] * num_associated_zips
        city_list += [zip_data["office_city"]] * num_associated_zips

    result = {
        "latitude": lat_list,
        "longitude": lon_list,
        "population": pop_list,
        "city": city_list,
        "zip_codes": selected_zips,
    }

    return result
|
|
|
|
|
|
if __name__ == "__main__":
    # Collect all base data in one dictionary: plain lists for names and
    # streets, and a dictionary of index-aligned lists for the zip code data.
    data: Dict[str, Union[list[Any], Dict[str, list[Any]]]] = {}

    # Get given names and surnames
    data["surnames"] = get_datalist(DATA_DIR / "names.txt")
    data["given_names"] = get_datalist(DATA_DIR / "female_gnames.txt") + get_datalist(
        DATA_DIR / "male_gnames.txt"
    )

    # Get list of street names
    data["streets"] = get_datalist(DATA_DIR / "street_names.txt")

    # Get list of all zip codes and select those in the Avacon Netz area.
    # Also include geographical and population data, which is needed for sampling later.
    data["zips"] = select_zips()

    # Save data to file. Because ensure_ascii=False writes non-ASCII characters
    # verbatim, the file encoding is fixed to UTF-8 so the output does not
    # depend on (or fail under) the platform's default encoding.
    with open(DATA_DIR / "base_data.json", "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4, ensure_ascii=False)
|