feat/add-data #3
162
data_preparation/get_base_data.py
Normal file
162
data_preparation/get_base_data.py
Normal file
@@ -0,0 +1,162 @@
|
||||
"""Get base data (Addresses, Names) to form the foundation of the customer generation process."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
import requests
|
||||
from config import DATA_DIR
|
||||
|
||||
|
||||
def read_file_to_list(file_path: Union[str, Path]) -> List[str]:
    """Read the contents of a text file and return them as a list of lines.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's default encoding, so non-ASCII characters (e.g. umlauts in
    names or street names) are read identically on every system.

    Parameters
    ----------
    file_path : str or Path
        The path to the file to be read.

    Returns
    -------
    List[str]
        The lines of the file in their original order. Line endings are
        preserved, i.e. the lines are NOT stripped. An empty file yields an
        empty list.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return file.readlines()
|
||||
|
||||
|
||||
def get_datalist(file_path: Path) -> List[str]:
    """Load a text file as a list of whitespace-trimmed lines.

    Every line of the file is read via ``read_file_to_list`` and has its
    leading and trailing whitespace (including the line ending) removed.

    Parameters
    ----------
    file_path : Path
        Location of the file to load, as a pathlib.Path object.

    Returns
    -------
    List[str]
        One entry per line of the file, in file order, each stripped of
        surrounding whitespace. Lines that contain only whitespace are kept
        as empty strings.
    """
    return [raw_line.strip() for raw_line in read_file_to_list(file_path)]
|
||||
|
||||
|
||||
def select_zips() -> Dict[str, List]:
    """Select and retrieve information about zip codes within the Avacon Netz area.

    This function reads a list of German zip codes, queries a public API for
    each of them, and keeps those whose administrative division lies inside a
    latitude/longitude bounding box that roughly approximates the Avacon Netz
    coverage area.

    When a division matches, all zip codes the API associates with that
    division are added at once, each with the division's coordinates,
    population and city name. Zip codes that were already added this way are
    not queried again.

    Returns
    -------
    Dict[str, List]
        A dictionary containing the following keys, each associated with a
        list. All lists have the same length and are aligned by index:
        - 'latitude': Latitudes of selected zip codes
        - 'longitude': Longitudes of selected zip codes
        - 'population': Population counts for selected zip codes
        - 'city': City names for selected zip codes
        - 'zip_codes': Selected zip codes

    Notes
    -----
    - This function requires an internet connection to access the API and
      performs one HTTP request per zip code that has not been selected yet.
    - The function reads 'zip_codes.txt' from the directory given by the
      module-level constant DATA_DIR.
    - The API used is 'https://gvz.tuerantuer.org/api/administrative_divisions/'.
    - Errors raised by ``requests`` (connection problems, timeouts) are not
      caught and propagate to the caller.
    """
    lat_list = []
    lon_list = []
    pop_list = []
    selected_zips = []
    city_list = []

    # Mirror of selected_zips used only for fast "already handled?" lookups;
    # the list is kept because the output must stay aligned with the other lists.
    seen_zips = set()

    # Seconds to wait for the API before giving up on a single request.
    # Without a timeout a stalled connection would block the script forever.
    request_timeout = 30

    # Get list of all zip codes in Germany.
    # The lines must be stripped: a trailing newline would end up in the request
    # URL and would prevent the duplicate check below from ever matching.
    zip_list = get_datalist(DATA_DIR / "zip_codes.txt")

    # Geographical settings:
    # The main idea is to roughly describe the Avacon Netz area, then use an API to go through all
    # zip codes in Germany and only keep those that are inside the longitude and latitude range.
    # We use the same API to get the actual latitude and longitude for each zip code and the total
    # population so that we can use this information to weight the sampling of customers.

    # Approximate Northernmost point of Avacon Netz coverage near Geesthacht to southernmost point
    # in south Harz mountains close to Ilfeld
    min_lat, max_lat = 51.618147, 53.432608
    # Approximate Westernmost point of Avacon Netz coverage near Vechta to easternmost point near
    # Reuden/Anhalt
    min_lon, max_lon = 8.476919, 12.270417

    for zip_code in zip_list:
        # Skip blank lines (an empty search term is not a zip code) and zip
        # codes that were already added as part of an earlier division.
        if not zip_code or zip_code in seen_zips:
            continue

        # Get additional data by using this public API
        response = requests.get(
            f"https://gvz.tuerantuer.org/api/administrative_divisions/?search={zip_code}",
            timeout=request_timeout,
        )

        if response.status_code != 200:
            print(f"No data found for {zip_code}")
            continue

        response_json = response.json()
        if response_json["count"] <= 0:
            print(f"No data found for {zip_code}")
            continue

        # Only the first search hit is considered.
        zip_data = response_json["results"][0]
        lat = zip_data["latitude"]
        lon = zip_data["longitude"]

        # Check if the zip code is within the Avacon Netz area
        if (min_lat <= lat <= max_lat) and (min_lon <= lon <= max_lon):
            associated_zips = zip_data["zip_codes"]
            num_associated_zips = len(associated_zips)

            # Add any associated zip codes; every one of them gets the
            # division-level coordinates, population and city.
            selected_zips += associated_zips
            seen_zips.update(associated_zips)
            lat_list += [lat] * num_associated_zips
            lon_list += [lon] * num_associated_zips
            pop_list += [zip_data["citizens_total"]] * num_associated_zips
            city_list += [zip_data["office_city"]] * num_associated_zips

    result = {
        "latitude": lat_list,
        "longitude": lon_list,
        "population": pop_list,
        "city": city_list,
        "zip_codes": selected_zips,
    }

    return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Collects all base data; every entry is either a flat list or, for the
    # zip codes, a dictionary of index-aligned lists.
    data: Dict[str, Union[list[Any], Dict[str, list[Any]]]] = {}

    # Get given names and surnames
    data["surnames"] = get_datalist(DATA_DIR / "names.txt")
    data["given_names"] = get_datalist(DATA_DIR / "female_gnames.txt") + get_datalist(
        DATA_DIR / "male_gnames.txt"
    )

    # Get list of street names
    data["streets"] = get_datalist(DATA_DIR / "street_names.txt")

    # Get list of all zip codes and select those in the Avacon Netz area.
    # Also include geographical and population data, which is needed for sampling later.
    data["zips"] = select_zips()

    # Save data to file.
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
    # opened as UTF-8 explicitly; the platform default encoding may not be able to
    # represent them.
    with open(DATA_DIR / "base_data.json", "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4, ensure_ascii=False)
|
||||
Reference in New Issue
Block a user