"""Get base data (Addresses, Names) to form the foundation of the customer generation process.""" import json from pathlib import Path from typing import Any, Dict, List, Union import requests from config import DATA_DIR def read_file_to_list(file_path: Union[str, Path]) -> List[str]: """Read the contents of a file and return them as a list of strings. This function opens the specified file, reads all lines, and returns them as a list where each element is a line from the file. Parameters ---------- file_path : str The path to the file to be read. Returns ------- List[str] A list containing the lines of the file as strings. """ with open(file_path, "r") as file: return file.readlines() def get_datalist(file_path: Path) -> List[str]: """Read a file and return its contents as a list of stripped strings. This function reads the contents of the specified file, removes leading and trailing whitespace from each line, and returns the resulting lines as a list of strings. Parameters ---------- file_path : Path The path to the file to be read, as a pathlib.Path object. Returns ------- List[str] A list containing the stripped lines of the file as strings. """ data_list = read_file_to_list(file_path) for idx in range(len(data_list)): data_list[idx] = data_list[idx].strip() return data_list def select_zips() -> Dict[str, List]: """Select and retrieve information about zip codes within the Avacon Netz area. This function reads a list of German zip codes, filters them based on geographical boundaries approximating the Avacon Netz coverage area, and retrieves additional information for each selected zip code using a public API. The function uses predefined latitude and longitude ranges to determine if a zip code falls within the Avacon Netz area. Returns ------- Dict[str, List] A dictionary containing the following keys, each associated with a list: - 'latitude': Latitudes of selected zip codes - 'longitude': Longitudes of selected zip codes - 'population': Population counts for selected zip codes - 'city': City names for selected zip codes - 'zip_codes': Selected zip codes Notes ----- - This function requires an internet connection to access the API. - The function assumes the existence of a global constant DATA_DIR pointing to the directory containing the 'zip_codes.txt' file. - The API used is 'https://gvz.tuerantuer.org/api/administrative_divisions/'. """ lat_list = [] lon_list = [] pop_list = [] selected_zips = [] city_list = [] # Get list of all zip codes in Germany zip_list = read_file_to_list(DATA_DIR / "zip_codes.txt") # Geographical settings: # The main idea is to roughly describe the Avacon Netz area, then use an API to go through all # zip codes in Germany and only keep those that are inside the longitude and latitude range. # We use the same API to get the actual latutide and longitude for each zip code and the total # population so that we can use this information to weight the sampling of customers. # Approximate Northernmost point of Avacon Netz coverage near Geesthacht to southernmost point # in south Harz mountains close to Ilfeld min_lat, max_lat = 51.618147, 53.432608 # Approximate Westernmost point of Avacon Netz coverage near Vechta to easternmost point near # Reuden/Anhalt min_lon, max_lon = 8.476919, 12.270417 for zip_code in zip_list: if zip_code in selected_zips: continue # Get additional data by using this public API response = requests.get( f"https://gvz.tuerantuer.org/api/administrative_divisions/?search={zip_code}" ) if response.status_code == 200: response_json = response.json() if response_json["count"] > 0: zip_data = response_json["results"][0] lat = zip_data["latitude"] lon = zip_data["longitude"] # Check if the zip code is within the Avacon Netz area if (min_lat <= lat <= max_lat) and (min_lon <= lon <= max_lon): num_associated_zips = len(zip_data["zip_codes"]) selected_zips += zip_data["zip_codes"] # Add any associated zip-codes lat_list += [lat] * num_associated_zips lon_list += [lon] * num_associated_zips pop_list += [zip_data["citizens_total"]] * num_associated_zips city_list += [zip_data["office_city"]] * num_associated_zips else: print(f"No data found for {zip_code}") result = { "latitude": lat_list, "longitude": lon_list, "population": pop_list, "city": city_list, "zip_codes": selected_zips, } return result if __name__ == "__main__": data: Dict[str, Union[list[Any], Dict[str, list[Any]]]] = {} # Get given names and surnames data["surnames"] = get_datalist(DATA_DIR / "names.txt") data["given_names"] = get_datalist(DATA_DIR / "female_gnames.txt") + get_datalist( DATA_DIR / "male_gnames.txt" ) # Get list of street names data["streets"] = get_datalist(DATA_DIR / "street_names.txt") # Get list of all zip codes and select those in the Avacon Netz area. # Also include geographical and population data, which is needed for sampling later. data["zips"] = select_zips() # Save data to file with open(DATA_DIR / "base_data.json", "w") as file: json.dump(data, file, indent=4, ensure_ascii=False)