"""Get base data (Addresses, Names) to form the foundation of the customer generation process."""

import json
from pathlib import Path
from typing import Any, Dict, List, Union

import requests
from config import DATA_DIR

# Public API used to look up the administrative division belonging to a zip code.
_API_URL = "https://gvz.tuerantuer.org/api/administrative_divisions/"

# Seconds to wait for the API before giving up on a single request. Without a
# timeout, one stalled connection would block the whole run indefinitely.
_REQUEST_TIMEOUT_S = 10


def read_file_to_list(file_path: Union[str, Path]) -> List[str]:
    """Read the contents of a file and return them as a list of strings.

    The lines are returned exactly as stored in the file, i.e. including
    their trailing newline characters. Use ``get_datalist`` to obtain
    stripped lines.

    Parameters
    ----------
    file_path : Union[str, Path]
        The path to the file to be read.

    Returns
    -------
    List[str]
        A list containing the lines of the file as strings.
    """
    # Decode explicitly as UTF-8: the platform default encoding is not UTF-8
    # everywhere, which would garble non-ASCII characters such as umlauts.
    with open(file_path, "r", encoding="utf-8") as file:
        return file.readlines()


def get_datalist(file_path: Path) -> List[str]:
    """Read a file and return its contents as a list of stripped strings.

    Parameters
    ----------
    file_path : Path
        The path to the file to be read, as a pathlib.Path object.

    Returns
    -------
    List[str]
        One entry per line of the file, with leading and trailing
        whitespace (including the newline) removed.
    """
    return [line.strip() for line in read_file_to_list(file_path)]


def select_zips() -> Dict[str, List]:
    """Select and retrieve information about zip codes within the Avacon Netz area.

    This function reads a list of German zip codes, looks each one up using
    a public API, and keeps those whose coordinates fall inside a
    latitude/longitude bounding box approximating the Avacon Netz coverage
    area.

    Returns
    -------
    Dict[str, List]
        A dictionary containing the following keys, each associated with a list.
        All lists have the same length and are aligned by index:
        - 'latitude': Latitudes of selected zip codes
        - 'longitude': Longitudes of selected zip codes
        - 'population': Population counts for selected zip codes
        - 'city': City names for selected zip codes
        - 'zip_codes': Selected zip codes

    Notes
    -----
    - This function requires an internet connection to access the API.
    - The function relies on the imported constant DATA_DIR pointing to
      the directory containing the 'zip_codes.txt' file.
    - The API used is 'https://gvz.tuerantuer.org/api/administrative_divisions/'.
    - Zip codes for which the request fails or no data is returned are
      reported on stdout and skipped.
    """
    lat_list = []
    lon_list = []
    pop_list = []
    selected_zips = []
    city_list = []
    # Mirror of ``selected_zips`` used only for fast membership tests.
    seen_zips = set()

    # Get list of all zip codes in Germany. The lines must be stripped: with the
    # trailing newline left in place, a zip code would never compare equal to
    # the codes returned by the API and the newline would end up in the query.
    zip_list = get_datalist(DATA_DIR / "zip_codes.txt")

    # Geographical settings:
    # The main idea is to roughly describe the Avacon Netz area, then use an API to go through all
    # zip codes in Germany and only keep those that are inside the longitude and latitude range.
    # We use the same API to get the actual latitude and longitude for each zip code and the total
    # population so that we can use this information to weight the sampling of customers.

    # Latitude range: from the approximate southernmost point of Avacon Netz coverage (south Harz
    # mountains close to Ilfeld) to the approximate northernmost point (near Geesthacht)
    min_lat, max_lat = 51.618147, 53.432608
    # Longitude range: from the approximate westernmost point of Avacon Netz coverage (near
    # Vechta) to the approximate easternmost point (near Reuden/Anhalt)
    min_lon, max_lon = 8.476919, 12.270417

    # A session reuses the underlying connection across the many lookups.
    with requests.Session() as session:
        for zip_code in zip_list:
            # Skip blank lines and zip codes already collected as an
            # "associated" zip code of an earlier result.
            if not zip_code or zip_code in seen_zips:
                continue

            # Get additional data by using this public API
            try:
                response = session.get(
                    _API_URL,
                    params={"search": zip_code},
                    timeout=_REQUEST_TIMEOUT_S,
                )
            except requests.RequestException as exc:
                print(f"Request failed for {zip_code}: {exc}")
                continue

            if response.status_code != 200:
                print(f"No data found for {zip_code}")
                continue

            response_json = response.json()
            if response_json["count"] <= 0:
                print(f"No data found for {zip_code}")
                continue

            # Only the first search result is considered.
            zip_data = response_json["results"][0]
            lat = zip_data["latitude"]
            lon = zip_data["longitude"]

            # Entries without coordinates cannot be placed inside the bounding box.
            if lat is None or lon is None:
                continue

            # Check if the zip code is within the Avacon Netz area
            if not ((min_lat <= lat <= max_lat) and (min_lon <= lon <= max_lon)):
                continue

            # Add any associated zip codes; every one of them shares the
            # coordinates, population and city of this result.
            associated_zips = zip_data["zip_codes"]
            num_associated_zips = len(associated_zips)
            selected_zips += associated_zips
            seen_zips.update(associated_zips)
            lat_list += [lat] * num_associated_zips
            lon_list += [lon] * num_associated_zips
            pop_list += [zip_data["citizens_total"]] * num_associated_zips
            city_list += [zip_data["office_city"]] * num_associated_zips

    result = {
        "latitude": lat_list,
        "longitude": lon_list,
        "population": pop_list,
        "city": city_list,
        "zip_codes": selected_zips,
    }

    return result


def main() -> None:
    """Collect names, streets and zip code data and write them to 'base_data.json'."""
    data: Dict[str, Union[List[Any], Dict[str, List[Any]]]] = {}

    # Get given names and surnames
    data["surnames"] = get_datalist(DATA_DIR / "names.txt")
    data["given_names"] = get_datalist(DATA_DIR / "female_gnames.txt") + get_datalist(
        DATA_DIR / "male_gnames.txt"
    )

    # Get list of street names
    data["streets"] = get_datalist(DATA_DIR / "street_names.txt")

    # Get list of all zip codes and select those in the Avacon Netz area.
    # Also include geographical and population data, which is needed for sampling later.
    data["zips"] = select_zips()

    # Save data to file. ``ensure_ascii=False`` writes non-ASCII characters
    # verbatim, so the file must be opened with an encoding that can represent them.
    with open(DATA_DIR / "base_data.json", "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    main()