"""
Extract official country names from the UNTERM database.

The United Nations Terminology Database is a multilingual database for
official terminology relevant to the work of the United Nations.
Official terminology is provided in the six official languages of the
United Nations — Arabic, Chinese, English, French, Spanish and Russian.

Running the script queries the UNTERM API for every record in the
"Country names" subject, fetches each record, and writes the collected
names to two JSON files (see OUTPUT_FILE_NAME and OUTPUT_FILE_NAME_SMALL).
"""
import json
import math
import sys

import requests

GET_RECORDS_ID_URL = "https://conferences.unite.un.org/untermapi/api/term/search?query=*&page="
GET_INFO_COUNTRY_URL = "https://conferences.unite.un.org/untermapi/api/record/"
OUTPUT_FILE_NAME = "un_formal_country_names.json"
OUTPUT_FILE_NAME_SMALL = "un_formal_country_names_min.json"

REQUEST_HEADERS = {
    'content-type': 'application/json',
    'accept': 'application/json, text/plain'
}

# Body of the search request sent for every result page.
REQUEST_DATA = {
    "searchTerm": "*",
    "searchType": 0,
    "searchLanguages": ["ar", "es", "en", "fr", "ru", "zh"],
    "languagesDisplay": ["en", "fr"],
    "datasets": ["UNHQ"],
    "bodies": [],
    "subjects": ["Country names"],
    "recordTypes": [],
    "acronymSearch": True,
    "localDBSearch": True,
    "termTitleSearch": True,
    "phraseologySearch": False,
    "footnoteSearch": False,
    "fullTextSearch": False,
    "facetedSearch": True,
    "buildSubjectList": False
}

# Maps the `languageId` found on a record's special fields to the language
# key used in the output.
languagesIDToLanguage = {
    '5a504ded-6ba9-49c9-98aa-c8418cbb1ce9': 'arabic',
    '676ccfad-a7b3-4e36-8d49-b21a9b5e0d7d': 'chinese',
    '77ebac02-6a97-4722-859a-de5896a4d34e': 'english',
    'ca7853cf-0f94-4aa6-be0a-ea586361c19e': 'french',
    '496bcad8-7cd0-4f46-85c7-23b91e7a4ad2': 'russian',
    'af320177-7bdb-40df-9521-6d2e0478f129': 'spanish',
}

# Special fields stored once per country: field name -> output key.
_GLOBAL_FIELDS = {
    'ISO Country alpha-2-code': 'iso_a2',
    'ISO Country alpha-3-code': 'iso_a3',
    'Date of Entry in UN': 'un_entry_date',
}

# Special fields stored once per language: field name -> output key.
_LOCALIZED_FIELDS = {
    'Monetary Unit': 'monetary_unit',
    'Currency Designation': 'currency_designation',
    'Currency Symbol': 'currency_symbol',
    'Fractional Unit': 'fractional_unit',
    'Capital City': 'capital_city',
    'Adjective': 'adjective',
}


def main():
    """Download every country record and write the two JSON output files.

    Countries are keyed by their ISO alpha-2 code; a record without that
    code raises KeyError.
    """
    countries = {}
    for record_id in getRecordsID():
        info = getInfoCountry(record_id)
        countries[info['iso_a2']] = info

    # Sort the dictionary so that every run produces the same output.
    countries = {code: countries[code] for code in sorted(countries)}

    # Explicit UTF-8: with ensure_ascii=False the second file contains raw
    # Arabic/Chinese/Cyrillic text, which a non-UTF-8 locale default encoding
    # cannot always represent.
    with open(OUTPUT_FILE_NAME, 'w', encoding='utf-8') as f:
        json.dump(countries, f)
    with open(OUTPUT_FILE_NAME_SMALL, 'w', encoding='utf-8') as f:
        json.dump(countries, f, ensure_ascii=False)


def getRecordsID():
    """Return the recordID of every search result, across all result pages.

    The IDs can then be passed to getInfoCountry(). The process exits with
    status 1 if the first request fails, 3 or 4 if a response has no
    'results' key, and 5 if a later page cannot be fetched.
    """

    def getRecordsIDFromResponse(resp):
        """Return all the recordID values found in one page of results."""
        payload = resp.json()
        if 'results' not in payload:
            print("Missing results in request")
            sys.exit(4)
        return [r['recordID'] for r in payload['results'] if 'recordID' in r]

    body = json.dumps(REQUEST_DATA)

    firstRequest = requests.post(GET_RECORDS_ID_URL + "0", data=body, headers=REQUEST_HEADERS)
    if firstRequest.status_code != 200:
        print("The first request failed. Check the URL")
        print(firstRequest.status_code)
        print(firstRequest.text)
        sys.exit(1)

    data = firstRequest.json()
    if 'results' not in data:
        print("Missing results")
        sys.exit(3)

    recordsID = getRecordsIDFromResponse(firstRequest)

    # Round the page count up so a partially filled last page is included.
    nbPage = math.ceil(int(data['totalHits']) / int(data['pageSize']))

    # Page 0 was fetched above; pages are assumed to be zero-indexed (the
    # first request asks for page "0"), so the remaining ones are
    # 1 .. nbPage - 1.
    for page in range(1, nbPage):
        print(page)
        res = requests.post(GET_RECORDS_ID_URL + str(page), data=body, headers=REQUEST_HEADERS)
        if res.status_code != 200:
            print("Unable to get a page that should be accessible")
            sys.exit(5)
        recordsID.extend(getRecordsIDFromResponse(res))

    return recordsID


def getInfoCountry(recordID):
    """Fetch one record and return the country information it contains.

    Each record should match a country, so this function extracts the names
    of that country in the 6 official UN languages, plus the special fields
    listed in _GLOBAL_FIELDS and _LOCALIZED_FIELDS.

    The returned dict has one entry per language (lower-cased language name)
    holding at least 'short', 'full' and 'map_usable', and top-level entries
    such as 'iso_a2' when the record provides them. The process exits with
    status 2 if the record cannot be fetched.
    """

    def removeExtraInfo(countryName):
        """Drop a trailing parenthesised remark, e.g. 'Name (the)' -> 'Name'."""
        if '(' in countryName:
            return countryName[:countryName.index('(')].strip()
        return countryName

    res = requests.get(GET_INFO_COUNTRY_URL + recordID)
    if res.status_code != 200:
        print(f"Error {res.status_code} requesting {recordID}")
        sys.exit(2)

    data = res.json()
    countryInfo = {}

    for lang in sorted(data['languages']):
        lang = lang.lower()
        names = {'short': '', 'full': ''}
        countryInfo[lang] = names

        # Each term carries its own type, which becomes the key it is
        # stored under.
        for term in data[lang]['terms']:
            names[term['termType']] = term['term']

        # Some countries have no official short name in some languages
        # (e.g. the UK in Russian): fall back to the full name.
        if not names['short'] and names['full']:
            names['short'] = names['full']

        if not names['short']:
            print(f"Missing name for this country in {lang}", countryInfo)
        names['map_usable'] = removeExtraInfo(names['short'])

    for field in sorted(data['specialFields'], key=lambda item: item['name']):
        name = field['name']
        if name in _GLOBAL_FIELDS:
            countryInfo[_GLOBAL_FIELDS[name]] = field['value']
        elif name in _LOCALIZED_FIELDS:
            # Values in a language this script does not know are ignored.
            language = languagesIDToLanguage.get(field['languageId'])
            if language is not None:
                countryInfo[language][_LOCALIZED_FIELDS[name]] = field['value']

    return countryInfo


if __name__ == "__main__":
    main()