I upped that call count to 183 to cover the entire ISO-639-1 code space, which yielded 57 valid abbreviations and their languages:
['af', 'ar', 'hy', 'az', 'be', 'bs', 'bg', 'ca', 'zh', 'hr', 'cs', 'da', 'nl', 'en', 'et', 'fi', 'fr', 'gl', 'de', 'el', 'he', 'hi', 'hu', 'is', 'id', 'it', 'ja', 'kn', 'kk', 'ko', 'lv', 'lt', 'mk', 'ms', 'mi', 'mr', 'ne', 'no', 'fa', 'pl', 'pt', 'ro', 'ru', 'sr', 'sk', 'sl', 'es', 'sw', 'sv', 'tl', 'ta', 'th', 'tr', 'uk', 'ur', 'vi', 'cy']
['afrikaans', 'arabic', 'armenian', 'azerbaijani', 'belarusian', 'bosnian', 'bulgarian', 'catalan', 'chinese', 'croatian', 'czech', 'danish', 'dutch', 'english', 'estonian', 'finnish', 'french', 'galician', 'german', 'greek', 'hebrew', 'hindi', 'hungarian', 'icelandic', 'indonesian', 'italian', 'japanese', 'kannada', 'kazakh', 'korean', 'latvian', 'lithuanian', 'macedonian', 'malay', 'maori', 'marathi', 'nepali', 'norwegian', 'persian', 'polish', 'portuguese', 'romanian', 'russian', 'serbian', 'slovak', 'slovenian', 'spanish', 'swahili', 'swedish', 'tagalog', 'tamil', 'thai', 'turkish', 'ukrainian', 'urdu', 'vietnamese', 'welsh']
So then, here is a lookup tool that works in either direction:
# Language name -> ISO-639-1 code for the 57 languages supported by
# OpenAI whisper-1 (as of 2024-02). Built once at import time.
_ISO639_BY_NAME = {
    'afrikaans': 'af', 'arabic': 'ar', 'armenian': 'hy',
    'azerbaijani': 'az', 'belarusian': 'be', 'bosnian': 'bs',
    'bulgarian': 'bg', 'catalan': 'ca', 'chinese': 'zh',
    'croatian': 'hr', 'czech': 'cs', 'danish': 'da',
    'dutch': 'nl', 'english': 'en', 'estonian': 'et',
    'finnish': 'fi', 'french': 'fr', 'galician': 'gl',
    'german': 'de', 'greek': 'el', 'hebrew': 'he',
    'hindi': 'hi', 'hungarian': 'hu', 'icelandic': 'is',
    'indonesian': 'id', 'italian': 'it', 'japanese': 'ja',
    'kannada': 'kn', 'kazakh': 'kk', 'korean': 'ko',
    'latvian': 'lv', 'lithuanian': 'lt', 'macedonian': 'mk',
    'malay': 'ms', 'maori': 'mi', 'marathi': 'mr',
    'nepali': 'ne', 'norwegian': 'no', 'persian': 'fa',
    'polish': 'pl', 'portuguese': 'pt', 'romanian': 'ro',
    'russian': 'ru', 'serbian': 'sr', 'slovak': 'sk',
    'slovenian': 'sl', 'spanish': 'es', 'swahili': 'sw',
    'swedish': 'sv', 'tagalog': 'tl', 'tamil': 'ta',
    'thai': 'th', 'turkish': 'tr', 'ukrainian': 'uk',
    'urdu': 'ur', 'vietnamese': 'vi', 'welsh': 'cy',
}

# Inverse table (code -> language name). Every code above is unique, so the
# inversion loses nothing and replaces a linear scan with a dict lookup.
_ISO639_BY_CODE = {code: name for name, code in _ISO639_BY_NAME.items()}


def iso639_lookup(lang: str, reverse: bool = None, **junk) -> str:
    """
    OpenAI whisper ISO-639-1 language code utility or compatibility - 2024-02

    Input is matched case-insensitively and with surrounding whitespace
    ignored, in both lookup directions.

    :param lang: The language name or ISO-639-1 code to look up.
    :param reverse: If True, find the language name from the ISO-639-1 code.
        If False or None, find the ISO-639-1 code from the language name.
    :param junk: Extra keyword arguments are accepted and ignored.
    :return: The ISO-639-1 code or language name if found, otherwise None.
    :raises ValueError: If ``reverse`` is true and ``lang`` (after stripping
        whitespace) is not exactly two letters.
    """
    # Normalise once, up front, so validation and lookup see the same text.
    # Validating the raw string would reject padded codes such as " th ".
    normalized = lang.strip().lower()

    if reverse:
        if len(normalized) != 2 or not normalized.isalpha():
            raise ValueError("ISO-639-1 abbreviation must be len=2 letters")
        return _ISO639_BY_CODE.get(normalized)  # None if the code is not found

    return _ISO639_BY_NAME.get(normalized)  # None for unmatched names
if __name__ == "__main__":  # example usage
    lang = "Thai"   # your input: a language name (or a code when reverse is True)
    # Explicit default instead of probing locals() for a commented-out name;
    # set to True to find the language name from an ISO-639-1 code.
    reverse = None
    iso639_out = iso639_lookup(lang, reverse)
    if iso639_out:
        print(iso639_out)
    else:
        print("No ISO-639 language match was found or returned.")