111 lines
4.0 KiB
Python
111 lines
4.0 KiB
Python
import json, os, pickle, requests, shelve, sys, time
|
|
|
|
#############################################################################
|
|
# #
|
|
# API calls to obtain taxonomic information. Used in case of name changes. #
|
|
# #
|
|
# See documentation at https://api.inaturalist.org/v1/docs/#/Taxa #
|
|
# #
|
|
# We throttle the number of calls to less than 60 per minute. We also #
|
|
# implement a cache to avoid repeated lookups of the same taxa across runs. #
|
|
# Cache entries include time stamps and they expire after two weeks. #
|
|
# #
|
|
#############################################################################
|
|
|
|
# Base URL that every API request in this module is built on.
API_HOST = "https://api.inaturalist.org/v1"

# Lifetime of a cache entry in seconds: 14 days.
CACHE_EXPIRATION = 14 * 24 * 60 * 60

# Number of seconds to pause after the server answers with error 429.
TOO_MANY_API_CALLS_DELAY = 60
|
|
|
|
# The cache stores the json responses.

# Per-user directory that holds the cache file:
# ~/AppData/Local/inat_api on Windows, ~/.cache/inat_api elsewhere.
if sys.platform == 'win32':
    DATA_DIR = os.path.join(os.path.expanduser('~'),
                            'AppData', 'Local', 'inat_api')
else:
    DATA_DIR = os.path.join(os.path.expanduser('~'), '.cache', 'inat_api')

# exist_ok=True replaces a separate os.path.exists() check, which could fail
# if another process created the directory between the check and the call.
os.makedirs(DATA_DIR, exist_ok=True)

# Persistent, dict-like store that is opened once when the module is imported.
cache = shelve.open(os.path.join(DATA_DIR, 'api.cache'))
|
|
|
|
# API call throttling.
|
|
|
|
class Throttle:
    """Sliding-window limiter for the rate of API calls.

    Remembers when recent calls were made. Once API_MAX_CALLS calls are on
    record, wait() blocks until the oldest of them is at least API_INTERVAL
    seconds old before it lets the next call through.
    """

    API_MAX_CALLS = 60      # max 60 calls per minute
    API_INTERVAL = 60       # 1 minute

    def __init__(self):
        # time.monotonic() readings of the recorded calls, oldest first
        self.callTimes = []

    def wait(self):
        """Block until another API call is allowed, then record that call.

        Sleeps (after printing a notice) while API_MAX_CALLS calls are on
        record and the oldest one is still inside the last API_INTERVAL
        seconds. Returns None.
        """
        # The monotonic clock is used because it cannot jump when the system
        # time is adjusted; a wall-clock jump could otherwise cause a very
        # long sleep or let calls through unthrottled.
        while len(self.callTimes) >= self.API_MAX_CALLS:
            # seconds until the oldest recorded call leaves the window
            waitTime = (self.callTimes[0] + self.API_INTERVAL
                        - time.monotonic())
            if waitTime > 0:
                print('Throttling API calls, '
                      f'sleeping for {waitTime:.1f} seconds.')
                time.sleep(waitTime)
                continue    # re-check in case the sleep ended early
            del self.callTimes[0]
        self.callTimes.append(time.monotonic())
|
|
|
|
# Single module-level throttle; its wait() is called before every API request
# issued by get_taxa_by_id() and get_taxa().
api_call_throttle = Throttle()
|
|
|
|
# argument is an id or a list (or tuple) of id's
def get_taxa_by_id(id):
    """Look up one or more taxa by id, using the cache when possible.

    Args:
        id: a single taxon id, or a list or tuple of ids. Several ids are
            combined into one request.

    Returns:
        The decoded JSON response, taken from the cache if an entry younger
        than CACHE_EXPIRATION seconds exists and fetched from the API
        otherwise. Returns None, after printing the response text, if the
        API answers with a status other than OK.

    While the API answers with "too many requests" the call is retried
    without limit, sleeping TOO_MANY_API_CALLS_DELAY seconds before the first
    retry and twice as long before each further one.
    """
    if isinstance(id, (list, tuple)):
        # '%2C' is a URL-encoded comma separating the ids
        url = API_HOST + '/taxa/' + '%2C'.join(str(i) for i in id)
    else:
        url = API_HOST + f'/taxa/{id}'

    tim = time.time()

    # Read the cache entry once: every access to the shelve goes to disk.
    entry = cache.get(url)
    if entry is not None and entry[0] >= tim - CACHE_EXPIRATION:
        return entry[1]

    delay = TOO_MANY_API_CALLS_DELAY
    headers = {'Content-type': 'application/json'}
    while True:
        api_call_throttle.wait()
        response = requests.get(url, headers=headers)
        if response.status_code != requests.codes.too_many:
            break
        time.sleep(delay)
        delay *= 2

    if response.status_code != requests.codes.ok:
        print(response.text)
        return None

    # The entry is stamped with the time taken before the request was sent.
    data = response.json()
    cache[url] = (tim, data)
    return data
|
|
|
|
# returns taxa by name
def get_taxa(params):
    """Query the /taxa endpoint with `params`, using the cache when possible.

    Args:
        params: dict of query parameters. Values of type bool are sent as
            the lowercase strings 'true' / 'false'. The dict itself is not
            modified.

    Returns:
        The decoded JSON response, taken from the cache if an entry younger
        than CACHE_EXPIRATION seconds exists and fetched from the API
        otherwise. Returns None, after printing the response text, if the
        API answers with a status other than OK.

    While the API answers with "too many requests" the call is retried
    without limit, sleeping TOO_MANY_API_CALLS_DELAY seconds before the first
    retry and twice as long before each further one.
    """
    url = API_HOST + '/taxa'

    # Normalise into a new dict so the caller's dict is left untouched.
    query = {name: ('true' if val else 'false') if isinstance(val, bool)
             else val
             for name, val in params.items()}

    # NOTE: the key is built from the pickled dict, so the same parameters
    # given in a different order produce a different cache entry.
    key = pickle.dumps((url, query)).hex()
    tim = time.time()

    # Read the cache entry once: every access to the shelve goes to disk.
    entry = cache.get(key)
    if entry is not None and entry[0] >= tim - CACHE_EXPIRATION:
        return entry[1]

    delay = TOO_MANY_API_CALLS_DELAY
    headers = {'Content-type': 'application/json'}
    while True:
        api_call_throttle.wait()
        response = requests.get(url, headers=headers, params=query)
        if response.status_code != requests.codes.too_many:
            break
        time.sleep(delay)
        delay *= 2

    if response.status_code != requests.codes.ok:
        print(response.text)
        return None

    # The entry is stamped with the time taken before the request was sent.
    data = response.json()
    cache[key] = (tim, data)
    return data
|
|
|
|
|
|
if __name__ == '__main__':
    # This file is meant to be imported, not run as a script. sys.exit()
    # with a string prints it to stderr and exits with status 1. Unlike an
    # assert, it still fires when Python runs with -O.
    sys.exit('Not a top-level Python module!')
|