goci
goci copied to clipboard
V2 API performance profiling
I ran into some intermittent performance problems using the associations endpoint (V2 API).
Sending multiple queries to the V2 API was causing response times to increase a lot (I'm within the advertised rate limits). Sequential queries were taking 2 seconds, async queries were taking 10+ seconds last week.
Here's a reproducible example with some test data:
# /// script
# dependencies = [
# "httpx"
# ]
# ///
import asyncio
from time import perf_counter, sleep
import httpx
async def fetch_top_association(rsid: str, client: httpx.AsyncClient, timeout=10.0):
    """Return the first association record for *rsid* from the GWAS V2 API.

    Returns None when the variant is unknown (HTTP 404) or the result set
    is empty; other HTTP errors are surfaced via raise_for_status().
    """
    # NOTE(review): direction=desc sorts by p_value descending (largest first) —
    # confirm this is the intended notion of "top" (lowest p-value would be asc).
    query = {
        "rs_id": rsid,
        "sort": "p_value",
        "direction": "desc",
        "page": 0,
        "size": 1,
    }
    response = await client.get(
        "https://www.ebi.ac.uk/gwas/rest/api/v2/associations",
        headers={"accept": "application/json"},
        params=query,
        timeout=timeout,
    )
    if response.status_code == 404:
        return None
    response.raise_for_status()
    payload = response.json()
    hits = payload.get("_embedded", {}).get("associations", [])
    if hits:
        return hits[0]
    return None
async def fetch_top_associations(rsid_to_query: list[str], rate_limit=15):
    """Fetch the top association for each rsID concurrently.

    At most *rate_limit* requests are in flight at once (semaphore), and each
    worker sleeps 1/rate_limit seconds after its request as a simple rate
    limiter. One AsyncClient is shared so connections are reused.

    Returns a dict mapping rsID -> association dict (or None on 404/timeout).
    """
    times = []
    results = {}
    semaphore = asyncio.Semaphore(rate_limit)
    timeout_s = 10
    # create one client shared across all workers
    async with httpx.AsyncClient() as client:

        async def worker(rsid: str):
            async with semaphore:
                start = perf_counter()
                try:
                    data = await fetch_top_association(
                        rsid=rsid, client=client, timeout=timeout_s
                    )
                except httpx.TimeoutException:
                    print(f"{rsid} query timed out ({timeout_s=})")
                    data = None
                end = perf_counter()
                print(f"{rsid} query finished in {end - start:.2f}s")
                times.append(end - start)
                await asyncio.sleep(1 / rate_limit)  # simple rate limit
                return rsid, data

        tasks = [worker(rsid) for rsid in rsid_to_query]
        for rsid, data in await asyncio.gather(*tasks):
            results[rsid] = data
    # Guard against an empty input list (the original raised ZeroDivisionError).
    if times:
        average = sum(times) / len(times)
        print(f"{average:.2f} seconds per query")
    return results
def fetch_top_association_sequential(rsid: str, client: httpx.Client, timeout=10.0):
    """Synchronous twin of fetch_top_association.

    Returns the first association record for *rsid*, or None when the variant
    is unknown (HTTP 404) or no associations are returned.
    """
    query = {
        "rs_id": rsid,
        "sort": "p_value",
        "direction": "desc",
        "page": 0,
        "size": 1,
    }
    response = client.get(
        "https://www.ebi.ac.uk/gwas/rest/api/v2/associations",
        headers={"accept": "application/json"},
        params=query,
        timeout=timeout,
    )
    if response.status_code == 404:
        return None
    response.raise_for_status()
    hits = response.json().get("_embedded", {}).get("associations", [])
    return hits[0] if hits else None
def fetch_top_associations_sequential(rsid_to_query: list[str], rate_limit=15):
    """Fetch the top association for each rsID one request at a time.

    Sleeps 1/rate_limit seconds between requests as a simple rate limiter.
    One Client is shared across the loop so connections are reused.

    Returns a dict mapping rsID -> association dict (or None on 404/timeout).
    """
    results = {}
    timeout_s = 10
    times = []
    with httpx.Client() as client:
        for rsid in rsid_to_query:
            start = perf_counter()
            try:
                data = fetch_top_association_sequential(
                    rsid=rsid, client=client, timeout=timeout_s
                )
            except httpx.TimeoutException:
                print(f"{rsid} query timed out ({timeout_s=})")
                data = None
            end = perf_counter()
            print(f"{rsid} query finished in {end - start:.2f}s")
            times.append(end - start)
            sleep(1 / rate_limit)  # simple rate limit
            results[rsid] = data
    # Guard against an empty input list (the original raised ZeroDivisionError).
    if times:
        average = sum(times) / len(times)
        print(f"{average:.2f} seconds per query")
    return results
if __name__ == "__main__":
    # Load the variant IDs to query, one rsID per line.
    with open("rsids.txt") as handle:
        rsid_list = handle.read().splitlines()
    print("Async test:")
    asyncio.run(fetch_top_associations(rsid_to_query=rsid_list))
    print("Sequential test:")
    fetch_top_associations_sequential(rsid_to_query=rsid_list)
This afternoon the performance is OK with no timeouts:
$ uv run test_api.py
Async test:
...
1.18 seconds per query
Sequential test:
...
0.48 seconds per query