goci
goci copied to clipboard
V2 API performance profiling
I ran into some intermittent performance problems using the associations endpoint (V2 API).
Sending multiple queries to the V2 API was causing response times to increase a lot (I'm within the advertised rate limits). Sequential queries were taking 2 seconds, async queries were taking 10+ seconds last week.
Here's a reproducible example with some test data:
# /// script
# dependencies = [
# "httpx"
# ]
# ///
import asyncio
from time import perf_counter, sleep
import httpx
async def fetch_top_association(rsid: str, client: httpx.AsyncClient, timeout=10.0):
    """Return the first association record for *rsid* from the GWAS V2 API.

    Returns None when the variant is unknown (HTTP 404) or the result set
    is empty; other HTTP errors are surfaced via raise_for_status().
    """
    # NOTE(review): direction=desc sorts by p_value descending (largest first) —
    # confirm this is the intended notion of "top" (lowest p-value would be asc).
    query = {
        "rs_id": rsid,
        "sort": "p_value",
        "direction": "desc",
        "page": 0,
        "size": 1,
    }
    response = await client.get(
        "https://www.ebi.ac.uk/gwas/rest/api/v2/associations",
        headers={"accept": "application/json"},
        params=query,
        timeout=timeout,
    )
    if response.status_code == 404:
        return None
    response.raise_for_status()
    payload = response.json()
    hits = payload.get("_embedded", {}).get("associations", [])
    if hits:
        return hits[0]
    return None
async def fetch_top_associations(rsid_to_query: list[str], rate_limit=15):
    """Fetch the top association for each rsID concurrently.

    At most *rate_limit* requests are in flight at once (semaphore), and each
    worker sleeps 1/rate_limit seconds after its request as a simple rate
    limiter. One AsyncClient is shared so connections are reused.

    Returns a dict mapping rsID -> association dict (or None on 404/timeout).
    """
    times = []
    results = {}
    semaphore = asyncio.Semaphore(rate_limit)
    timeout_s = 10
    # create one client shared across all workers
    async with httpx.AsyncClient() as client:

        async def worker(rsid: str):
            async with semaphore:
                start = perf_counter()
                try:
                    data = await fetch_top_association(
                        rsid=rsid, client=client, timeout=timeout_s
                    )
                except httpx.TimeoutException:
                    print(f"{rsid} query timed out ({timeout_s=})")
                    data = None
                end = perf_counter()
                print(f"{rsid} query finished in {end - start:.2f}s")
                times.append(end - start)
                await asyncio.sleep(1 / rate_limit)  # simple rate limit
                return rsid, data

        tasks = [worker(rsid) for rsid in rsid_to_query]
        for rsid, data in await asyncio.gather(*tasks):
            results[rsid] = data
    # Guard against an empty input list (the original raised ZeroDivisionError).
    if times:
        average = sum(times) / len(times)
        print(f"{average:.2f} seconds per query")
    return results
def fetch_top_association_sequential(rsid: str, client: httpx.Client, timeout=10.0):
    """Synchronous twin of fetch_top_association.

    Returns the first association record for *rsid*, or None when the variant
    is unknown (HTTP 404) or no associations are returned.
    """
    query = {
        "rs_id": rsid,
        "sort": "p_value",
        "direction": "desc",
        "page": 0,
        "size": 1,
    }
    response = client.get(
        "https://www.ebi.ac.uk/gwas/rest/api/v2/associations",
        headers={"accept": "application/json"},
        params=query,
        timeout=timeout,
    )
    if response.status_code == 404:
        return None
    response.raise_for_status()
    hits = response.json().get("_embedded", {}).get("associations", [])
    return hits[0] if hits else None
def fetch_top_associations_sequential(rsid_to_query: list[str], rate_limit=15):
    """Fetch the top association for each rsID one request at a time.

    Sleeps 1/rate_limit seconds between requests as a simple rate limiter.
    One Client is shared across the loop so connections are reused.

    Returns a dict mapping rsID -> association dict (or None on 404/timeout).
    """
    results = {}
    timeout_s = 10
    times = []
    with httpx.Client() as client:
        for rsid in rsid_to_query:
            start = perf_counter()
            try:
                data = fetch_top_association_sequential(
                    rsid=rsid, client=client, timeout=timeout_s
                )
            except httpx.TimeoutException:
                print(f"{rsid} query timed out ({timeout_s=})")
                data = None
            end = perf_counter()
            print(f"{rsid} query finished in {end - start:.2f}s")
            times.append(end - start)
            sleep(1 / rate_limit)  # simple rate limit
            results[rsid] = data
    # Guard against an empty input list (the original raised ZeroDivisionError).
    if times:
        average = sum(times) / len(times)
        print(f"{average:.2f} seconds per query")
    return results
if __name__ == "__main__":
    # Load the variant IDs to query, one rsID per line.
    with open("rsids.txt") as handle:
        rsid_list = handle.read().splitlines()
    print("Async test:")
    asyncio.run(fetch_top_associations(rsid_to_query=rsid_list))
    print("Sequential test:")
    fetch_top_associations_sequential(rsid_to_query=rsid_list)
This afternoon the performance is OK with no timeouts:
$ uv run test_api.py
Async test:
...
1.18 seconds per query
Sequential test:
...
0.48 seconds per query