micropip icon indicating copy to clipboard operation
micropip copied to clipboard

micropip.freeze imports field for PyPI dependencies is insufficient

Open juntyr opened this issue 10 months ago • 5 comments

micropip.freeze currently checks for the non-standard top_level.txt file to fill the imports field. This has a few issues:

  1. When the file is missing, no imports are included
  2. For namespace packages, the top-level import is just the name of the namespace, but we want the imports field to name the package, e.g. namespace.package instead

A brief online search didn't reveal any good solutions:

  • https://docs.python.org/3/library/importlib.metadata.html#importlib.metadata.packages_distributions can provide a top-level -> packages mapping for already-imported packages (which is good enough for freezing), which would solve (1) but not (2)
  • https://discuss.python.org/t/determining-top-level-import-package-names-programmatically/45055/2 suggests reading the RECORD file to extract all non-special directories. I would suggest to go even one step further and extract the common prefix for all non-special directories so that we could also translate namespace/package into namespace.package. To allow several adjacent namespace packages, perhaps the rule would be to look for directory prefixes until we find the first __init__.py

juntyr avatar Feb 14 '25 10:02 juntyr

Here's what I was able to code up so far:

import importlib.metadata
from pathlib import Path

packages = set(
    v for vs in importlib.metadata.packages_distributions().values() for v in vs
)

for p in sorted(packages):
    files = importlib.metadata.files(p)

    imports = set()

    tree = dict()

    for f in files:
        # ignore special folders
        if Path(f.parts[0]).suffix in [".libs", ".dist-info", ".data"]:
            continue
        
        # include top-level single-file packages
        if len(f.parts) == 1 and f.suffix == ".py":
            imports.add(f.stem)
            continue

        # build a tree of all other files
        t = tree
        for r in f.parts:
            if t.get(r, None) is None:
                t[r] = dict()
            t = t[r]

    # extract folders that only have folders but no files as children,
    #  these are package candidates
    queue = [([k], t) for k, t in tree.items()]
    while len(queue) > 0:
        ps, tree = queue.pop()

        if len(tree) == 0:
            continue

        imports.add('.'.join(ps))

        is_package = True

        add_to_queue = []

        for k, t in tree.items():
            if len(t) == 0:
                is_package = False
            add_to_queue.append((ps + [k], t))

        if is_package:
            queue += add_to_queue

    # remove prefixes from the list
    new_imports = []
    for i in imports:
        if not any(j.startswith(i) for j in imports if j != i):
            new_imports.append(i)
    
    print(p, sorted(new_imports))

For import earthkit, which is a package and a namespace, this prints the following:

Cartopy ['cartopy']
Jinja2 ['jinja2']
Markdown ['markdown']
MarkupSafe ['markupsafe']
Pint ['pint']
PyYAML ['_yaml', 'yaml']
Pygments ['pygments']
adjustText ['adjustText']
affine ['affine']
annotated-types ['annotated_types']
array_api_compat ['array_api_compat']
asttokens ['asttokens']
attrs ['attrs']
cdsapi ['cdsapi']
certifi ['certifi']
cffi ['cffi']
cfgrib ['cf2cdm', 'cfgrib']
cftime ['cftime']
charset-normalizer ['charset_normalizer']
click ['click']
cligj ['cligj']
cloudpickle ['cloudpickle']
comm ['comm']
conflator ['conflator']
contourpy ['contourpy']
covjson-pydantic ['covjson_pydantic']
covjsonkit ['covjsonkit']
cycler ['cycler']
dask ['dask']
datapi ['datapi']
decorator ['decorator']
docstring_parser ['docstring_parser']
earthkit ['earthkit']
earthkit-data ['earthkit.data']
earthkit-geo ['earthkit.geo']
earthkit-meteo ['earthkit.meteo']
earthkit-plots ['earthkit.plots']
earthkit-plots-default-styles ['default']
earthkit-regrid ['earthkit.regrid']
earthkit-time ['earthkit.time']
earthkit-transforms ['earthkit.transforms']
eccodes ['eccodes', 'gribapi']
ecmwf-api-client ['ecmwfapi']
ecmwf-opendata ['ecmwf.opendata']
entrypoints ['entrypoints']
executing ['executing']
filelock ['filelock']
findlibs ['findlibs']
fiona ['fiona']
flexcache ['flexcache']
flexparser ['flexparser']
fonttools ['fontTools']
fsspec ['fsspec']
future ['future', 'libfuturize', 'libpasteurize', 'past']
geopandas ['geopandas']
h5py ['h5py']
hashlib []
hda ['hda']
idna ['idna']
ipykernel ['ipykernel']
ipython ['IPython']
ipywidgets ['ipywidgets']
jedi ['jedi']
jsonschema ['jsonschema']
jsonschema-specifications ['jsonschema_specifications']
jupyterlab_widgets ['jupyterlab_widgets']
jupyterlite-cors ['jupyterlite_cors']
jupyterlite-preload ['jupyterlite_preload']
kiwisolver ['kiwisolver']
locket ['locket']
lru-dict ['lru']
lzma ['lzma']
matplotlib ['matplotlib', 'mpl_toolkits.axes_grid1', 'mpl_toolkits.axisartist', 'mpl_toolkits.mplot3d', 'pylab']
matplotlib-inline ['matplotlib_inline']
matplotlib-pyodide ['matplotlib_pyodide']
micropip ['micropip']
multiurl ['multiurl']
netCDF4 ['netCDF4']
numpy ['numpy']
orjson ['orjson']
packaging ['packaging']
pandas ['pandas']
parso ['parso']
partd ['partd']
pdbufr ['pdbufr']
pillow ['PIL']
piplite ['piplite']
pkgconfig ['pkgconfig']
platformdirs ['platformdirs']
plotly ['_plotly_future_', '_plotly_utils', 'jupyterlab_plotly', 'plotly']
polytope-client ['polytope']
prompt_toolkit ['prompt_toolkit']
pure_eval ['pure_eval']
pyarrow ['pyarrow']
pycparser ['pycparser']
pydantic ['pydantic']
pydantic_core ['pydantic_core']
pydoc_data ['pydoc_data']
pyodc ['pyodc']
pyodide-kernel ['pyodide_kernel']
pyodide-unix-timezones ['unix_timezones']
pyodide_http ['pyodide_http']
pyogrio ['pyogrio']
pyparsing ['pyparsing']
pyproj ['pyproj']
pyrsistent ['_pyrsistent_version', 'pyrsistent']
pyshp ['shapefile']
python-dateutil ['dateutil']
pytz ['pytz']
rasterio ['rasterio']
referencing ['referencing']
requests ['requests']
rich ['rich']
rich-argparse ['rich_argparse']
rpds-py ['rpds']
scipy ['scipy']
setuptools ['_distutils_hack', 'pkg_resources', 'setuptools']
shapely ['shapely']
six ['six']
sqlite3 ['sqlite3']
ssl ['ssl']
stack-data ['stack_data']
toolz ['tlz', 'toolz']
tqdm ['tqdm']
traitlets ['traitlets']
typing_extensions ['typing_extensions']
tzdata ['tzdata']
uncertainties ['uncertainties']
urllib3 ['urllib3']
wcwidth ['wcwidth']
widgetsnbextension ['widgetsnbextension']
xarray ['xarray']

The results seem to be quite good :)

juntyr avatar Feb 14 '25 13:02 juntyr

Thanks for opening the issue. Yes, I totally agree that using top_level.txt file is a bad option (https://github.com/pyodide/pyodide/pull/3006), and we should replace it with other methods.

Combining the two approaches that you mentioned sounds reasonable to me. We also have a similar logic (iterating through the package directory and finding Python files) in pyodide-build, so probably you can take a look too.

ryanking13 avatar Feb 15 '25 07:02 ryanking13

Thank you for these links! I further adapted my code a bit so that it works for the CPython modules and for namespace packages. Unfortunately, Pyodide needs better handling of namespace packages than setuptools, since just giving the top level import (which top_level.txt does) is insufficient: when we parse the imports to generate the map from imports to packages to load, several namespace packages can fight over the top-level and you end up in a situation where import namespace imports a random sub-package.

juntyr avatar Feb 17 '25 08:02 juntyr

Here's what I have now:

def get_imports_for_package(p: str) -> list[str]:
    def valid_package_name(n: str) -> bool:
        return all(invalid_chr not in n for invalid_chr in ".- ")

    imports = set()

    tree = dict()
    for f in importlib.metadata.files(p):
        # ignore special folders
        if Path(f.parts[0]).suffix in [".libs", ".dist-info", ".data"]:
            continue

        # include top-level single-file packages
        if len(f.parts) == 1 and f.suffix in [".py", ".pyc", ".so"]:
            stem = f.name.split('.')[0] if f.suffix == ".so" else f.stem
            if valid_package_name(stem):
                imports.add(stem)
                continue

        # build a tree of all other files
        t = tree
        for r in f.parts:
            if t.get(r, None) is None:
                t[r] = dict()
            t = t[r]

    # extract folders that only have folders but no files as children,
    #  these are package candidates
    queue = [
        ([k], t) for k, t in tree.items()
        if len(t) > 0 and valid_package_name(k)
    ]
    while len(queue) > 0:
        ps, tree = queue.pop()
        imports.add('.'.join(ps))

        is_package = True

        add_to_queue = []
        for k, t in tree.items():
            if len(t) > 0:
                if valid_package_name(k):
                    add_to_queue.append((ps + [k], t))
            else:
                is_package = False

        if is_package:
            queue += add_to_queue

    # remove prefixes from the list
    new_imports = []
    for i in imports:
        if not any(j.startswith(f"{i}.") for j in imports if j != i):
            new_imports.append(i)

    return new_imports

juntyr avatar Feb 17 '25 08:02 juntyr

Thanks @juntyr. Feel free to open a PR when it is ready, then I'll start reviewing the code.

ryanking13 avatar Feb 18 '25 07:02 ryanking13