micropip.freeze imports field for PyPI dependencies is insufficient
micropip.freeze currently checks for the non-standard top_level.txt file to fill the imports field. This has two issues:
- When the file is missing, no imports are included
- For namespace packages, the top-level import is just the name of the namespace, but we want the imports field to name the actual package, e.g. namespace.package instead (see the sketch after this list)
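For illustration, a minimal sketch of what the current approach sees (the distribution name is just an example from later in this thread, and whether top_level.txt exists at all depends on the build backend):

import importlib.metadata

# Roughly the information that the non-standard top_level.txt provides
# ("earthkit-data" is one of the namespace distributions discussed below).
dist = importlib.metadata.distribution("earthkit-data")
top_level = dist.read_text("top_level.txt")
# -> None if the build backend never wrote the file (the first issue above),
# -> just the namespace, e.g. "earthkit", rather than "earthkit.data" (the second issue)
print(top_level)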
A brief online search didn't reveal any good solutions:
- https://docs.python.org/3/library/importlib.metadata.html#importlib.metadata.packages_distributions can provide a top-level -> packages mapping for already-installed packages (which is good enough for freezing); this would solve the first issue but not the second
- https://discuss.python.org/t/determining-top-level-import-package-names-programmatically/45055/2 suggests reading the RECORD file to extract all non-special directories. I would suggest going one step further and extracting the common prefix of all non-special directories, so that we could also translate namespace/package into namespace.package. To allow several adjacent namespace packages, perhaps the rule should be to walk down directory prefixes until we find the first __init__.py. Both building blocks are sketched below.
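To make the two building blocks concrete, here is a minimal sketch (the example values in the comments are illustrative and will differ per environment):

import importlib.metadata

# 1) Map of top-level import names to the distributions that provide them;
#    it only covers what is installed on sys.path, which is enough for freezing.
dists = importlib.metadata.packages_distributions()
# e.g. {"yaml": ["PyYAML"], "earthkit": ["earthkit-data", "earthkit-geo", ...], ...}

# 2) The RECORD-derived file list of a single distribution; each entry is a
#    PackagePath whose .parts give the installed path relative to site-packages.
for f in importlib.metadata.files("PyYAML") or []:  # files() may return None
    print(f.parts)  # e.g. ("yaml", "__init__.py"), ("_yaml", "__init__.py"), ...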
Here's what I was able to code up so far:
import importlib.metadata
from pathlib import Path

packages = set(
    v for vs in importlib.metadata.packages_distributions().values() for v in vs
)

for p in sorted(packages):
    files = importlib.metadata.files(p)
    imports = set()
    tree = dict()
    for f in files:
        # ignore special folders
        if Path(f.parts[0]).suffix in [".libs", ".dist-info", ".data"]:
            continue
        # include top-level single-file packages
        if len(f.parts) == 1 and f.suffix == ".py":
            imports.add(f.stem)
            continue
        # build a tree of all other files
        t = tree
        for r in f.parts:
            if t.get(r, None) is None:
                t[r] = dict()
            t = t[r]
    # extract folders that only have folders but no files as children,
    # these are package candidates
    queue = [([k], t) for k, t in tree.items()]
    while len(queue) > 0:
        ps, subtree = queue.pop()
        if len(subtree) == 0:
            continue
        imports.add('.'.join(ps))
        is_package = True
        add_to_queue = []
        for k, t in subtree.items():
            if len(t) == 0:
                is_package = False
            add_to_queue.append((ps + [k], t))
        if is_package:
            queue += add_to_queue
    # remove prefixes from the list
    new_imports = []
    for i in imports:
        if not any(j.startswith(i) for j in imports if j != i):
            new_imports.append(i)
    print(p, sorted(new_imports))
For an environment with earthkit installed, which is both a regular package and a namespace, this prints the following:
Cartopy ['cartopy']
Jinja2 ['jinja2']
Markdown ['markdown']
MarkupSafe ['markupsafe']
Pint ['pint']
PyYAML ['_yaml', 'yaml']
Pygments ['pygments']
adjustText ['adjustText']
affine ['affine']
annotated-types ['annotated_types']
array_api_compat ['array_api_compat']
asttokens ['asttokens']
attrs ['attrs']
cdsapi ['cdsapi']
certifi ['certifi']
cffi ['cffi']
cfgrib ['cf2cdm', 'cfgrib']
cftime ['cftime']
charset-normalizer ['charset_normalizer']
click ['click']
cligj ['cligj']
cloudpickle ['cloudpickle']
comm ['comm']
conflator ['conflator']
contourpy ['contourpy']
covjson-pydantic ['covjson_pydantic']
covjsonkit ['covjsonkit']
cycler ['cycler']
dask ['dask']
datapi ['datapi']
decorator ['decorator']
docstring_parser ['docstring_parser']
earthkit ['earthkit']
earthkit-data ['earthkit.data']
earthkit-geo ['earthkit.geo']
earthkit-meteo ['earthkit.meteo']
earthkit-plots ['earthkit.plots']
earthkit-plots-default-styles ['default']
earthkit-regrid ['earthkit.regrid']
earthkit-time ['earthkit.time']
earthkit-transforms ['earthkit.transforms']
eccodes ['eccodes', 'gribapi']
ecmwf-api-client ['ecmwfapi']
ecmwf-opendata ['ecmwf.opendata']
entrypoints ['entrypoints']
executing ['executing']
filelock ['filelock']
findlibs ['findlibs']
fiona ['fiona']
flexcache ['flexcache']
flexparser ['flexparser']
fonttools ['fontTools']
fsspec ['fsspec']
future ['future', 'libfuturize', 'libpasteurize', 'past']
geopandas ['geopandas']
h5py ['h5py']
hashlib []
hda ['hda']
idna ['idna']
ipykernel ['ipykernel']
ipython ['IPython']
ipywidgets ['ipywidgets']
jedi ['jedi']
jsonschema ['jsonschema']
jsonschema-specifications ['jsonschema_specifications']
jupyterlab_widgets ['jupyterlab_widgets']
jupyterlite-cors ['jupyterlite_cors']
jupyterlite-preload ['jupyterlite_preload']
kiwisolver ['kiwisolver']
locket ['locket']
lru-dict ['lru']
lzma ['lzma']
matplotlib ['matplotlib', 'mpl_toolkits.axes_grid1', 'mpl_toolkits.axisartist', 'mpl_toolkits.mplot3d', 'pylab']
matplotlib-inline ['matplotlib_inline']
matplotlib-pyodide ['matplotlib_pyodide']
micropip ['micropip']
multiurl ['multiurl']
netCDF4 ['netCDF4']
numpy ['numpy']
orjson ['orjson']
packaging ['packaging']
pandas ['pandas']
parso ['parso']
partd ['partd']
pdbufr ['pdbufr']
pillow ['PIL']
piplite ['piplite']
pkgconfig ['pkgconfig']
platformdirs ['platformdirs']
plotly ['_plotly_future_', '_plotly_utils', 'jupyterlab_plotly', 'plotly']
polytope-client ['polytope']
prompt_toolkit ['prompt_toolkit']
pure_eval ['pure_eval']
pyarrow ['pyarrow']
pycparser ['pycparser']
pydantic ['pydantic']
pydantic_core ['pydantic_core']
pydoc_data ['pydoc_data']
pyodc ['pyodc']
pyodide-kernel ['pyodide_kernel']
pyodide-unix-timezones ['unix_timezones']
pyodide_http ['pyodide_http']
pyogrio ['pyogrio']
pyparsing ['pyparsing']
pyproj ['pyproj']
pyrsistent ['_pyrsistent_version', 'pyrsistent']
pyshp ['shapefile']
python-dateutil ['dateutil']
pytz ['pytz']
rasterio ['rasterio']
referencing ['referencing']
requests ['requests']
rich ['rich']
rich-argparse ['rich_argparse']
rpds-py ['rpds']
scipy ['scipy']
setuptools ['_distutils_hack', 'pkg_resources', 'setuptools']
shapely ['shapely']
six ['six']
sqlite3 ['sqlite3']
ssl ['ssl']
stack-data ['stack_data']
toolz ['tlz', 'toolz']
tqdm ['tqdm']
traitlets ['traitlets']
typing_extensions ['typing_extensions']
tzdata ['tzdata']
uncertainties ['uncertainties']
urllib3 ['urllib3']
wcwidth ['wcwidth']
widgetsnbextension ['widgetsnbextension']
xarray ['xarray']
The results seem to be quite good :)
Thanks for opening the issue. Yes, I totally agree that using the top_level.txt file is a bad option (https://github.com/pyodide/pyodide/pull/3006), and we should replace it with other methods.
Combining the two approaches that you mentioned sounds reasonable to me. We also have similar logic (iterating through the package directory and finding Python files) in pyodide-build, so you may want to take a look at that too.
Thank you for these links! I adapted my code a bit further so that it also works for the CPython stdlib modules and for namespace packages. Unfortunately, Pyodide needs better handling of namespace packages than setuptools provides, since just giving the top-level import (which top_level.txt does) is insufficient: when we parse the imports to generate the map from imports to the packages to load, several namespace packages can fight over the top-level name, and you end up in a situation where import namespace pulls in a random sub-package (see the sketch below).
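To make the failure mode concrete, here is a hypothetical sketch of what happens when the import-to-package map is built from top-level names only (the distribution names are the earthkit ones from the listing above; the loop is illustrative, not micropip's actual code):

# If every earthkit-* distribution only declared the top-level name "earthkit",
# the reverse map degenerates and one distribution silently wins:
import_to_package: dict[str, str] = {}
for package, top_level_imports in [
    ("earthkit-data", ["earthkit"]),
    ("earthkit-geo", ["earthkit"]),
    ("earthkit-plots", ["earthkit"]),
]:
    for name in top_level_imports:
        import_to_package[name] = package  # later entries overwrite earlier ones

# {"earthkit": "earthkit-plots"}: "import earthkit" would load an arbitrary sub-package
print(import_to_package)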
Here's what I have now:
import importlib.metadata
from pathlib import Path


def get_imports_for_package(p: str) -> list[str]:
    def valid_package_name(n: str) -> bool:
        return all(invalid_chr not in n for invalid_chr in ".- ")

    imports = set()
    tree = dict()
    # files() may return None when the RECORD metadata file is missing
    for f in importlib.metadata.files(p) or []:
        # ignore special folders
        if Path(f.parts[0]).suffix in [".libs", ".dist-info", ".data"]:
            continue
        # include top-level single-file packages
        if len(f.parts) == 1 and f.suffix in [".py", ".pyc", ".so"]:
            # extension modules have compound suffixes, e.g. foo.cpython-*.so
            stem = f.name.split('.')[0] if f.suffix == ".so" else f.stem
            if valid_package_name(stem):
                imports.add(stem)
            continue
        # build a tree of all other files
        t = tree
        for r in f.parts:
            if t.get(r, None) is None:
                t[r] = dict()
            t = t[r]
    # extract folders that only have folders but no files as children,
    # these are package candidates
    queue = [
        ([k], t) for k, t in tree.items()
        if len(t) > 0 and valid_package_name(k)
    ]
    while len(queue) > 0:
        ps, subtree = queue.pop()
        imports.add('.'.join(ps))
        is_package = True
        add_to_queue = []
        for k, t in subtree.items():
            if len(t) > 0:
                if valid_package_name(k):
                    add_to_queue.append((ps + [k], t))
            else:
                is_package = False
        if is_package:
            queue += add_to_queue
    # remove prefixes from the list
    new_imports = []
    for i in imports:
        if not any(j.startswith(f"{i}.") for j in imports if j != i):
            new_imports.append(i)
    return new_imports
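For reference, the function can be driven with the same loop as in the first snippet (a usage sketch; the expected results match the listing above):

import importlib.metadata

packages = set(
    v for vs in importlib.metadata.packages_distributions().values() for v in vs
)
for p in sorted(packages):
    print(p, sorted(get_imports_for_package(p)))
# e.g. earthkit-data ['earthkit.data'], PyYAML ['_yaml', 'yaml'], eccodes ['eccodes', 'gribapi'], ...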
Thanks @juntyr. Feel free to open a PR when it is ready, then I'll start reviewing the code.