Fix bundling of .so files (#34)
rth authored Oct 8, 2023
1 parent 506921f commit 85630db
Showing 7 changed files with 254 additions and 127 deletions.
11 changes: 1 addition & 10 deletions examples/test_examples.py
@@ -12,16 +12,7 @@ def gen_all_examples():
for path in Path("examples").glob("*"):
if path.is_dir() and (path / "requirements.txt").exists():
path = path.resolve()
if path.name in ["scikit-learn", "scipy"]:
yield pytest.param(
path,
marks=pytest.mark.xfail(
reason="Known issue with .so loading in scipy"
),
id=path.name,
)
else:
yield pytest.param(path, id=path.name)
yield pytest.param(path, id=path.name)


BASE_DIR = Path(__file__).parents[1]
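With the .so bundling fixed, the scikit-learn and scipy examples no longer need an xfail escape hatch. For reference, a minimal sketch of the pytest.param/xfail pattern this hunk removes (module and path names are illustrative, not from the repository):

import pytest

def gen_examples():
    # Before this commit: known-broken examples carried an xfail marker
    yield pytest.param(
        "examples/scipy",
        marks=pytest.mark.xfail(reason="Known issue with .so loading in scipy"),
        id="scipy",
    )
    # After: every example is expected to pass, so a plain param suffices
    yield pytest.param("examples/numpy", id="numpy")

@pytest.mark.parametrize("example_path", gen_examples())
def test_example(example_path):
    assert example_path  # placeholder; the real test packs and runs the example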
6 changes: 3 additions & 3 deletions pyodide_pack/_utils.py
@@ -45,7 +45,7 @@ def match_suffix(file_paths: list[str], suffix: str) -> str | None:
# Adapted from pyodide conftest.py


def run_web_server(q, log_filepath, dist_dir):
def run_web_server(q: multiprocessing.Queue, log_filepath, dist_dir):
"""Start the HTTP web server
Parameters
@@ -79,7 +79,7 @@ def end_headers(self):

with socketserver.TCPServer(("", 0), Handler) as httpd:
host, port = httpd.server_address
print(f"Starting webserver at http://{host}:{port}")
print(f"Starting webserver at http://{host}:{port}") # type: ignore[str-bytes-safe]
httpd.server_name = "test-server" # type: ignore[attr-defined]
httpd.server_port = port # type: ignore[attr-defined]
q.put(port)
@@ -88,7 +88,7 @@ def end_headers(self):


@contextmanager
def spawn_web_server(dist_dir):
def spawn_web_server(dist_dir: str):
tmp_dir = tempfile.mkdtemp()
log_path = Path(tmp_dir) / "http-server.log"
q: multiprocessing.Queue[str] = multiprocessing.Queue()
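The server runs in a child process and reports its OS-assigned port back through a multiprocessing.Queue. A condensed, self-contained sketch of that handoff (simplified from run_web_server above, which additionally redirects logs to log_filepath):

import multiprocessing
from functools import partial
from http.server import SimpleHTTPRequestHandler
from socketserver import TCPServer

def serve(q: multiprocessing.Queue, dist_dir: str) -> None:
    handler = partial(SimpleHTTPRequestHandler, directory=dist_dir)
    # Port 0 lets the OS pick any free port; tell the parent which one it chose.
    with TCPServer(("", 0), handler) as httpd:
        q.put(httpd.server_address[1])
        httpd.serve_forever()

if __name__ == "__main__":
    q: multiprocessing.Queue = multiprocessing.Queue()
    p = multiprocessing.Process(target=serve, args=(q, "."), daemon=True)
    p.start()
    port = q.get(timeout=10)  # blocks until the child has bound its socket
    print(f"Serving at http://localhost:{port}")
    p.terminate()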
97 changes: 14 additions & 83 deletions pyodide_pack/cli.py
@@ -1,7 +1,4 @@
import fnmatch
import gzip
import json
import os
import shutil
import sys
import zipfile
@@ -16,13 +13,11 @@

from pyodide_pack._utils import (
_get_packages_from_lockfile,
match_suffix,
spawn_web_server,
)
from pyodide_pack.archive import ArchiveFile
from pyodide_pack.dynamic_lib import DynamicLib
from pyodide_pack.runners.node import NodeRunner
from pyodide_pack.runtime_detection import RuntimeResults
from pyodide_pack.runtime_detection import PackageBundler, RuntimeResults

ROOT_DIR = Path(__file__).parents[1]

@@ -52,7 +47,9 @@ def main(
applications.
"""
console = Console()
console.print(f"Running [bold]pyodide pack[/bold] on [bold]{example_path}[/bold]")
console.print(
f"Running [bold]pyodide pack[/bold] on [bold]{example_path}[/bold] in Node.js"
)

if requirement_path is None:
requirement_path = example_path.parent / "requirements.txt"
@@ -75,7 +72,6 @@ def main(
code=code, packages=requirements, output_path="results.json"
)
with NodeRunner(js_template_path, ROOT_DIR, **js_template_kwargs) as runner:
console.print("Running the input code in Node.js to detect used modules..\n")
t0 = perf_counter()
runner.run()
console.print(
@@ -85,7 +81,7 @@
db = RuntimeResults.from_json(runner.tmp_path / "results.json")

if write_debug_map:
Path("./debug-map.json").write_text(json.dumps(db, indent=2))
db.to_json(Path("./debug-map.json"))

package_dir = ROOT_DIR / "node_modules" / "pyodide"

@@ -101,23 +97,13 @@ def main(
stdlib_archive = ArchiveFile(package_dir / "python_stdlib.zip", name="stdlib")
stdlib_stripped_path = Path("python_stdlib_stripped.zip")

console.print(
f"Using stdlib ({len(stdlib_archive.namelist())} files) with a total size "
f"of {stdlib_archive.total_size(compressed=True)/1e6:.2f} MB."
)

packages_size = sum(el.total_size(compressed=False) for el in packages.values())
packages_size_gzip = sum(el.total_size(compressed=True) for el in packages.values())
console.print(
f"Detected [bold]{len(packages)}[/bold] dependencies with a "
f"total size of {packages_size_gzip/1e6:.2f} MB "
f"(uncompressed: {packages_size/1e6:.2f} MB)"
)
if db["opened_file_names"]:
console.print(
f"In total {len(db['opened_file_names'])} files and "
f"{len(db['find_object_calls'])} dynamic libraries were accessed."
)
total_initial_size = packages_size_gzip + stdlib_archive.total_size(compressed=True)
console.print(
f"Total initial size (stdlib + dependencies): {total_initial_size/1e6:.2f} MB"
@@ -140,11 +126,7 @@ def main(
) as fh_out, Live(table) as live:
imported_paths = db.get_imported_paths(strip_prefix=db.stdlib_prefix)
stdlib_archive_stripped = stdlib_archive.filter_to_zip(
stdlib_stripped_path,
# Include imported stdlib modules and all pyodide modules
# Some modules are used when loading the bundle (e.g. json)
func=lambda name: name in imported_paths
or any(prefix in name for prefix in ["pyodide", "json", "cp437"]),
stdlib_stripped_path, func=lambda name: name in imported_paths
)

msg_0 = "0"
@@ -162,68 +144,17 @@ def main(
# Sort keys for reproducibility
in_file_names = sorted(ar.namelist())

stats = {
"py_in": 0,
"so_in": 0,
"py_out": 0,
"so_out": 0,
"fh_out": 0,
"size_out": 0,
"size_gzip_out": 0,
}
bundler = PackageBundler(db)
for in_file_name in in_file_names:
match Path(in_file_name).suffix:
case ".py":
stats["py_in"] += 1
case ".so":
stats["so_in"] += 1

out_file_name = None
if out_file_name := match_suffix(db["opened_file_names"], in_file_name):
stats["py_out"] += 1
elif out_file_name := match_suffix(
list(db["dynamic_libs_map"].keys()), in_file_name
):
stats["so_out"] += 1
# Get the dynamic library path while preserving order
# also determine if it's a shared library or not from
# the given package
dll = db["dynamic_libs_map"][out_file_name]
dll.shared = pyodide_lock.packages[ar.name].sharedlibrary
dynamic_libs.append(dll)

if (
out_file_name is None
and include_paths is not None
and any(
fnmatch.fnmatch(in_file_name, pattern)
for pattern in include_paths.split(",")
)
):
# TODO: this is hack and should be done better
out_file_name = os.path.join(
"/lib/python3.10/site-utils", in_file_name
)
match Path(in_file_name).suffix:
case ".py":
stats["py_out"] += 1
case ".so":
stats["so_out"] += 1
# Manually included dynamic libraries are going to be loaded first
dll = DynamicLib(out_file_name, load_order=-1000)
dll.shared = pyodide_lock.packages[ar.name].sharedlibrary
dynamic_libs.append(dll)
out_file_name = bundler.process_path(in_file_name)

if out_file_name is not None:
stats["fh_out"] += 1
stream = ar.read(in_file_name)
if stream is not None:
stats["size_out"] += len(stream)
stats["size_gzip_out"] += len(gzip.compress(stream))
# File paths starting with / fail to get correctly extracted
# in extract_archive in Pyodide
with fh_out.open(out_file_name.lstrip("/"), "w") as fh:
fh.write(stream)
bundler.copy_between_zip_files(
in_file_name, out_file_name, ar, fh_out
)
dynamic_libs.extend(bundler.dynamic_libs)

stats = bundler.stats

msg_0 = f"{idx+1}"
msg_1 = ar.file_path.name
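The bulk of this file's change replaces the inline suffix-matching and stat-keeping with a PackageBundler helper. Pieced together from only the calls visible in this diff (the real class lives in pyodide_pack/runtime_detection.py and may differ in detail), the per-package loop reduces to roughly:

# Sketch inferred from the calls shown in this diff, not the actual class.
bundler = PackageBundler(db)
for in_file_name in sorted(ar.namelist()):
    # Maps an archive member to its destination in the bundle, or None to
    # drop it; matching against opened files and dynamic libraries happens inside.
    out_file_name = bundler.process_path(in_file_name)
    if out_file_name is not None:
        bundler.copy_between_zip_files(in_file_name, out_file_name, ar, fh_out)

dynamic_libs.extend(bundler.dynamic_libs)  # .so files found, in load order
stats = bundler.stats  # counts of .py/.so files in/out plus output sizes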
123 changes: 100 additions & 23 deletions pyodide_pack/js/discovery.js
@@ -1,23 +1,94 @@

async function main() {
const { loadPyodide } = require("pyodide");
let fs = await import("fs");

let pyodide = await loadPyodide()
let file_list = [];
const open_orig = pyodide._module.FS.open;
// Monkeypatch FS.open
function patchFSopen(pyodide, fileList) {
// Record FS.open
// Note: we can't use FS.trackingDelegate since we want this to work without
// -sFS_DEBUG
const openOrig = pyodide._module.FS.open;
pyodide._module.FS.open = function (path, flags, mode, fd_start, fd_end) {
// Read-only flag is even
// https://github.com/emscripten-core/emscripten/blob/e8f25f84933a7973ad1a4e32084a8bf169d67d35/tests/fs/test_trackingdelegate.c#L18
// Here we only keep files in read mode.
if (flags % 2 == 0) {
file_list.push(path);
fileList.push(path);
}
return open_orig(path, flags, mode, fd_start, fd_end);
return openOrig(path, flags, mode, fd_start, fd_end);
};
}


function patchLoadDynLib(pyodide, loadDynlibCalls) {
// Record loadDynlib calls
const loadDynlibOrig = pyodide._module.loadDynamicLibrary;
pyodide._module.loadDynamicLibrary = function(libName, flags, localScope, handle) {
loadDynlibCalls.push({path: libName, global: flags.global});
return loadDynlibOrig(libName, flags, localScope, handle)
}
}


function patchSymbolAccess(pyodide, accessedSymbols) {
// Record accessed symbols in pyodide._module.LDSO
function wrapSym(symName, sym, libName) {
if (!sym || typeof sym == "number") {
return sym;
}
return new Proxy(sym, {
get(sym, attr) {
if (attr === "stub") {
if (!(libName in accessedSymbols)) {
accessedSymbols[libName] = new Set();
}
accessedSymbols[libName].add(symName);
}
return Reflect.get(sym, attr);
},
});
}

function wrapExports(exports, libName) {
if (typeof exports !== "object") {
return exports;
}
return new Proxy(exports, {
get(exports, symName) {
const sym = Reflect.get(exports, symName);
if (libName in accessedSymbols && accessedSymbols[libName].has(symName)) {
return sym;
}
return wrapSym(symName, sym, libName);
},
});
}

function wrapLib(lib, libName) {
return new Proxy(lib, {
get(lib, sym) {
return wrapExports(Reflect.get(lib, sym), libName);
},
});
}
const origLoadedLibs = pyodide._module.LDSO.loadedLibsByName;
pyodide._module.LDSO.loadedLibsByName = new Proxy(origLoadedLibs, {
set(libsByName, libName, lib) {
return Reflect.set(libsByName, libName, wrapLib(lib, libName));
},
});
}


async function main() {
const { loadPyodide } = require("pyodide");
let fs = await import("fs");

let pyodide = await loadPyodide()
let fileList = [];
patchFSopen(pyodide, fileList);

let loadDynlibCalls = [];
patchLoadDynLib(pyodide, loadDynlibCalls);

var accessedSymbols = new Object();
patchSymbolAccess(pyodide, accessedSymbols);


try {
await pyodide.loadPackage({{packages}});
@@ -28,27 +99,33 @@ async function main() {
await micropip.install({{packages}});
}

// Monkeypatching findObject calls used in dlopen
let findObjectCalls = [];
const findObject_orig = pyodide._module.FS.findObject;
pyodide._module.FS.findObject = function(path, dontResolveLastLink) {
findObjectCalls.push(path);
return findObject_orig(path, dontResolveLastLink);
}
await pyodide.runPythonAsync(`
{{ code }}
`);
// Run code used in the loader
await pyodide.runPythonAsync(`
import pyodide.http
`);
// Look for loaded modules. That's the only way to access imported stdlib from the zipfile.
let sysModules = pyodide.runPython(
"import sys; {name: getattr(mod, '__file__', None) for name, mod in sys.modules.items()}"
).toJs({dict_converter : Object.fromEntries});

// Convert accessedSymbols from Set to Array, so it can be serialized
const accessedSymbolsOut = new Object();
for (const libName in accessedSymbols) {
accessedSymbolsOut[libName] = Array.from(accessedSymbols[libName]);
}

// writing the list of accessed files to disk
var obj = new Object();
obj.opened_file_names = file_list;
obj.loaded_packages = pyodide.loadedPackages;
obj.find_object_calls = findObjectCalls;
obj.sys_modules = sysModules;
var obj = {
opened_file_names: fileList,
loaded_packages: pyodide.loadedPackages,
load_dyn_lib_calls: loadDynlibCalls,
sys_modules: sysModules,
LDSO_loaded_libs_by_handle: pyodide._module.LDSO['loadedLibsByHandle'],
dl_accessed_symbols: accessedSymbolsOut,
};
if ("micropip" in pyodide.loadedPackages) {
obj.pyodide_lock = pyodide.pyimport("micropip").freeze();
}
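All three patches in discovery.js follow the same record-and-delegate shape: keep a reference to the original function, log the access, then forward the call unchanged. A Python analog of that idea, applied to builtins.open (this wrapper is an illustration, not part of pyodide-pack):

import builtins

opened_files: list[str] = []
_open_orig = builtins.open

def _open_traced(file, mode="r", *args, **kwargs):
    # Like the FS.open patch above, only record files opened for reading.
    if "r" in mode and "+" not in mode:
        opened_files.append(str(file))
    return _open_orig(file, mode, *args, **kwargs)

builtins.open = _open_traced
try:
    with open(__file__) as fh:  # routed through the wrapper
        fh.read(1)
finally:
    builtins.open = _open_orig  # always restore the original

print(opened_files)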
2 changes: 1 addition & 1 deletion pyodide_pack/loader/pyodide_pack_loader.py
@@ -7,4 +7,4 @@ async def setup():

for paths in Path("/bundle-so-list.txt").read_text().splitlines():
path, is_shared = paths.split(",")
await _module._api.loadDynlib(path, bool(is_shared))
await _module.API.loadDynlib(path, bool(is_shared))
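The loader expects /bundle-so-list.txt to hold one "path,is_shared" pair per line, in load order. A hypothetical sketch of the writer side, matching this reader (the DynamicLib attribute names are assumed from the constructor call in the cli.py diff above; the actual writer is not shown in this commit):

from pathlib import Path

def write_so_list(dynamic_libs, out_path: Path) -> None:
    # One "path,is_shared" line per library, sorted by load order.
    # N.B. the reader calls bool(is_shared), which is True for any non-empty
    # second field, so an empty string would be needed to mark a non-shared lib.
    lines = [
        f"{dll.path},{dll.shared}"
        for dll in sorted(dynamic_libs, key=lambda d: d.load_order)
    ]
    out_path.write_text("\n".join(lines))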