diff --git a/examples/joblib/README.md b/examples/joblib/README.md
new file mode 100644
index 0000000..35de97c
--- /dev/null
+++ b/examples/joblib/README.md
@@ -0,0 +1,18 @@
+# joblib Examples
+
+Each sub-directory contains a self-contained example. The order in
+which the examples are to appear is specified in `order.json` (an
+array of directory names in the expected order).
+
+In each example directory you'll find:
+
+* `config.toml` - must conform to the specification outlined here:
+  https://docs.pyscript.net/latest/user-guide/configuration/ This is
+  parsed and ultimately turned into a JSON representation as part of
+  the package's API object.
+* `setup.py` - Python code for contextual and environmental setup,
+  NOT SEEN BY THE END USER, but is run before the `code.py` code is
+  evaluated. Allows us to create useful (IPython) shims, avoid
+  repeating boilerplate and whatnot.
+* `code.py` - the actual code added to the editor which forms the
+  practical example of using the package.
diff --git a/examples/joblib/dump_and_load/code.py b/examples/joblib/dump_and_load/code.py
new file mode 100644
index 0000000..4f821f7
--- /dev/null
+++ b/examples/joblib/dump_and_load/code.py
@@ -0,0 +1,66 @@
+# ---------------------------------------------------------------------
+# joblib.dump / joblib.load: serialize Python objects (especially
+# ones containing large NumPy arrays) to a single file. This is the
+# canonical way to save trained models or precomputed datasets.
+#
+# See https://joblib.readthedocs.io/en/stable/persistence.html
+# ---------------------------------------------------------------------
+
+import os
+import numpy as np
+import joblib
+
+rng = np.random.default_rng(7)
+
+
+heading("Serializing objects with joblib.dump and joblib.load")
+note(
+    "We'll build a small dictionary holding metadata and a couple "
+    "of NumPy arrays, save it to a file, then load it back and "
+    "compare. joblib.dump handles large arrays "
+    "efficiently and supports compression."
+)
+
+# A made-up "model artifact": some metadata plus learned parameters.
+artifact = {
+    "name": "linear-regressor",
+    "version": 3,
+    "feature_names": ["temperature", "humidity", "wind_speed"],
+    "weights": rng.normal(size=(3,)),
+    "training_samples": rng.normal(size=(1000, 3)),
+}
+
+# Save uncompressed and with compression to compare file sizes.
+joblib.dump(artifact, "artifact.joblib")
+joblib.dump(artifact, "artifact.joblib.gz", compress=("gzip", 3))
+
+uncompressed_size = os.path.getsize("artifact.joblib")
+compressed_size = os.path.getsize("artifact.joblib.gz")
+note(
+    f"Uncompressed file: {uncompressed_size:,} bytes. "
+    f"Gzip-compressed (level 3): {compressed_size:,} bytes."
+)
+
+# Load the artifact back. joblib auto-detects the compression.
+restored = joblib.load("artifact.joblib.gz")
+
+note(
+    f"Restored name: {restored['name']}, "
+    f"version {restored['version']}, "
+    f"features: {restored['feature_names']}."
+)
+
+weights_match = np.array_equal(restored["weights"], artifact["weights"])
+samples_match = np.array_equal(
+    restored["training_samples"], artifact["training_samples"],
+)
+note(
+    f"Weights round-tripped exactly: {weights_match}. "
+    f"Training samples round-tripped exactly: "
+    f"{samples_match}."
+)
+
+# joblib.hash gives a stable fingerprint for arbitrary Python objects,
+# which is handy for cache keys and equality checks across processes.
+fingerprint = joblib.hash(artifact)
+note(f"Stable fingerprint of the artifact: {fingerprint}.")
diff --git a/examples/joblib/dump_and_load/config.toml b/examples/joblib/dump_and_load/config.toml
new file mode 100644
index 0000000..a99911e
--- /dev/null
+++ b/examples/joblib/dump_and_load/config.toml
@@ -0,0 +1 @@
+packages = ["joblib", "numpy"]
diff --git a/examples/joblib/dump_and_load/setup.py b/examples/joblib/dump_and_load/setup.py
new file mode 100644
index 0000000..5945907
--- /dev/null
+++ b/examples/joblib/dump_and_load/setup.py
@@ -0,0 +1,20 @@
+"""Setup for the dump/load example. No IPython shim here."""
+import js
+from pyscript import window, HTML, display as _display
+
+js.alert = window.alert
+
+
+def display(*args, **kwargs):
+    """Forward to PyScript's display with target=__pyscript_display_target__."""
+    return _display(*args, **kwargs, target=__pyscript_display_target__)
+
+
+def heading(text, level=2):
+    """Append an HTML heading of the given level (default <h2>)."""
+    display(HTML(f"<h{level}>{text}</h{level}>"), append=True)
+
+
+def note(text):
+    """Append a paragraph; text is inserted as raw HTML, not escaped."""
+    display(HTML(f"<p>{text}</p>"), append=True)
diff --git a/examples/joblib/memory_caching/code.py b/examples/joblib/memory_caching/code.py
new file mode 100644
index 0000000..d4fd76c
--- /dev/null
+++ b/examples/joblib/memory_caching/code.py
@@ -0,0 +1,69 @@
+"""
+First look at joblib: cache expensive function results to disk
+(in Pyodide, an in-memory virtual file system) so repeated calls
+return instantly.
+
+See https://joblib.readthedocs.io/en/stable/memory.html
+"""
+from IPython.core.display import display, HTML
+
+import time
+import numpy as np
+from joblib import Memory
+
+
+# A Memory object is the entry point for caching. The `location`
+# is a directory where results are persisted; in Pyodide this is
+# the in-browser virtual file system, so caches survive within a
+# session.
+memory = Memory(location="./joblib_cache", verbose=0)
+
+
+@memory.cache
+def slow_square_sum(n):
+    """Pretend-expensive computation: sum of squares up to n."""
+    # Simulate a costly step so the cache benefit is obvious.
+    time.sleep(0.5)
+    arr = np.arange(n, dtype=np.int64)
+    return int((arr * arr).sum())
+
+
+heading("Caching with joblib.Memory")
+note(
+    "We decorate slow_square_sum with "
+    "@memory.cache. The first call computes and "
+    "stores the result; later calls with the same argument are "
+    "served from the cache."
+)
+
+# First call: actually computes (and writes to the cache).
+start = time.perf_counter()
+result_first = slow_square_sum(200_000)
+first_elapsed = time.perf_counter() - start
+
+# Second call with the same input: hits the cache.
+start = time.perf_counter()
+result_cached = slow_square_sum(200_000)
+cached_elapsed = time.perf_counter() - start
+
+# Different input: computes again, populating a new cache entry.
+start = time.perf_counter()
+result_other = slow_square_sum(50_000)
+other_elapsed = time.perf_counter() - start
+
+note(
+    f"First call (n=200,000): result={result_first:,}, "
+    f"took {first_elapsed:.3f}s."
+)
+note(
+    f"Repeat call (n=200,000): result={result_cached:,}, "
+    f"took {cached_elapsed:.3f}s (cache hit)."
+)
+note(
+    f"New input (n=50,000): result={result_other:,}, "
+    f"took {other_elapsed:.3f}s."
+)
+
+# You can wipe the cache when you want to force recomputation.
+memory.clear(warn=False)
+note("Called memory.clear() to remove all cached entries.")
diff --git a/examples/joblib/memory_caching/config.toml b/examples/joblib/memory_caching/config.toml
new file mode 100644
index 0000000..a99911e
--- /dev/null
+++ b/examples/joblib/memory_caching/config.toml
@@ -0,0 +1 @@
+packages = ["joblib", "numpy"]
diff --git a/examples/joblib/memory_caching/setup.py b/examples/joblib/memory_caching/setup.py
new file mode 100644
index 0000000..84faac4
--- /dev/null
+++ b/examples/joblib/memory_caching/setup.py
@@ -0,0 +1,43 @@
+"""
+Shim IPython's display API onto PyScript so example code written in a
+Jupyter/IPython idiom runs unmodified in the browser.
+"""
+
+import sys
+import types
+import js
+from pyscript import window, HTML, display as _display
+
+js.alert = window.alert
+
+
+def display(*args, **kwargs):
+    """Forward to PyScript's display with target=__pyscript_display_target__."""
+    return _display(
+        *args, **kwargs, target=__pyscript_display_target__,
+    )
+
+
+ipython = types.ModuleType("IPython")
+core = types.ModuleType("IPython.core")
+core_display = types.ModuleType("IPython.core.display")
+core_display.display = display
+core_display.HTML = HTML
+ipython.core = core
+core.display = core_display
+ipython.get_ipython = lambda: None
+ipython.display = core_display
+sys.modules["IPython"] = ipython
+sys.modules["IPython.core"] = core
+sys.modules["IPython.core.display"] = core_display
+sys.modules["IPython.display"] = core_display
+
+
+def heading(text, level=2):
+    """Append an HTML heading of the given level (default <h2>)."""
+    display(HTML(f"<h{level}>{text}</h{level}>"), append=True)
+
+
+def note(text):
+    """Append a paragraph; text is inserted as raw HTML, not escaped."""
+    display(HTML(f"<p>{text}</p>"), append=True)
diff --git a/examples/joblib/order.json b/examples/joblib/order.json
new file mode 100644
index 0000000..d976fda
--- /dev/null
+++ b/examples/joblib/order.json
@@ -0,0 +1,5 @@
+[
+  "memory_caching",
+  "parallel_loops",
+  "dump_and_load"
+]
diff --git a/examples/joblib/parallel_loops/code.py b/examples/joblib/parallel_loops/code.py
new file mode 100644
index 0000000..9d783d2
--- /dev/null
+++ b/examples/joblib/parallel_loops/code.py
@@ -0,0 +1,77 @@
+# ---------------------------------------------------------------------
+# joblib.Parallel: write a parallel loop with the same shape as a
+# sequential one. The recipe is:
+#
+#     Parallel(n_jobs=...)(delayed(func)(arg) for arg in args)
+#
+# `delayed` captures the call without invoking it, and `Parallel`
+# dispatches the captured calls. See:
+# https://joblib.readthedocs.io/en/stable/parallel.html
+# ---------------------------------------------------------------------
+import time
+import math
+import numpy as np
+import matplotlib.pyplot as plt
+from joblib import Parallel, delayed
+
+rng = np.random.default_rng(0)
+
+
+heading("Parallel loops with joblib")
+note(
+    "We'll estimate π with a Monte Carlo simulation, splitting "
+    "the work across several batches. Each batch throws random "
+    "darts at the unit square and counts those landing inside the "
+    "quarter circle of radius 1."
+)
+
+
+def estimate_pi_batch(n_samples, seed):
+    """Return 4 * (fraction of points inside the unit circle)."""
+    local_rng = np.random.default_rng(seed)
+    xs = local_rng.random(n_samples)
+    ys = local_rng.random(n_samples)
+    inside = int(((xs * xs + ys * ys) <= 1.0).sum())
+    return 4.0 * inside / n_samples
+
+
+# Eight batches of 50,000 samples, each with its own seed.
+batch_size = 50_000
+seeds = list(range(8))
+
+# Sequential baseline: a plain list comprehension.
+start = time.perf_counter()
+sequential_estimates = [estimate_pi_batch(batch_size, s) for s in seeds]
+sequential_elapsed = time.perf_counter() - start
+
+# Parallel version: same shape, wrapped in Parallel/delayed.
+# n_jobs=2 keeps the demo lightweight; -1 would use all CPUs.
+start = time.perf_counter()
+parallel_estimates = Parallel(n_jobs=2)(
+    delayed(estimate_pi_batch)(batch_size, s) for s in seeds
+)
+parallel_elapsed = time.perf_counter() - start
+
+combined_pi = float(np.mean(parallel_estimates))
+note(
+    f"Combined estimate of π from {len(seeds)} batches: "
+    f"{combined_pi:.5f} "
+    f"(error vs math.pi: {abs(combined_pi - math.pi):.5f})."
+)
+note(
+    f"Sequential loop: {sequential_elapsed:.3f}s. "
+    f"Parallel loop: {parallel_elapsed:.3f}s."
+)
+
+# Plot the per-batch estimates against the true value.
+fig, ax = plt.subplots(figsize=(8, 4))
+ax.plot(seeds, parallel_estimates, "o-", color="steelblue",
+        label="Per-batch estimate")
+ax.axhline(math.pi, color="crimson", linestyle="--",
+           label="math.pi")
+ax.set_xlabel("Batch (seed)")
+ax.set_ylabel("Estimate of \u03c0")
+ax.set_title("Monte Carlo estimates of \u03c0 across parallel batches")
+ax.legend()
+fig.tight_layout()
+display(fig, append=True)
diff --git a/examples/joblib/parallel_loops/config.toml b/examples/joblib/parallel_loops/config.toml
new file mode 100644
index 0000000..da6fcc0
--- /dev/null
+++ b/examples/joblib/parallel_loops/config.toml
@@ -0,0 +1 @@
+packages = ["joblib", "numpy", "matplotlib"]
diff --git a/examples/joblib/parallel_loops/setup.py b/examples/joblib/parallel_loops/setup.py
new file mode 100644
index 0000000..03bf429
--- /dev/null
+++ b/examples/joblib/parallel_loops/setup.py
@@ -0,0 +1,21 @@
+"""Setup for the Parallel example. No IPython shim here."""
+import js
+from pyscript import window, HTML, display as _display
+
+js.alert = window.alert
+
+
+def display(*args, **kwargs):
+    """Forward to PyScript's display with target=__pyscript_display_target__."""
+    return _display(*args, **kwargs, target=__pyscript_display_target__)
+
+
+def heading(text, level=2):
+    """Append an HTML heading of the given level (default <h2>)."""
+    display(HTML(f"<h{level}>{text}</h{level}>"), append=True)
+
+
+def note(text):
+    """Append a paragraph; text is inserted as raw HTML, not escaped."""
+    display(HTML(f"<p>{text}</p>"), append=True)
+