diff --git a/examples/joblib/README.md b/examples/joblib/README.md
new file mode 100644
index 0000000..35de97c
--- /dev/null
+++ b/examples/joblib/README.md
@@ -0,0 +1,18 @@
+# joblib Examples
+
+Each sub-directory contains a self-contained example. The order in
+which the examples are to appear is specified in `order.json` (an
+array of directory names in the expected order).
+
+In each example directory you'll find:
+
+* `config.toml` - must conform to the specification outlined at
+ <https://docs.pyscript.net/latest/user-guide/configuration/>. It is
+ parsed and ultimately turned into a JSON representation as part of
+ the package's API object.
+* `setup.py` - Python code for contextual and environmental setup.
+ It is NOT SEEN BY THE END USER, but it is run before the `code.py`
+ code is evaluated. This allows us to create useful (IPython) shims
+ and avoid repeating boilerplate in the examples.
+* `code.py` - the actual code added to the editor which forms the
+ practical example of using the package.
diff --git a/examples/joblib/dump_and_load/code.py b/examples/joblib/dump_and_load/code.py
new file mode 100644
index 0000000..4f821f7
--- /dev/null
+++ b/examples/joblib/dump_and_load/code.py
@@ -0,0 +1,66 @@
+# ---------------------------------------------------------------------
+# joblib.dump / joblib.load: serialize Python objects (especially
+# ones containing large NumPy arrays) to a single file, e.g. to save
+# trained models or precomputed datasets. Below we save one object
+# with and without gzip, compare sizes, reload it and verify it.
+# See https://joblib.readthedocs.io/en/stable/persistence.html
+# ---------------------------------------------------------------------
+
+import os
+import numpy as np
+import joblib
+
+rng = np.random.default_rng(7)
+
+# heading() and note() are helpers expected from setup.py.
+heading("Serializing objects with joblib.dump and joblib.load")
+note(
+ "We'll build a small dictionary holding metadata and a couple "
+ "of NumPy arrays, save it to a file, then load it back and "
+ "compare. joblib.dump handles large arrays "
+ "efficiently and supports compression."
+)
+
+# A made-up "model artifact": metadata plus random stand-in arrays.
+artifact = {
+ "name": "linear-regressor",
+ "version": 3,
+ "feature_names": ["temperature", "humidity", "wind_speed"],
+ "weights": rng.normal(size=(3,)),
+ "training_samples": rng.normal(size=(1000, 3)),
+}
+
+# Write it twice, plain and gzip (level 3), to compare file sizes.
+joblib.dump(artifact, "artifact.joblib")
+joblib.dump(artifact, "artifact.joblib.gz", compress=("gzip", 3))
+
+uncompressed_size = os.path.getsize("artifact.joblib")
+compressed_size = os.path.getsize("artifact.joblib.gz")
+note(
+ f"Uncompressed file: {uncompressed_size:,} bytes. "
+ f"Gzip-compressed (level 3): {compressed_size:,} bytes."
+)
+
+# Reload the gzip copy; joblib.load is given no compression hint.
+restored = joblib.load("artifact.joblib.gz")
+
+note(
+ f"Restored name: {restored['name']}, "
+ f"version {restored['version']}, "
+ f"features: {restored['feature_names']}."
+)
+
+weights_match = np.array_equal(restored["weights"], artifact["weights"])
+samples_match = np.array_equal(
+ restored["training_samples"], artifact["training_samples"],
+)
+note(
+ f"Weights round-tripped exactly: {weights_match}. "
+ f"Training samples round-tripped exactly: "
+ f"{samples_match}."
+)
+
+# joblib.hash gives a stable fingerprint for arbitrary Python objects,
+# which is handy for cache keys and equality checks across processes.
+fingerprint = joblib.hash(artifact)
+note(f"Stable fingerprint of the artifact: {fingerprint}.")
diff --git a/examples/joblib/dump_and_load/config.toml b/examples/joblib/dump_and_load/config.toml
new file mode 100644
index 0000000..a99911e
--- /dev/null
+++ b/examples/joblib/dump_and_load/config.toml
@@ -0,0 +1 @@
+packages = ["joblib", "numpy"]
diff --git a/examples/joblib/dump_and_load/setup.py b/examples/joblib/dump_and_load/setup.py
new file mode 100644
index 0000000..5945907
--- /dev/null
+++ b/examples/joblib/dump_and_load/setup.py
@@ -0,0 +1,17 @@
+"""Setup for the dump/load example. No IPython shim here."""
+import js
+from pyscript import window, HTML, display as _display
+
+# Mirror window.alert onto the js module.
+# NOTE(review): presumably so example code calling js.alert() works
+# wherever the example runs - confirm.
+js.alert = window.alert
+
+
+def display(*args, **kwargs):
+    """Forward to PyScript's ``display`` with a fixed ``target``.
+
+    All positional and keyword arguments are passed through unchanged;
+    ``target`` is always taken from the ``__pyscript_display_target__``
+    global, so passing ``target`` explicitly raises ``TypeError``.
+    """
+    output_target = __pyscript_display_target__
+    return _display(*args, target=output_target, **kwargs)
+
+
+def heading(text, level=2):
+    """Append an ``<h{level}>`` heading holding ``text`` to the output."""
+    display(HTML(f"<h{level}>{text}</h{level}>"), append=True)
+
+
+def note(text):
+    """Append ``text`` to the output as an HTML paragraph."""
+    display(HTML(f"<p>{text}</p>"), append=True)
diff --git a/examples/joblib/memory_caching/code.py b/examples/joblib/memory_caching/code.py
new file mode 100644
index 0000000..d4fd76c
--- /dev/null
+++ b/examples/joblib/memory_caching/code.py
@@ -0,0 +1,69 @@
+"""
+First look at joblib: cache expensive function results to disk
+(in Pyodide, an in-memory virtual file system) so repeated calls
+return instantly.
+
+See https://joblib.readthedocs.io/en/stable/memory.html
+"""
+from IPython.core.display import display, HTML
+
+import time
+import numpy as np
+from joblib import Memory
+
+
+# A Memory object is the entry point for caching. The `location`
+# is a directory where results are persisted; in Pyodide this is
+# the in-browser virtual file system, so caches survive within a
+# session.
+memory = Memory(location="./joblib_cache", verbose=0)
+
+
+@memory.cache
+def slow_square_sum(n):
+    """Pretend-expensive computation: sum of squares up to n."""
+    # Simulate a costly step so the cache benefit is obvious.
+    time.sleep(0.5)
+    arr = np.arange(n, dtype=np.int64)
+    return int((arr * arr).sum())
+
+
+heading("Caching with joblib.Memory")
+note(
+    "We decorate slow_square_sum with "
+    "@memory.cache. The first call computes and "
+    "stores the result; later calls with the same argument are "
+    "served from the cache."
+)
+
+# First call: nothing cached yet, so this computes and stores.
+start = time.perf_counter()
+result_first = slow_square_sum(200_000)
+first_elapsed = time.perf_counter() - start
+
+# Same argument again: expected to be served from the cache.
+start = time.perf_counter()
+result_cached = slow_square_sum(200_000)
+cached_elapsed = time.perf_counter() - start
+
+# A different argument has no cached entry, so it computes again.
+start = time.perf_counter()
+result_other = slow_square_sum(50_000)
+other_elapsed = time.perf_counter() - start
+
+note(
+ f"First call (n=200,000): result={result_first:,}, "
+ f"took {first_elapsed:.3f}s."
+)
+note(
+ f"Repeat call (n=200,000): result={result_cached:,}, "
+ f"took {cached_elapsed:.3f}s (cache hit)."
+)
+note(
+ f"New input (n=50,000): result={result_other:,}, "
+ f"took {other_elapsed:.3f}s."
+)
+
+# Drop every cached entry; later calls would recompute from scratch.
+memory.clear(warn=False)
+note("Called memory.clear() to remove all cached entries.")
diff --git a/examples/joblib/memory_caching/config.toml b/examples/joblib/memory_caching/config.toml
new file mode 100644
index 0000000..a99911e
--- /dev/null
+++ b/examples/joblib/memory_caching/config.toml
@@ -0,0 +1 @@
+packages = ["joblib", "numpy"]
diff --git a/examples/joblib/memory_caching/setup.py b/examples/joblib/memory_caching/setup.py
new file mode 100644
index 0000000..84faac4
--- /dev/null
+++ b/examples/joblib/memory_caching/setup.py
@@ -0,0 +1,40 @@
+"""
+Shim IPython's display API onto PyScript so example code written in a
+Jupyter/IPython idiom runs unmodified in the browser.
+"""
+
+import sys
+import types
+import js
+from pyscript import window, HTML, display as _display
+
+# Mirror window.alert onto the js module.
+# NOTE(review): presumably so example code calling js.alert() works
+# wherever the example runs - confirm.
+js.alert = window.alert
+
+
+def display(*args, **kwargs):
+    """Forward to PyScript's ``display`` with a fixed ``target``.
+
+    Everything the caller passes is handed on untouched; ``target`` is
+    always the ``__pyscript_display_target__`` global, so supplying
+    ``target`` explicitly raises ``TypeError``.
+    """
+    output_target = __pyscript_display_target__
+    return _display(*args, target=output_target, **kwargs)
+
+
+# Build stand-in modules for the IPython import paths, innermost
+# first, and point them at the PyScript-backed display helpers.
+core_display = types.ModuleType("IPython.core.display")
+core_display.display = display
+core_display.HTML = HTML
+
+core = types.ModuleType("IPython.core")
+core.display = core_display
+
+ipython = types.ModuleType("IPython")
+ipython.core = core
+ipython.display = core_display
+ipython.get_ipython = lambda: None
+
+# Register every dotted path so both ``import IPython...`` and
+# ``from IPython... import ...`` resolve to the stand-ins.
+# IPython.display shares the IPython.core.display module object.
+sys.modules.update({
+    "IPython": ipython,
+    "IPython.core": core,
+    "IPython.core.display": core_display,
+    "IPython.display": core_display,
+})
+
+
+def heading(text, level=2):
+    """Append an ``<h{level}>`` heading holding ``text`` to the output."""
+    display(HTML(f"<h{level}>{text}</h{level}>"), append=True)
+
+
+def note(text):
+    """Append ``text`` to the output as an HTML paragraph."""
+    display(HTML(f"<p>{text}</p>"), append=True)
diff --git a/examples/joblib/order.json b/examples/joblib/order.json
new file mode 100644
index 0000000..d976fda
--- /dev/null
+++ b/examples/joblib/order.json
@@ -0,0 +1,5 @@
+[
+    "memory_caching",
+    "parallel_loops",
+    "dump_and_load"
+]
diff --git a/examples/joblib/parallel_loops/code.py b/examples/joblib/parallel_loops/code.py
new file mode 100644
index 0000000..9d783d2
--- /dev/null
+++ b/examples/joblib/parallel_loops/code.py
@@ -0,0 +1,77 @@
+# ---------------------------------------------------------------------
+# joblib.Parallel: write a parallel loop with the same shape as a
+# sequential one. The recipe is:
+#
+#     Parallel(n_jobs=...)(delayed(func)(arg) for arg in args)
+#
+# `delayed` captures the call without invoking it, and `Parallel`
+# dispatches the captured calls. See:
+# https://joblib.readthedocs.io/en/stable/parallel.html
+# ---------------------------------------------------------------------
+import time
+import math
+import numpy as np
+import matplotlib.pyplot as plt
+from joblib import Parallel, delayed
+
+rng = np.random.default_rng(0)
+
+
+heading("Parallel loops with joblib")
+note(
+    "We'll estimate π with a Monte Carlo simulation, splitting "
+    "the work across several batches. Each batch throws random "
+    "darts at the unit square and counts those landing inside the "
+    "quarter circle of radius 1."
+)
+
+
+def estimate_pi_batch(n_samples, seed):
+    """Return 4 * (fraction of points inside the unit circle)."""
+    local_rng = np.random.default_rng(seed)
+    xs = local_rng.random(n_samples)
+    ys = local_rng.random(n_samples)
+    inside = int(((xs * xs + ys * ys) <= 1.0).sum())
+    return 4.0 * inside / n_samples
+
+
+# Eight batches of 50,000 samples, each with its own seed.
+batch_size = 50_000
+seeds = list(range(8))
+
+# Sequential baseline: a plain list comprehension.
+start = time.perf_counter()
+sequential_estimates = [estimate_pi_batch(batch_size, s) for s in seeds]
+sequential_elapsed = time.perf_counter() - start
+
+# Parallel version: same shape, wrapped in Parallel/delayed.
+# n_jobs=2 keeps the demo lightweight; -1 would use all CPUs.
+start = time.perf_counter()
+parallel_estimates = Parallel(n_jobs=2)(
+    delayed(estimate_pi_batch)(batch_size, s) for s in seeds
+)
+parallel_elapsed = time.perf_counter() - start
+
+combined_pi = float(np.mean(parallel_estimates))
+note(
+    f"Combined estimate of π from {len(seeds)} batches: "
+    f"{combined_pi:.5f} "
+    f"(error vs math.pi: {abs(combined_pi - math.pi):.5f})."
+)
+note(
+    f"Sequential loop: {sequential_elapsed:.3f}s. "
+    f"Parallel loop: {parallel_elapsed:.3f}s."
+)
+
+# Plot the per-batch estimates against the true value.
+fig, ax = plt.subplots(figsize=(8, 4))
+ax.plot(seeds, parallel_estimates, "o-", color="steelblue",
+        label="Per-batch estimate")
+ax.axhline(math.pi, color="crimson", linestyle="--",
+           label="math.pi")
+ax.set_xlabel("Batch (seed)")
+ax.set_ylabel("Estimate of \u03c0")
+ax.set_title("Monte Carlo estimates of \u03c0 across parallel batches")
+ax.legend()
+fig.tight_layout()
+display(fig, append=True)
diff --git a/examples/joblib/parallel_loops/config.toml b/examples/joblib/parallel_loops/config.toml
new file mode 100644
index 0000000..da6fcc0
--- /dev/null
+++ b/examples/joblib/parallel_loops/config.toml
@@ -0,0 +1 @@
+packages = ["joblib", "numpy", "matplotlib"]
diff --git a/examples/joblib/parallel_loops/setup.py b/examples/joblib/parallel_loops/setup.py
new file mode 100644
index 0000000..03bf429
--- /dev/null
+++ b/examples/joblib/parallel_loops/setup.py
@@ -0,0 +1,18 @@
+"""Setup for the Parallel example. No IPython shim here."""
+import js
+from pyscript import window, HTML, display as _display
+
+js.alert = window.alert
+
+
+def display(*args, **kwargs):
+    return _display(*args, **kwargs, target=__pyscript_display_target__)
+
+
+def heading(text, level=2):
+    display(HTML(f"<h{level}>{text}</h{level}>"), append=True)
+
+
+def note(text):
+    display(HTML(f"<p>{text}</p>"), append=True)
+