Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions examples/joblib/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# joblib Examples

Each sub-directory contains a self-contained example. The order in
which the examples are to appear is specified in `order.json` (an
array of directory names in the expected order).

In each example directory you'll find:

* `config.toml` - must conform to the PyScript configuration
specification at
<https://docs.pyscript.net/latest/user-guide/configuration/>. It is
parsed and ultimately turned into a JSON representation as part of
the package's API object.
* `setup.py` - Python code for contextual and environmental setup. It
is NOT SEEN BY THE END USER, but is run before the `code.py` code is
evaluated. It lets us create useful (IPython) shims and avoid
repeating boilerplate.
* `code.py` - the actual code added to the editor which forms the
practical example of using the package.
66 changes: 66 additions & 0 deletions examples/joblib/dump_and_load/code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# ---------------------------------------------------------------------
# joblib.dump / joblib.load: serialize Python objects (especially
# ones containing large NumPy arrays) to a single file. This is the
# canonical way to save trained models or precomputed datasets.
#
# See https://joblib.readthedocs.io/en/stable/persistence.html
# ---------------------------------------------------------------------

import os
import numpy as np
import joblib

# Seeded generator, so the arrays below are the same on every run.
rng = np.random.default_rng(7)


heading("Serializing objects with joblib.dump and joblib.load")
note(
    "We'll build a small dictionary holding metadata and a couple "
    "of NumPy arrays, save it to a file, then load it back and "
    "compare. <code>joblib.dump</code> handles large arrays "
    "efficiently and supports compression."
)

# Draw the "learned" parameters up front: the weights first, then the
# samples, so the generator is consumed in a fixed order.
learned_weights = rng.normal(size=(3,))
sample_matrix = rng.normal(size=(1000, 3))

# The pretend model artifact bundles plain metadata with those arrays.
artifact = {
    "name": "linear-regressor",
    "version": 3,
    "feature_names": ["temperature", "humidity", "wind_speed"],
    "weights": learned_weights,
    "training_samples": sample_matrix,
}

# Write the artifact twice - once raw, once gzip-compressed at level 3 -
# so the two file sizes can be reported side by side.
plain_path = "artifact.joblib"
gzip_path = "artifact.joblib.gz"
joblib.dump(artifact, plain_path)
joblib.dump(artifact, gzip_path, compress=("gzip", 3))

uncompressed_size = os.path.getsize(plain_path)
compressed_size = os.path.getsize(gzip_path)
note(
    f"Uncompressed file: <strong>{uncompressed_size:,}</strong> bytes. "
    f"Gzip-compressed (level 3): <strong>{compressed_size:,}</strong> bytes."
)

# Read the compressed copy back. No compression argument is given:
# joblib works it out from the file itself.
restored = joblib.load(gzip_path)

note(
    f"Restored name: <code>{restored['name']}</code>, "
    f"version <code>{restored['version']}</code>, "
    f"features: <code>{restored['feature_names']}</code>."
)

# Confirm both arrays survived the round trip element for element.
samples_match = np.array_equal(
    restored["training_samples"], artifact["training_samples"],
)
weights_match = np.array_equal(restored["weights"], artifact["weights"])
note(
    f"Weights round-tripped exactly: <strong>{weights_match}</strong>. "
    f"Training samples round-tripped exactly: "
    f"<strong>{samples_match}</strong>."
)

# joblib.hash fingerprints arbitrary Python objects; a stable digest
# like this is useful for cache keys or comparing objects across
# processes.
fingerprint = joblib.hash(artifact)
note(f"Stable fingerprint of the artifact: <code>{fingerprint}</code>.")
1 change: 1 addition & 0 deletions examples/joblib/dump_and_load/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
packages = ["joblib", "numpy"]
17 changes: 17 additions & 0 deletions examples/joblib/dump_and_load/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Setup for the dump/load example. No IPython shim here."""
import js
from pyscript import window, HTML, display as _display

# Point `js.alert` at the `alert` function of PyScript's `window` object.
# NOTE(review): presumably so example code calling `js.alert` reaches the
# page's alert in this execution context - confirm.
js.alert = window.alert


def display(*args, **kwargs):
    """Call PyScript's ``display`` with the target fixed for this example.

    Positional and keyword arguments are forwarded untouched; the
    ``target`` keyword is always ``__pyscript_display_target__``
    (presumably supplied by whatever runs this setup file - it is not
    defined here).
    """
    output_target = __pyscript_display_target__
    return _display(*args, target=output_target, **kwargs)


def heading(text, level=2):
    """Append an ``<h{level}>`` heading containing ``text`` to the output.

    ``text`` is interpolated into the markup as-is, so it may itself
    contain HTML.
    """
    markup = f"<h{level}>{text}</h{level}>"
    display(HTML(markup), append=True)


def note(text):
    """Append ``text`` to the output wrapped in an HTML ``<p>`` element.

    ``text`` is interpolated into the markup as-is, so inline HTML such
    as ``<code>`` or ``<strong>`` passes straight through.
    """
    paragraph = HTML(f"<p>{text}</p>")
    display(paragraph, append=True)
69 changes: 69 additions & 0 deletions examples/joblib/memory_caching/code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""
First look at joblib: cache expensive function results to disk
(in Pyodide, an in-memory virtual file system) so repeated calls
return instantly.

See https://joblib.readthedocs.io/en/stable/memory.html
"""
from IPython.core.display import display, HTML

import time
import numpy as np
from joblib import Memory


# joblib's caching revolves around a Memory object bound to a storage
# directory (`location`). Under Pyodide that directory lives in the
# in-browser virtual file system, so cached results last for the
# duration of the session.
cache_dir = "./joblib_cache"
memory = Memory(location=cache_dir, verbose=0)


@memory.cache
def slow_square_sum(n):
    """Return the sum of ``i ** 2`` for ``i`` in ``range(n)``, slowly.

    The result is converted to a plain Python ``int``.
    """
    # The half-second pause stands in for real work, so a cache hit is
    # visibly faster than a fresh computation.
    time.sleep(0.5)
    values = np.arange(n, dtype=np.int64)
    total = (values * values).sum()
    return int(total)


def _timed_call(n):
    """Run ``slow_square_sum(n)`` and return ``(result, seconds_taken)``."""
    began = time.perf_counter()
    value = slow_square_sum(n)
    return value, time.perf_counter() - began


heading("Caching with joblib.Memory")
note(
    "We decorate <code>slow_square_sum</code> with "
    "<code>@memory.cache</code>. The first call computes and "
    "stores the result; later calls with the same argument are "
    "served from the cache."
)

# 1) Fresh input: the function body runs and its result is stored.
result_first, first_elapsed = _timed_call(200_000)

# 2) Same input again: answered from the cache.
result_cached, cached_elapsed = _timed_call(200_000)

# 3) A different input: computed from scratch and cached separately.
result_other, other_elapsed = _timed_call(50_000)

note(
    f"First call (n=200,000): result={result_first:,}, "
    f"took <strong>{first_elapsed:.3f}s</strong>."
)
note(
    f"Repeat call (n=200,000): result={result_cached:,}, "
    f"took <strong>{cached_elapsed:.3f}s</strong> (cache hit)."
)
note(
    f"New input (n=50,000): result={result_other:,}, "
    f"took <strong>{other_elapsed:.3f}s</strong>."
)

# Clearing the cache forces the next call to recompute; warn=False
# is passed so no warning is raised while doing so.
memory.clear(warn=False)
note("Called <code>memory.clear()</code> to remove all cached entries.")
1 change: 1 addition & 0 deletions examples/joblib/memory_caching/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
packages = ["joblib", "numpy"]
40 changes: 40 additions & 0 deletions examples/joblib/memory_caching/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
Shim IPython's display API onto PyScript so example code written in a
Jupyter/IPython idiom runs unmodified in the browser.
"""

import sys
import types
import js
from pyscript import window, HTML, display as _display

# Point `js.alert` at the `alert` function of PyScript's `window` object.
# NOTE(review): presumably so example code calling `js.alert` reaches the
# page's alert in this execution context - confirm.
js.alert = window.alert


def display(*args, **kwargs):
    """Call PyScript's ``display`` with the target fixed for this example.

    Positional and keyword arguments are forwarded untouched; the
    ``target`` keyword is always ``__pyscript_display_target__``
    (presumably supplied by whatever runs this setup file - it is not
    defined here).
    """
    output_target = __pyscript_display_target__
    return _display(*args, target=output_target, **kwargs)


# Build stand-in modules from the innermost outwards. The leaf module
# carries the two names example code imports: `display` and `HTML`.
core_display = types.ModuleType("IPython.core.display")
core_display.display = display
core_display.HTML = HTML

core = types.ModuleType("IPython.core")
core.display = core_display

# The top-level package exposes the same leaf module under both
# `IPython.core.display` and `IPython.display`, plus a `get_ipython`
# that always returns None.
ipython = types.ModuleType("IPython")
ipython.core = core
ipython.display = core_display
ipython.get_ipython = lambda: None

# Register every dotted path so `import` statements resolve to the
# stand-ins above.
sys.modules.update(
    {
        "IPython": ipython,
        "IPython.core": core,
        "IPython.core.display": core_display,
        "IPython.display": core_display,
    }
)


def heading(text, level=2):
    """Append an ``<h{level}>`` heading containing ``text`` to the output.

    ``text`` is interpolated into the markup as-is, so it may itself
    contain HTML.
    """
    markup = f"<h{level}>{text}</h{level}>"
    display(HTML(markup), append=True)


def note(text):
    """Append ``text`` to the output wrapped in an HTML ``<p>`` element.

    ``text`` is interpolated into the markup as-is, so inline HTML such
    as ``<code>`` or ``<strong>`` passes straight through.
    """
    paragraph = HTML(f"<p>{text}</p>")
    display(paragraph, append=True)
5 changes: 5 additions & 0 deletions examples/joblib/order.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[
"memory_caching",
"parallel_loops",
"dump_and_load"
]
77 changes: 77 additions & 0 deletions examples/joblib/parallel_loops/code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# ---------------------------------------------------------------------
# joblib.Parallel: write a parallel loop with the same shape as a
# sequential one. The recipe is:
#
# Parallel(n_jobs=...)(delayed(func)(arg) for arg in args)
#
# `delayed` captures the call without invoking it, and `Parallel`
# dispatches the captured calls. See:
# https://joblib.readthedocs.io/en/stable/parallel.html
# ---------------------------------------------------------------------
import time
import math
import numpy as np
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

# No module-level random generator is created here: each batch builds
# its own generator from an explicit seed inside estimate_pi_batch, so
# a shared one would never be used.


heading("Parallel loops with joblib")
note(
    "We'll estimate &pi; with a Monte Carlo simulation, splitting "
    "the work across several batches. Each batch throws random "
    "darts at the unit square and counts those landing inside the "
    "quarter circle of radius 1."
)


def estimate_pi_batch(n_samples, seed):
    """Estimate pi from ``n_samples`` random points generated from ``seed``.

    Points are drawn uniformly from the unit square. The share of them
    lying within distance 1 of the origin (the quarter circle) is
    multiplied by 4 and returned as a float.
    """
    generator = np.random.default_rng(seed)
    # Draw all x coordinates first, then all y coordinates.
    xs = generator.random(n_samples)
    ys = generator.random(n_samples)
    squared_radius = xs * xs + ys * ys
    hits = int((squared_radius <= 1.0).sum())
    return 4.0 * hits / n_samples


# Work plan: eight batches seeded 0..7, with 50,000 samples apiece.
seeds = list(range(8))
batch_size = 50_000

# Baseline - an ordinary list comprehension, timed end to end.
tick = time.perf_counter()
sequential_estimates = [estimate_pi_batch(batch_size, s) for s in seeds]
sequential_elapsed = time.perf_counter() - tick

# The same loop expressed with Parallel/delayed. Two jobs is plenty for
# a light demo; n_jobs=-1 would ask for every CPU.
tick = time.perf_counter()
runner = Parallel(n_jobs=2)
parallel_estimates = runner(
    delayed(estimate_pi_batch)(batch_size, s) for s in seeds
)
parallel_elapsed = time.perf_counter() - tick

# Average the per-batch values into one overall estimate.
combined_pi = float(np.mean(parallel_estimates))
note(
    f"Combined estimate of &pi; from {len(seeds)} batches: "
    f"<strong>{combined_pi:.5f}</strong> "
    f"(error vs math.pi: {abs(combined_pi - math.pi):.5f})."
)
note(
    f"Sequential loop: {sequential_elapsed:.3f}s. "
    f"Parallel loop: {parallel_elapsed:.3f}s."
)

# Chart each batch's estimate, with a dashed reference line at math.pi.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(
    seeds,
    parallel_estimates,
    "o-",
    color="steelblue",
    label="Per-batch estimate",
)
ax.axhline(math.pi, color="crimson", linestyle="--", label="math.pi")
ax.set(
    xlabel="Batch (seed)",
    ylabel="Estimate of \u03c0",
    title="Monte Carlo estimates of \u03c0 across parallel batches",
)
ax.legend()
fig.tight_layout()
display(fig, append=True)
1 change: 1 addition & 0 deletions examples/joblib/parallel_loops/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
packages = ["joblib", "numpy", "matplotlib"]
18 changes: 18 additions & 0 deletions examples/joblib/parallel_loops/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Setup for the Parallel example. No IPython shim here."""
import js
from pyscript import window, HTML, display as _display

# Point `js.alert` at the `alert` function of PyScript's `window` object.
# NOTE(review): presumably so example code calling `js.alert` reaches the
# page's alert in this execution context - confirm.
js.alert = window.alert


def display(*args, **kwargs):
    """Call PyScript's ``display`` with the target fixed for this example.

    Positional and keyword arguments are forwarded untouched; the
    ``target`` keyword is always ``__pyscript_display_target__``
    (presumably supplied by whatever runs this setup file - it is not
    defined here).
    """
    output_target = __pyscript_display_target__
    return _display(*args, target=output_target, **kwargs)


def heading(text, level=2):
    """Append an ``<h{level}>`` heading containing ``text`` to the output.

    ``text`` is interpolated into the markup as-is, so it may itself
    contain HTML.
    """
    markup = f"<h{level}>{text}</h{level}>"
    display(HTML(markup), append=True)


def note(text):
    """Append ``text`` to the output wrapped in an HTML ``<p>`` element.

    ``text`` is interpolated into the markup as-is, so inline HTML such
    as ``<code>`` or ``<strong>`` passes straight through.
    """
    paragraph = HTML(f"<p>{text}</p>")
    display(paragraph, append=True)