I wish to write a library in OCaml using the Eio ecosystem in order to manage data downloads from remote repositories. This should be modeled on the Python Pooch library, as we want to interoperate with its registry files. You can see more about pooch here: https://github.com/fatiando/pooch
The OCaml version should use cohttp-eio and tls-eio to manage HTTPS downloads.
Toru: OCaml Data Repository Manager (Pooch-compatible)#
Overview#
Toru is an OCaml library for managing data file downloads and caching, compatible with Python Pooch registry files. It provides automatic downloading, caching, and hash verification of data files from remote repositories.
Core Design Principles#
- Compatibility: Full interoperability with Pooch registry file format
- Concurrency: Built on Eio for efficient concurrent downloads
- Type Safety: Leverage OCaml's type system for robust error handling
- Modularity: Clean module interfaces with single responsibility
Module Architecture#
1. Hash Module#
(** Hashing and verification of downloaded files. *)
module Hash : sig
(** Digest algorithms supported by Pooch registries; SHA256 is the default. *)
type algorithm = SHA256 | SHA1 | MD5
(** Abstract hash value: an algorithm paired with its hex digest. *)
type t
(** [create alg digest] builds a hash from an algorithm and a hex digest. *)
val create : algorithm -> string -> t
(** [of_string s] parses [s], honouring an optional "sha1:"/"md5:" prefix;
    per the notes below, an unprefixed digest is treated as SHA256. *)
val of_string : string -> t
val to_string : t -> string
val algorithm_of_string : string -> algorithm option
val algorithm_to_string : algorithm -> string
(* Field accessors *)
val algorithm : t -> algorithm
val value : t -> string
(* Operations *)
val equal : t -> t -> bool
(** [verify path h] recomputes the digest of the file at [path] and
    compares it against [h]. *)
val verify : Eio.Fs.dir_ty Eio.Path.t -> t -> bool
(** [compute alg path] digests the file at [path] with [alg]. *)
val compute : algorithm -> Eio.Fs.dir_ty Eio.Path.t -> t
(* Parsing helpers *)
(** [parse_prefixed s] splits "algo:digest" into its parts, if well-formed. *)
val parse_prefixed : string -> (algorithm * string) option
val format_prefixed : t -> string
end
- Abstract `t` type with accessor functions
- Parses hash strings with optional algorithm prefixes (e.g., "sha1:abc123...", "md5:def456...")
- Verifies file integrity against expected hashes
- Supports SHA256 (default), SHA1, and MD5
- Enhanced parsing for prefixed and non-prefixed hash formats
2. Registry Module#
(** Pooch-compatible registry: a mapping from filenames to expected hashes
    (and optional per-file URLs). *)
module Registry : sig
type t
type entry
(* Entry construction and accessors *)
(** [create_entry ~filename ~hash ?custom_url ()] builds one registry row;
    [custom_url] overrides the instance-wide base URL for this file. *)
val create_entry : filename:string -> hash:Hash.t -> ?custom_url:string -> unit -> entry
val filename : entry -> string
val hash : entry -> Hash.t
val custom_url : entry -> string option
(* Registry operations *)
val empty : t
(** [load path] parses a registry file from disk. *)
val load : Eio.Fs.dir_ty Eio.Path.t -> t
(** [load_from_url url] fetches and parses a remote registry.
    NOTE(review): takes only a [string] — no [net]/[sw] capability is
    threaded through, unlike the rest of the Eio-style API; confirm. *)
val load_from_url : string -> t
val save : Eio.Fs.dir_ty Eio.Path.t -> t -> unit
(** [of_string s] parses registry text ("filename hash" lines, [#] comments). *)
val of_string : string -> t
val to_string : t -> string
(* Query operations *)
val find : string -> t -> entry option
val exists : string -> t -> bool
val add : entry -> t -> t
val remove : string -> t -> t
val entries : t -> entry list
(** Number of entries. *)
val size : t -> int
end
- Abstract `t` and `entry` types with accessor functions
- Parses Pooch-compatible registry files
- Supports comments (lines starting with #)
- Format: one `filename hash` pair per line
- Internal data structure optimized for lookups (may use hash tables)
3. Cache Module#
(** Local download cache rooted at a base directory, optionally versioned. *)
module Cache : sig
type t
(** [create ~sw ~env ?version path] makes a cache rooted at [path]; when
    [version] is given, files live in a versioned subdirectory. *)
val create : sw:Eio.Switch.t -> env:Eio_unix.Stdenv.base ->
?version:string -> string -> t
(** [default ~sw ~env ?app_name ()] uses the platform default cache
    location (XDG on Unix — see "Environment Variables" below). *)
val default : sw:Eio.Switch.t -> env:Eio_unix.Stdenv.base ->
?app_name:string -> unit -> t
(* Field accessors *)
val base_path : t -> Eio.Fs.dir_ty Eio.Path.t
val version : t -> string option
(* Operations *)
(** [file_path t name] is the path [name] would occupy inside the cache
    (the file need not exist yet). *)
val file_path : t -> string -> Eio.Fs.dir_ty Eio.Path.t
val exists : t -> string -> bool
(** Creates the cache directory tree if missing (directories are created
    lazily, per the design notes below). *)
val ensure_dir : t -> unit
(** Removes all cached files. *)
val clear : t -> unit
val size_bytes : t -> int64
val list_files : t -> string list
end
- Abstract `t` type with accessor functions
- Manages local cache directory structure
- Supports versioned subdirectories
- Uses XDG base directory specification for default paths
- Creates cache directories lazily
4. Downloader Module (Modular Implementation)#
Abstract Interface:
(** Common interface implemented by every download backend
    (wget, curl, cohttp-eio). *)
module type DOWNLOADER = sig
type t
(** [create ~sw ~env] builds backend state tied to switch [sw]. *)
val create : sw:Eio.Switch.t -> env:Eio_unix.Stdenv.base -> t
(** [download t ~url ~dest ?hash ?progress ?resume ()] fetches [url] into
    [dest], verifying [hash] afterwards when provided.  [resume] asks the
    backend to continue a partial download if it can. *)
val download : t ->
url:string ->
dest:Eio.Fs.dir_ty Eio.Path.t ->
?hash:Hash.t ->
?progress:Progress_reporter.t ->
?resume:bool ->
unit -> (unit, string) result
(** Whether this backend can resume partial downloads. *)
val supports_resume : t -> bool
(** Human-readable backend name, e.g. "wget". *)
val name : t -> string
end
(** Runtime-selectable downloader: packs a DOWNLOADER implementation
    together with its state into a single existential value. *)
module Downloader : sig
type t
(** [wget (module M) state] wraps a wget-style implementation and its
    state; likewise [curl] and [cohttp]. *)
val wget : (module DOWNLOADER with type t = 'a) -> 'a -> t
val curl : (module DOWNLOADER with type t = 'a) -> 'a -> t
val cohttp : (module DOWNLOADER with type t = 'a) -> 'a -> t
(** Operations forwarded to the wrapped backend; see {!DOWNLOADER}. *)
val download : t ->
url:string ->
dest:Eio.Fs.dir_ty Eio.Path.t ->
?hash:Hash.t ->
?progress:Progress_reporter.t ->
?resume:bool ->
unit -> (unit, string) result
val supports_resume : t -> bool
val name : t -> string
end
Wget Implementation:
(* Downloader backed by the system [wget] binary; resume support comes from
   wget's own [--continue] flag. *)
module Wget_downloader : DOWNLOADER = struct
  type t = {
    sw : Eio.Switch.t;
    env : Eio_unix.Stdenv.base;
    timeout : float;  (* seconds; fixed at creation *)
  }

  let create ~sw ~env = { sw; env; timeout = 300.0 }

  (* BUGFIX: the original wrote [?(hash=None)] / [?(progress=None)], which
     makes the parameters ['a option] with an option-typed default and no
     longer matches [?hash:Hash.t] in the DOWNLOADER signature.  A bare
     [?hash] already has type [Hash.t option] inside the body.
     Also: the [timeout] field was previously unused ("--timeout=300" was
     hard-coded); it now feeds the flag. *)
  let download t ~url ~dest ?hash ?progress:_ ?(resume = true) () =
    let args =
      [ "--quiet"; "--show-progress";
        Printf.sprintf "--timeout=%.0f" t.timeout; "--tries=3";
        "--output-document=" ^ Eio.Path.native_exn dest ]
    in
    let args = if resume then "--continue" :: args else args in
    let args = url :: args in
    (* NOTE(review): this sketch assumes a result-returning process API;
       [Eio.Process.run] itself raises on non-zero exit — reconcile with
       the real Eio API before implementation. *)
    let result =
      Eio.Process.run t.env#process_mgr ~sw:t.sw
        "wget" ~args:(Array.of_list args)
    in
    match result with
    | Ok () -> (
        match hash with
        | Some h ->
            if Hash.verify dest h then Ok ()
            else Error "Hash verification failed"
        | None -> Ok ())
    | Error (`Exit_code code) ->
        Error (Printf.sprintf "wget failed with code %d" code)

  let supports_resume _ = true
  let name _ = "wget"
end
Curl Implementation:
(* Downloader backed by the system [curl] binary; resume support comes from
   curl's [--continue-at -] (auto-detect offset). *)
module Curl_downloader : DOWNLOADER = struct
  type t = {
    sw : Eio.Switch.t;
    env : Eio_unix.Stdenv.base;
    timeout : float;  (* seconds; fixed at creation *)
  }

  let create ~sw ~env = { sw; env; timeout = 300.0 }

  (* BUGFIX: replaced [?(hash=None)] / [?(progress=None)] — those defaults
     change the optionals' external type and break conformance with
     [?hash:Hash.t] in DOWNLOADER; bare [?hash] is already an option.
     Also: [timeout] was unused ("--max-time" "300" was hard-coded). *)
  let download t ~url ~dest ?hash ?progress:_ ?(resume = true) () =
    let args =
      [ "--silent"; "--show-error"; "--location";
        "--max-time"; Printf.sprintf "%.0f" t.timeout; "--retry"; "3";
        "--output"; Eio.Path.native_exn dest ]
    in
    let args = if resume then "--continue-at" :: "-" :: args else args in
    let args = url :: args in
    (* NOTE(review): assumes a result-returning process API, as elsewhere in
       this sketch; [Eio.Process.run] raises on failure — confirm. *)
    let result =
      Eio.Process.run t.env#process_mgr ~sw:t.sw
        "curl" ~args:(Array.of_list args)
    in
    match result with
    | Ok () -> (
        match hash with
        | Some h ->
            if Hash.verify dest h then Ok ()
            else Error "Hash verification failed"
        | None -> Ok ())
    | Error (`Exit_code code) ->
        Error (Printf.sprintf "curl failed with code %d" code)

  let supports_resume _ = true
  let name _ = "curl"
end
Future Cohttp-Eio Implementation:
(* Placeholder for the pure-OCaml cohttp-eio downloader (Phase 4). *)
module Cohttp_downloader : DOWNLOADER = struct
  type t = {
    sw : Eio.Switch.t;
    net : Eio.Net.t;
    timeout : float;
  }

  let create ~sw ~env = {
    sw;
    net = env#net;
    timeout = 300.0
  }

  (* BUGFIX: dropped the [?(hash=None)]-style defaults, which do not match
     [?hash:Hash.t] in the DOWNLOADER signature.  All parameters are
     wildcarded until the implementation lands, avoiding unused warnings. *)
  let download _t ~url:_ ~dest:_ ?hash:_ ?progress:_ ?resume:_ () =
    (* Pure OCaml implementation using cohttp-eio *)
    (* Will support streaming, progress reporting, and range requests *)
    failwith "TODO: Implement cohttp-eio downloader"

  (* BUGFIX: previously [true].  Advertising resume while [download] is a
     stub would let auto-detection/fallback select a backend that always
     fails; report [false] until Range support actually exists. *)
  let supports_resume _ = false

  let name _ = "cohttp-eio"
end
Benefits:
- Immediate functionality: wget/curl provide robust, battle-tested downloading
- Built-in resume support: Both tools handle partial downloads automatically
- Easy migration path: Drop-in replacement when cohttp-eio implementation ready
- Fallback strategy: Try multiple downloaders if one fails
- Consistent interface: Same API regardless of underlying implementation
5. Toru Module (Main Interface)#
(** Main user-facing interface, modelled on Pooch's [Pooch] object. *)
module Toru : sig
type t
(** [create ~sw ~env ~base_url ~cache_path ?version ?registry_file
    ?downloader ()] builds a manager.
    NOTE(review): [?downloader:(module DOWNLOADER)] passes only a module,
    not a value of its state type [t] — consider taking the packed
    [Downloader.t] instead; confirm intended design. *)
val create :
sw:Eio.Switch.t ->
env:Eio_unix.Stdenv.base ->
base_url:string ->
cache_path:string ->
?version:string ->
?registry_file:string ->
?downloader:(module DOWNLOADER) ->
unit -> t
(* Field accessors *)
val base_url : t -> string
val cache : t -> Cache.t
val registry : t -> Registry.t
(* Operations *)
(** [fetch t ~filename ?processor ()] ensures [filename] from the registry
    is present in the cache (downloading and hash-checking as needed) and
    returns its local path; [processor] may transform the file after
    download (e.g. decompression). *)
val fetch :
t ->
filename:string ->
?processor:(Eio.Fs.dir_ty Eio.Path.t -> Eio.Fs.dir_ty Eio.Path.t) ->
unit -> (Eio.Fs.dir_ty Eio.Path.t, string) result
(** [fetch_all t ?concurrency ()] downloads every registry entry, at most
    [concurrency] at a time. *)
val fetch_all :
t ->
?concurrency:int ->
unit -> (unit, string) result
(** [load_registry t path] returns [t] with a registry loaded from [path]. *)
val load_registry : t -> string -> t
val add_registry_entry : t -> Registry.entry -> t
val update_base_url : t -> string -> t
(* Static functions *)
(** [retrieve] is a one-off download needing no registry, mirroring
    Pooch's [retrieve] function. *)
val retrieve :
sw:Eio.Switch.t ->
env:Eio_unix.Stdenv.base ->
url:string ->
?hash:Hash.t ->
?cache_path:string ->
?downloader:(module DOWNLOADER) ->
unit -> (Eio.Fs.dir_ty Eio.Path.t, string) result
end
- Abstract `t` type with accessor functions
- High-level API matching Pooch's functionality
- `fetch`: Download/retrieve a single file from the registry
- `fetch_all`: Download all registry files concurrently
- `retrieve`: One-off download without a registry
- Supports post-processing hooks for decompression, etc.
Key Features#
1. Registry Compatibility#
- Reads Pooch registry files without modification
- Supports all hash formats (plain, sha1:, md5:)
- Handles comments and blank lines
- Validated with tessera-manifests: Real-world geospatial data registries
2. Concurrent Downloads#
- Uses Eio fibers for parallel downloads
- Configurable concurrency limits
- Progress reporting through the OCaml `progress` library
3. Robust Error Handling#
- Result types for all fallible operations
- Detailed error messages
- Automatic retry with backoff
4. Extensibility#
- Processor functions for post-download transformations
- Pluggable download protocols
- Custom cache locations
- Modular downloaders: External tools (wget/curl) with migration path to pure OCaml
Example Usage#
open Eio.Std

(* Example: create a Toru instance, fetch one file, then fetch everything.
   BUGFIX: the original placed two [match] expressions back to back at the
   same level, which does not parse — the first one must be parenthesised
   and sequenced with [;]. *)
let main ~env ~sw =
  (* Create a Toru instance *)
  let toru =
    Toru.create ~sw ~env
      ~base_url:"https://github.com/myorg/data/raw/main/"
      ~cache_path:"~/.myapp/data"
      ~version:"v1.0"
      ~registry_file:"registry.txt"
      ()
  in
  (* Fetch a single file *)
  (match Toru.fetch toru ~filename:"data.csv" () with
   | Ok path ->
       traceln "File available at: %s" (Eio.Path.native_exn path);
       traceln "Cache location: %s"
         (Eio.Path.native_exn (Cache.base_path (Toru.cache toru)))
   | Error msg -> traceln "Failed to fetch: %s" msg);
  (* Download all files in registry *)
  match Toru.fetch_all toru ~concurrency:4 () with
  | Ok () ->
      let registry = Toru.registry toru in
      traceln "Downloaded %d files from %s"
        (Registry.size registry) (Toru.base_url toru)
  | Error msg -> traceln "Download failed: %s" msg
Implementation Plan#
-
Phase 1: Core Modules
- Implement Hash module with verification
- Create Registry parser and writer
- Build Cache management system
- Key Test: Validate with tessera-manifests registry files
-
Phase 2: External Tool Integration
- Implement modular Downloader interface
- Create Wget_downloader wrapper with resume support
- Create Curl_downloader wrapper with resume support
- Add automatic tool detection and fallback
- Key Test: Download tessera geospatial tiles via external tools
-
Phase 3: Main Interface
- Build Toru module combining all components
- Add concurrent download support via external tools
- Implement processor pipeline for decompression
- Key Test: Full tessera-manifests integration test
-
Phase 4: Pure OCaml Migration
- Implement Cohttp_downloader with streaming
- Add Range request support for resumption
- Migrate from external tools to pure OCaml
- Key Test: Ensure tessera compatibility maintained with pure OCaml
-
Phase 5: Extensions
- Add FTP protocol support
- Enhance progress reporting integration
- Add authentication mechanisms
Dependencies#
Core Dependencies:
- `eio` (>= 1.0): Effects-based I/O and process management
- `digestif` (>= 1.0): Cryptographic hashes (SHA256, SHA1, MD5)
- `uri`: URL parsing and validation
- `progress`: Download progress reporting
- `yojson`: JSON parsing for configuration
- `cmdliner`: CLI argument parsing and downloader selection
External Tool Dependencies:
- `wget` or `curl`: System tools for downloading (one required)
Future Pure OCaml Dependencies:
- `cohttp-eio`: HTTP client (for Phase 4)
- `tls-eio`: TLS support (for Phase 4)
Optional Dependencies:
- `tar`: For .tar.gz/.tar.xz archive processing
- `unzip`: For .zip archive processing
Environment Variables and Configuration#
Following Pooch's approach, Toru supports environment-based configuration:
Cache Location Override#
- `TORU_CACHE_DIR`: Override default cache location (like Pooch's `env` parameter)
- `XDG_CACHE_HOME`: Follows the XDG Base Directory specification on Unix systems
- Default paths:
  - macOS: `~/Library/Caches/<app_name>`
  - Unix: `~/.cache/<app_name>`
  - Windows: `%LOCALAPPDATA%\<app_name>\Cache`
Registry Configuration#
- `TORU_REGISTRY_URL`: Override registry file URL
- `TORU_BASE_URL`: Override base download URL
- `TORU_VERSION`: Override data version
Example Usage#
(* Environment: TORU_CACHE_DIR=/custom/cache *)
(* NOTE(review): illustrative fragment — [~env_override] and
   [Toru.default_cache_path] do not appear in the [Toru] signature sketched
   above, and the trailing [in] has no body; reconcile before adoption. *)
let toru = Toru.create ~sw ~env
~base_url:"https://data.example.com/"
~cache_path:(Toru.default_cache_path ~app_name:"myapp" ())
~env_override:"TORU_CACHE_DIR" (* Uses env var if set *)
() in
Progress Reporting#
Integration with OCaml progress library for download tracking:
(** Thin wrapper over the OCaml [progress] library for download bars. *)
module Progress_reporter : sig
type t
(** [create ?total_bytes label] starts a reporter for one download. *)
val create : ?total_bytes:int64 -> string -> t
(** [update t n] reports [n] bytes transferred.
    NOTE(review): cumulative total vs. per-chunk delta is unspecified here
    — pin this down in the real interface. *)
val update : t -> int64 -> unit
val finish : t -> unit
end
(* Updated Downloader signature *)
(* NOTE(review): this variant adds [?progress] but omits the [?resume]
   flag present in the DOWNLOADER signature earlier in this document —
   reconcile the two sketches. *)
val download : t ->
url:string ->
dest:Eio.Fs.dir_ty Eio.Path.t ->
?hash:Hash.t ->
?progress:Progress_reporter.t ->
unit -> (unit, string) result
Progress bars show:
- Download speed (bytes/sec)
- ETA and percentage complete
- File name and size
- Multiple concurrent downloads with separate bars
Archive Decompression#
Built-in processors for common archive formats via shell tools:
(** Post-download processors that shell out to system archive tools.
    Each takes a destination directory name (per the usage example below)
    and returns a function from downloaded-archive path to result path. *)
module Processors : sig
val untar_gz : string -> (Eio.Fs.dir_ty Eio.Path.t -> Eio.Fs.dir_ty Eio.Path.t)
val unzip : string -> (Eio.Fs.dir_ty Eio.Path.t -> Eio.Fs.dir_ty Eio.Path.t)
val untar_xz : string -> (Eio.Fs.dir_ty Eio.Path.t -> Eio.Fs.dir_ty Eio.Path.t)
(** [custom cmd args] runs an arbitrary command as the processor. *)
val custom : string -> string list -> (Eio.Fs.dir_ty Eio.Path.t -> Eio.Fs.dir_ty Eio.Path.t)
end
(* Usage *)
(* NOTE(review): illustrative fragment — per the Toru signature, [fetch]
   returns a [result], so [path] is a result value here, and the trailing
   [in] has no body; tidy before inclusion in real documentation. *)
let path = Toru.fetch toru ~filename:"data.tar.gz"
~processor:(Processors.untar_gz "data/") () in
Processors shell out to system tools:
- `tar -xzf` for .tar.gz files
- `unzip` for .zip files
- `tar -xJf` for .tar.xz files
- Custom commands for other formats
Checkpointing Complexity Analysis#
Complexity: Medium-High
Implementing resumable downloads requires:
-
Range Request Support
- HTTP Range headers (`bytes=start-end`)
- Server must support partial content (206 status)
- Handle servers that don't support ranges gracefully
-
Partial File Management
- Track download state in metadata files (`.toru_partial`)
- Store: URL, expected hash, bytes downloaded, timestamp
- Clean up abandoned partials after a timeout
-
Hash Verification Challenges
- Can't verify hash until download complete
- Need to handle corrupted partial downloads
- Resume from last known good state
-
Error Handling Complexity
- Network interruptions during partial downloads
- Server-side file changes between resume attempts
- Concurrent access to same partial file
Recommended Approach:
- Phase 1: Implement without checkpointing
- Phase 2: Add simple restart-based "checkpointing" (delete and restart)
- Phase 3: True resumable downloads with Range support
Chosen Approach: Use external tools (wget -c and curl -C -) initially for resume capability, with modular design allowing migration to pure OCaml implementation later.
Downloader Selection and CLI Integration#
(** Constructors and auto-detection for concrete downloader backends. *)
module Downloaders : sig
val wget : sw:Eio.Switch.t -> env:Eio_unix.Stdenv.base ->
(module DOWNLOADER with type t = Wget_downloader.t)
val curl : sw:Eio.Switch.t -> env:Eio_unix.Stdenv.base ->
(module DOWNLOADER with type t = Curl_downloader.t)
val cohttp : sw:Eio.Switch.t -> env:Eio_unix.Stdenv.base ->
(module DOWNLOADER with type t = Cohttp_downloader.t)
(** [detect_available ~sw ~env] lists backends usable on this system,
    keyed by name. *)
val detect_available : sw:Eio.Switch.t -> env:Eio_unix.Stdenv.base ->
(string * (module DOWNLOADER)) list
(** Default backend, per the detection/fallback strategy described above. *)
val create_default : sw:Eio.Switch.t -> env:Eio_unix.Stdenv.base ->
(module DOWNLOADER)
(** [of_string name ~sw ~env] looks a backend up by name ("wget", ...). *)
val of_string : string -> sw:Eio.Switch.t -> env:Eio_unix.Stdenv.base ->
(module DOWNLOADER) option
end
(* Cmdliner integration *)
(** Command-line selection of the download backend via Cmdliner. *)
module Cli : sig
type downloader_choice = [ `Wget | `Curl | `Cohttp | `Auto ]
(** Term for the [--downloader]/[-d] option. *)
val downloader_term : downloader_choice Cmdliner.Term.t
val downloader_info : Cmdliner.Arg.info
(** Instantiate the chosen backend ([`Auto] delegates to detection). *)
val create_downloader :
sw:Eio.Switch.t ->
env:Eio_unix.Stdenv.base ->
downloader_choice ->
(module DOWNLOADER)
end
(* Cmdliner term for [--downloader]/[-d]; defaults to [`Auto].
   BUGFIX: dropped the unused [choices] binding (warning 26) — the valid
   names are already carried by [Arg.enum]. *)
let downloader_term =
  let open Cmdliner in
  let doc = "Download tool to use. 'auto' detects available tools." in
  let docv = "TOOL" in
  Arg.(
    value
    & opt
        (enum
           [ ("wget", `Wget); ("curl", `Curl);
             ("cohttp", `Cohttp); ("auto", `Auto) ])
        `Auto
    & info [ "downloader"; "d" ] ~doc ~docv)
(* Map a CLI downloader choice onto a concrete backend constructor. *)
let create_downloader ~sw ~env choice =
  match choice with
  | `Wget -> Downloaders.wget ~sw ~env
  | `Curl -> Downloaders.curl ~sw ~env
  | `Cohttp -> Downloaders.cohttp ~sw ~env
  | `Auto -> Downloaders.create_default ~sw ~env
(* Example usage in an application *)
(* Example CLI entry point wiring the downloader choice into Toru.
   BUGFIX: [Cli.create_downloader] returns a first-class module VALUE, so
   it must be unpacked with [(val ...)] before [let module] can bind it.
   Also gave the sketch a terminating expression so it parses. *)
let main_term downloader_choice cache_dir base_url =
  Eio_main.run @@ fun env ->
  Eio.Switch.run @@ fun sw ->
  let module Downloader =
    (val Cli.create_downloader ~sw ~env downloader_choice)
  in
  let toru =
    Toru.create ~sw ~env ~downloader:(module Downloader)
      ~cache_path:cache_dir ~base_url ()
  in
  (* ... use toru ... *)
  ignore toru
Toru-DOI: DOI Resolution Library#
Overview#
Separate library for resolving DOIs to download URLs, designed to work seamlessly with Toru.
Core Design#
(** DOI resolution: maps a DOI to repository metadata and download URLs,
    suitable for feeding into Toru registries. *)
module Toru_doi : sig
type repository = Zenodo | Figshare | Dryad | Custom of string
(** Resolved DOI metadata: owning repository, downloadable files, and
    free-form metadata key/value pairs. *)
type doi_info = {
doi : string;
repository : repository;
files : file_info list;
metadata : (string * string) list;
}
and file_info = {
name : string;
size : int64 option;
download_url : string;
checksum : string option;
checksum_type : string option;
}
(** [resolve ~sw ~net doi] queries the owning repository's API. *)
val resolve :
sw:Eio.Switch.t ->
net:Eio.Net.t ->
string ->
(doi_info, string) result
(** [download_url info name] finds the URL for file [name], if listed. *)
val download_url : doi_info -> string -> string option
(** Convert resolved files into Toru registry entries. *)
val to_registry_entries : doi_info -> Toru.Registry.entry list
end
Repository-Specific Resolvers#
Zenodo Integration#
(** Zenodo record metadata via the records API.
    NOTE(review): [file_info] and [metadata] are not defined in this
    sketch — presumably shared with [Toru_doi]; confirm. *)
module Zenodo : sig
type record = {
id : int;
conceptdoi : string;
conceptrecid : int;
files : file_info list;
metadata : metadata;
}
(** [resolve_doi ~sw ~net doi] queries the Zenodo API for [doi]. *)
val resolve_doi : sw:Eio.Switch.t -> net:Eio.Net.t ->
string -> (record, string) result
(** [latest_version ~sw ~net doi] resolves a concept DOI to its newest
    record version. *)
val latest_version : sw:Eio.Switch.t -> net:Eio.Net.t ->
string -> (record, string) result
end
Figshare Integration#
(** Figshare article metadata via the v2 articles API.
    NOTE(review): [file_info] is not defined in this sketch — presumably
    shared with [Toru_doi]; confirm. *)
module Figshare : sig
type article = {
id : int;
title : string;
doi : string;
files : file_info list;
}
(** [resolve_doi ~sw ~net doi] queries the Figshare API for [doi]. *)
val resolve_doi : sw:Eio.Switch.t -> net:Eio.Net.t ->
string -> (article, string) result
end
DOI Resolution Workflow#
- Parse DOI: Extract repository type from DOI prefix
- API Query: Repository-specific API calls to get metadata
- Extract Files: Parse file listings from API responses
- Generate Registry: Convert to Toru registry format
- Cache Metadata: Store DOI resolution results locally
Integration with Toru#
(* Create registry from DOI *)
(* NOTE(review): illustrative fragment — [~registry:] is not part of the
   [Toru.create] signature sketched above (which has [?registry_file]),
   and the trailing [in] has no body; reconcile before adoption. *)
let doi_registry = Toru_doi.resolve ~sw ~net "10.5281/zenodo.1234567"
|> Result.map Toru_doi.to_registry_entries in
(* Use with Toru *)
let toru = Toru.create ~sw ~env
~base_url:"" (* Not used for DOI downloads *)
~cache_path:"~/.myapp/data"
~registry:(Result.get_ok doi_registry)
() in
let file_path = Toru.fetch toru ~filename:"data.csv" () in
API Endpoints Used#
- Zenodo: `https://zenodo.org/api/records/{id}`
- Figshare: `https://api.figshare.com/v2/articles/{id}`
- DataCite: `https://api.datacite.org/dois/{doi}` (for metadata)
- CrossRef: `https://api.crossref.org/works/{doi}` (fallback)
Features#
- Version Resolution: Get latest version or specific version
- Batch Processing: Resolve multiple DOIs concurrently
- Metadata Caching: Cache API responses to avoid rate limits
- Rate Limiting: Respect repository API rate limits
- Fallback Chain: Try multiple APIs if primary fails
Error Handling#
- DOI not found (404)
- Repository API rate limits (429)
- Invalid DOI format
- Repository-specific errors
- Network timeouts
Dependencies#
- `toru`: Core downloading functionality
- `eio`: Async I/O
- `cohttp-eio`: HTTP client
- `yojson`: JSON parsing for API responses
- `uri`: DOI and URL parsing
- `ptime`: Timestamp handling for caching
Registry Parser Test Cases#
Tessera-Manifests Compatibility Test#
The tessera-manifests repository provides excellent real-world test cases for registry parsing, containing geospatial data manifests in Pooch-compatible format.
Test Case Structure#
Embeddings Manifests (registry/embeddings/):
# Example: embeddings_2024_lon-10_lat50.txt
2024/grid_-5.05_50.05/grid_-5.05_50.05.npy d1f947c87017eebc8b98d6c3944eaea813ddcfb6ceafa96db0bb70675abd4f28
2024/grid_-5.05_50.05/grid_-5.05_50.05_scales.npy f8c3b2e7a1d4c5f6e9a2b3c4d5e6f708a9b0c1d2e3f4a5b6c7d8e9f0a1b2c3d4
Landmasks Manifests (registry/landmasks/):
# Example: landmasks_lon-10_lat50.txt
grid_-5.05_50.05.tiff 3f7d8e2a6b9c1e4f7a2d5c8b9e0f3a6b9c2e5d8f1a4c7e0d3f6a9c2e5d8f1a4c
grid_-5.00_50.05.tiff a8b5f2c9d6e3a0f7c4d1e8b5f2c9d6e3a0f7c4d1e8b5f2c9d6e3a0f7c4d1e8b5
Registry Parser Test Suite#
(* Registry-parser test suite against real tessera-manifests files. *)
module Test_tessera_manifests = struct
  (* Parse a real embeddings manifest and spot-check two entries. *)
  let test_embeddings_registry () =
    let manifest_url = "https://raw.githubusercontent.com/ucam-eo/tessera-manifests/main/registry/embeddings/embeddings_2024_lon-10_lat50.txt" in
    (* Test parsing from URL *)
    let registry = Registry.load_from_url manifest_url in
    (* Verify specific entries.  BUGFIX: the second expected digest
       contained non-hexadecimal characters (g..x), which is not a valid
       SHA256 digest; replaced with a valid 64-character hex string. *)
    let expected_hash1 = Hash.of_string "d1f947c87017eebc8b98d6c3944eaea813ddcfb6ceafa96db0bb70675abd4f28" in
    let expected_hash2 = Hash.of_string "f8c3b2e7a1d4c5f6e9a2b3c4d5e6f708a9b0c1d2e3f4a5b6c7d8e9f0a1b2c3d4" in
    let filenames = [
      ("2024/grid_-5.05_50.05/grid_-5.05_50.05.npy", expected_hash1);
      ("2024/grid_-5.05_50.05/grid_-5.05_50.05_scales.npy", expected_hash2);
    ] in
    List.iter
      (fun (filename, expected_hash) ->
        match Registry.find filename registry with
        | Some entry -> assert (Hash.equal (Registry.hash entry) expected_hash)
        | None -> failwith ("Entry not found: " ^ filename))
      filenames

  (* Check that TIFF entries carry 64-character SHA256 digests. *)
  let test_landmasks_registry () =
    let manifest_url = "https://raw.githubusercontent.com/ucam-eo/tessera-manifests/main/registry/landmasks/landmasks_lon-10_lat50.txt" in
    let registry = Registry.load_from_url manifest_url in
    (* Test TIFF file entries.  BUGFIX: the [SHA256] constructor lives in
       [Hash], so it must be qualified here. *)
    match Registry.find "grid_-5.05_50.05.tiff" registry with
    | Some entry ->
        let hash = Registry.hash entry in
        assert (Hash.algorithm hash = Hash.SHA256);
        assert (String.length (Hash.value hash) = 64)
    | None -> failwith "TIFF entry not found"

  let test_geographic_parsing () =
    (* Test parsing of geographic coordinates from filenames.
       NOTE(review): [parse_geographic_coords] is not defined anywhere in
       this document — assumed to be a test helper; confirm. *)
    let filenames = [
      "embeddings_2024_lon-180_lat-30.txt"; (* Negative longitude *)
      "embeddings_2024_lon100_lat20.txt";   (* Positive coordinates *)
      "landmasks_lon-10_lat50.txt";         (* Mixed signs *)
    ] in
    List.iter
      (fun filename ->
        let coords = parse_geographic_coords filename in
        assert (coords.longitude >= -180.0 && coords.longitude <= 180.0);
        assert (coords.latitude >= -90.0 && coords.latitude <= 90.0))
      filenames

  let test_large_manifest_parsing () =
    (* Test performance with large manifest files *)
    let large_manifest = "https://raw.githubusercontent.com/ucam-eo/tessera-manifests/main/registry/embeddings/embeddings_2024_lon-10_lat50.txt" in
    let start_time = Unix.gettimeofday () in
    let registry = Registry.load_from_url large_manifest in
    let parse_time = Unix.gettimeofday () -. start_time in
    (* Should parse reasonably quickly *)
    assert (parse_time < 5.0);
    (* Should contain expected number of entries.  BUGFIX: [registry] is an
       abstract [Registry.t], not a list — use [Registry.size]. *)
    assert (Registry.size registry > 100);
    (* All entries should have valid SHA256 hashes.  BUGFIX: qualified the
       [SHA256] constructor as [Hash.SHA256]. *)
    List.iter
      (fun entry ->
        let hash = Registry.hash entry in
        assert (Hash.algorithm hash = Hash.SHA256);
        assert (String.length (Hash.value hash) = 64))
      (Registry.entries registry)
end
Integration Test#
(* End-to-end check: build a Toru instance from a tessera manifest, fetch a
   tile, and verify it against the registry hash.
   BUGFIX: the inner [match] on the registry entry must be parenthesised —
   as originally written, the outer [| Error msg -> ...] arm was parsed as
   belonging to the inner match (whose arms are [Some]/[None]), a type
   error.  NOTE(review): [~registry_url] is not in the [Toru.create]
   signature sketched above (which has [?registry_file]); and [Eio.Path]
   has no [exists] — [is_file] is used instead; confirm both. *)
let test_tessera_integration () =
  Eio_main.run @@ fun env ->
  Eio.Switch.run @@ fun sw ->
  (* Create Toru instance for tessera manifests *)
  let base_url = "https://huggingface.co/datasets/tessera-research/tessera-tiles/resolve/main/" in
  let manifest_url = "https://raw.githubusercontent.com/ucam-eo/tessera-manifests/main/registry/embeddings/embeddings_2024_lon-10_lat50.txt" in
  let toru =
    Toru.create ~sw ~env
      ~base_url
      ~cache_path:"/tmp/tessera_test"
      ~registry_url:manifest_url
      ()
  in
  (* Test fetching a specific grid tile *)
  match Toru.fetch toru ~filename:"2024/grid_-5.05_50.05/grid_-5.05_50.05.npy" () with
  | Ok path -> (
      (* Verify file exists and hash matches *)
      assert (Eio.Path.is_file path);
      let registry = Toru.registry toru in
      match Registry.find "2024/grid_-5.05_50.05/grid_-5.05_50.05.npy" registry with
      | Some e -> assert (Hash.verify path (Registry.hash e))
      | None -> failwith "Registry entry missing")
  | Error msg -> failwith ("Download failed: " ^ msg)
Benefits as Test Case#
- Real-world data: Actual production manifests with geospatial data
- Scale testing: Large files with hundreds of entries
- Format validation: Pure Pooch-compatible format
- Geographic diversity: Tests coordinate parsing across globe
- Temporal diversity: Multiple years (2017-2024) of data
- File type variety:
.npy,_scales.npy,.tifffiles - Hash validation: All entries use SHA256 checksums
This provides comprehensive test coverage for the registry parsing functionality while using real data that demonstrates practical usage patterns.