MlFront_Cache is a framework for transient caches and slowly varying data.

Scenarios

Downloading datasets

We want the dataset available in a long-lived folder. Typically the dataset is versioned so when a new dataset is available the new version should go into another long-lived folder. Eventually, when the original dataset is no longer in use, you would want to delete the original dataset.

Variation 1: The auto-updating dataset may instead be an auto-updating installation. Replace "dataset archive" with "installation archive", and the scenario is the same.

With MlFront_Cache you would:

  • Store the compressed archives (ex. zip files) with transient cache_ops.
  • Store the unzipped datasets with long-lived data_ops.

An inefficient sequence of steps could be:

1. Download a compressed archive of the dataset into the transient cache region.
2. Unzip the dataset archive into the data region.

But that is inefficient because, when the transient cache is cleared, the first step is repeated even when the dataset is already available in the data region.

Instead we'll:

1. Check if the dataset is in the data region. If so we can stop.
2. Otherwise (ie. a "cache miss" in data_ops) we download the zip file into the cache region and then unzip that zip file.

To do that with MlFront_Cache you would:

   let rec unzip_maybe_download
       ~cache_ops:(module CacheOps : MetaOps.S)
       ~data_ops:(module DataOps : MetaOps.S)
       ~category_prefix ~key
       ~downloadurl =
     unzip
       ~data_ops:(module DataOps : MetaOps.S)
       ~category:(category_prefix ^ "dir") ~key
       ~zipfile_f:(fun () ->
         download
           ~cache_ops:(module CacheOps : MetaOps.S)
           ~category:(category_prefix ^ "zip") ~key
           ~downloadurl)
   and download
      ~cache_ops:(module CacheOps : MetaOps.S)
      ~category ~key
      ~downloadurl =
    (* Downloads use CacheOps since they are transient. *)
    CacheOps.cache_file ~category ~key
      ~cache_hit:(fun ~file_for_upsert:_ _downloaded_file ->
        Ok `Keep)
      ~cache_miss:(fun ~file_for_upsert ->
        MlFront_Errors.ExitHandler.proc
          ~problem:(fun () ->
            "Unsuccessful download of " ^ downloadurl)
          (fun () ->
            (* Use whatever tool you like to download the file.
               For DkCoder you may use the following: *)
            let promise =
              DkNet_Std.Http.download_url
                ~checksum:(`SHA_256 "1234567890")
                ~destination:file_for_upsert
                (Tr1Uri_Std.Uri.parse downloadurl)
            in
            Lwt_main.run promise;
            Ok ());
        Ok `Upsert)
      ()
   and unzip
      ~data_ops:(module DataOps : MetaOps.S)
      ~category ~key
      ~zipfile_f =
    let ( let* ) = Result.bind in

    (* Unzipping uses DataOps since they are long-lived. *)
    DataOps.cache_dir ~category ~key
      ~cache_hit:(fun ~dir_for_upsert:_ _unzipped_dir ->
        Ok `Keep)
      ~cache_miss:(fun ~dir_for_upsert ->
        let* zipfile = zipfile_f () in
        MlFront_Errors.ExitHandler.proc
          ~problem:(fun () ->
            Format.asprintf "Unsuccessful unzip of %a into %a"
              Fpath.pp zipfile
              Fpath.pp dir_for_upsert)
          (fun () ->
            (* Use whatever tool you like to extract the dataset. *)
            MlFront_ZipFile.ZipFile.extract_exn
              ~srczip:(Fpath.to_string zipfile)
              ~destdir:(Fpath.to_string dir_for_upsert)
              ();
            Ok ());
        Ok `Upsert)
      ()

   (* Get the datasets *)
   let _ =
      Result.get_ok @@
      unzip_maybe_download (* ... *)
        ~category_prefix:"my-dataset-"
        ~downloadurl:"https://mydata.example.com/dataset-2025-01-01.zip"
module MetaDb : sig ... end

MetaDb records details about long-lived and transient data.

module MetaOps : sig ... end

MetaOps are operations that can be performed on the metadata.

module InternalUse : sig ... end