Skip to content

API Reference

Auto-generated reference for the public API, rendered by mkdocstrings.

For task-oriented examples see the Usage guide.

Package

A unified dataset framework for mass spectrometry.

DatasetNotFoundError

Bases: Exception

Raised when the server returns 404 for a dataset.

DownloadError

Bases: Exception

Raised on network or server failures during download.

ExtractionError

Bases: DownloadError

Raised when a server-side extraction task fails.

Dataset dataclass

Dataset(dataset_id: str, dataset_name: str | None, cache_dir: Path, files: list[Path] = list())

Result object returned by download_dataset, download_repo_dataset, and download_hf_dataset.

Supports len(), indexing, and iteration over downloaded file paths.

RepoSource

Bases: str, Enum

Supported repository sources for dataset imports.

download_dataset

download_dataset(dataset_id: str, *, force_download: bool = False, show_progress: bool = True, max_workers: int = 4, filenames: list[str] | None = None, store_as: StoreFormat = 'mszx', output_dir: Path | None = None) -> Dataset

Download a dataset and return a Dataset pointing to local files.

Parameters:

Name Type Description Default
dataset_id str

Server-side dataset identifier (UUID).

required
force_download bool

Re-download parts even if they already exist on disk.

False
show_progress bool

Show a rich progress bar during download.

True
max_workers int

Maximum number of parallel downloads.

4
filenames list[str] | None

Optional list of filenames to include. When provided, the server returns a manifest containing only matching parts.

None
store_as StoreFormat

On-disk format for downloaded parts. Defaults to "mszx" (the raw archive shipped by the server). Set to "msz" to extract the inner MSZ, or "mzml" to decompress fully to mzML. Conversion is handled by mstransfer.

'mszx'
output_dir Path | None

Optional destination directory. When set, files are written directly here (no {dataset_id} subdir) and the cache root from get_dataset_dir is bypassed. Useful for one-off downloads outside the shared cache.

None
Source code in src/msdatasets/download.py
def download_dataset(
    dataset_id: str,
    *,
    force_download: bool = False,
    show_progress: bool = True,
    max_workers: int = 4,
    filenames: list[str] | None = None,
    store_as: StoreFormat = "mszx",
    output_dir: Path | None = None,
) -> Dataset:
    """Download a dataset and return a `Dataset` pointing to local files.

    Parameters
    ----------
    dataset_id:
        Server-side dataset identifier (UUID).
    force_download:
        Re-download parts even if they already exist on disk.
    show_progress:
        Show a ``rich`` progress bar during download.
    max_workers:
        Maximum number of parallel downloads.
    filenames:
        Optional list of filenames to include. When provided, the server
        returns a manifest containing only matching parts.
    store_as:
        On-disk format for downloaded parts.  Defaults to ``"mszx"`` (the
        raw archive shipped by the server).  Set to ``"msz"`` to extract
        the inner MSZ, or ``"mzml"`` to decompress fully to mzML.
        Conversion is handled by mstransfer.
    output_dir:
        Optional destination directory.  When set, files are written
        directly here (no ``{dataset_id}`` subdir) and the cache root
        from `get_dataset_dir` is bypassed.  Useful for one-off
        downloads outside the shared cache.

    Returns
    -------
    Dataset
        Points at the dataset directory.  ``files`` follows the order of
        the manifest's parts; a part whose target file is neither cached
        nor returned by the download step is left out.

    Notes
    -----
    The fetched manifest is written to ``manifest.json`` (UTF-8) in the
    dataset directory on every call.

    The manifest/extraction phase is driven with ``asyncio.run``, so this
    function cannot be called from a thread that already has a running
    event loop.
    """
    log.info("Downloading dataset %s", dataset_id)
    dataset_dir = output_dir if output_dir is not None else get_dataset_dir(dataset_id)
    dataset_dir.mkdir(parents=True, exist_ok=True)
    log.debug("Dataset directory: %s", dataset_dir)

    async def _fetch_and_extract() -> tuple[Manifest, list[Path], list[DatasetPart]]:
        # 10 s for connect/write/pool, but no read timeout.
        timeout = httpx.Timeout(10.0, read=None)
        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client:
            manifest_ = await fetch_manifest(
                dataset_id, filenames=filenames, client=client
            )

            # Persist manifest locally for offline inspection.  The encoding
            # is pinned to UTF-8 so that non-ASCII metadata is written the
            # same way on every platform instead of depending on the locale
            # encoding (which can fail or produce non-UTF-8 JSON).
            manifest_path = dataset_dir / "manifest.json"
            manifest_path.write_text(
                manifest_.model_dump_json(indent=2), encoding="utf-8"
            )

            cached: list[Path] = []
            to_download: list[DatasetPart] = []

            # Skip files that are already on disk, unless *force_download* is True.
            # Cache is keyed by the target on-disk filename, so switching
            # --store-as triggers a re-download rather than using a stale
            # artifact in a different format.
            for part in manifest_.parts:
                dest = dataset_dir / _target_filename(part.filename, store_as)
                if dest.exists() and not force_download:
                    log.debug("Cached, skipping: %s", dest.name)
                    cached.append(dest)
                else:
                    to_download.append(part)

            # For files that need to be downloaded,
            # ensure they are extracted and ready on the server.
            if to_download:
                log.info(
                    "Downloading %d/%d part(s)",
                    len(to_download),
                    manifest_.total_parts,
                )
                await ensure_all_extracted(client, to_download)
            else:
                log.info("All %d part(s) already cached", manifest_.total_parts)

            return manifest_, cached, to_download

    # Phase 1 (async): manifest lookup, cache check, server-side extraction.
    manifest, files, parts_to_download = asyncio.run(_fetch_and_extract())

    # Phase 2 (sync): download all ready files via mstransfer.
    if parts_to_download:
        base_url = get_api_url()
        requests = [
            DownloadRequest(
                url=f"{base_url}{part.download_url}",
                dest=dataset_dir / part.filename,
            )
            for part in parts_to_download
        ]

        progress_bar: Progress | None = None
        batch_progress: _RichBatchProgress | None = None
        if show_progress:
            progress_bar = Progress(
                TextColumn("[bold blue]{task.description}"),
                BarColumn(),
                DownloadColumn(),
                TransferSpeedColumn(),
            )
            batch_progress = _RichBatchProgress(progress_bar)

        # The progress bar is only entered as a context manager when it exists.
        ctx = progress_bar if progress_bar is not None else _NullContext()
        with ctx:
            downloaded = download_batch(
                requests,
                store_as=store_as,
                parallel=max_workers,
                progress=batch_progress,
            )
            files.extend(downloaded)

    # Ensure files are ordered by part_index.  `files` entries use the
    # target extension (set by mstransfer), so key the lookup by that name.
    file_map = {p.name: p for p in files}
    ordered_files = [
        file_map[target]
        for part in manifest.parts
        if (target := _target_filename(part.filename, store_as)) in file_map
    ]

    ds = Dataset(
        dataset_id=manifest.dataset_id,
        dataset_name=manifest.dataset_name,
        cache_dir=dataset_dir,
        files=ordered_files,
    )
    log.info("Dataset ready: %d file(s) in %s", len(ds), dataset_dir)
    return ds

download_repo_dataset

download_repo_dataset(source: RepoSource | str, accession: str, *, filenames: list[str] | None = None, force_download: bool = False, show_progress: bool = True, max_workers: int = 4, store_as: StoreFormat = 'mszx', output_dir: Path | None = None) -> Dataset

Trigger a repository import and download the resulting dataset.

Posts to /repositories/{source}/projects/{accession}/dataset to create a dataset from a PRIDE or MassIVE project. The endpoint is idempotent—calling it for an already-imported project returns the existing dataset and job statuses.

Parameters:

Name Type Description Default
source RepoSource | str

Repository source ("pride" or "massive").

required
accession str

Project accession (e.g. PXD075509 for PRIDE, MSV000078787 for MassIVE).

required
filenames list[str] | None

Optional list of specific filenames to import. When None, all files in the project are imported.

None
force_download bool

Re-download parts even if they already exist on disk.

False
show_progress bool

Show a rich progress bar during download.

True
max_workers int

Maximum number of parallel downloads.

4
store_as StoreFormat

On-disk format for downloaded parts. See download_dataset.

'mszx'
output_dir Path | None

Optional destination directory. See download_dataset.

None
Source code in src/msdatasets/download.py
def download_repo_dataset(
    source: RepoSource | str,
    accession: str,
    *,
    filenames: list[str] | None = None,
    force_download: bool = False,
    show_progress: bool = True,
    max_workers: int = 4,
    store_as: StoreFormat = "mszx",
    output_dir: Path | None = None,
) -> Dataset:
    """Import a repository project on the server, then download the result.

    The import is requested via
    ``/repositories/{source}/projects/{accession}/dataset``, which creates a
    dataset from a PRIDE or MassIVE project.  That endpoint is idempotent:
    asking again for a project that was already imported returns the
    existing dataset and job statuses.  The dataset id from the import
    result is then handed to `download_dataset`.

    Parameters
    ----------
    source:
        Repository source (``"pride"`` or ``"massive"``).  Converted with
        ``RepoSource(source)`` before use.
    accession:
        Project accession (e.g. ``PXD075509`` for PRIDE, ``MSV000078787`` for
        MassIVE).
    filenames:
        Optional list of specific filenames to import. When *None*, all
        files in the project are imported.  The same list is also passed
        on to `download_dataset`.
    force_download:
        Re-download parts even if they already exist on disk.
    show_progress:
        Show a ``rich`` progress bar during the import and the download.
    max_workers:
        Maximum number of parallel downloads.
    store_as:
        On-disk format for downloaded parts.  See `download_dataset`.
    output_dir:
        Optional destination directory.  See `download_dataset`.

    Returns
    -------
    Dataset
        Whatever `download_dataset` returns for the imported dataset id.
    """
    repo_source = RepoSource(source)
    stderr_console = Console(stderr=True)

    async def _run_import() -> str:
        # 10 s for connect/write/pool; the read timeout is disabled.
        http_timeout = httpx.Timeout(10.0, read=None)
        async with httpx.AsyncClient(
            follow_redirects=True, timeout=http_timeout
        ) as http:
            if not show_progress:
                outcome = await trigger_repo_import(
                    repo_source, accession, filenames=filenames, client=http
                )
                return outcome.dataset_id

            bar = Progress(
                TextColumn("[bold blue]{task.description}"),
                BarColumn(),
                DownloadColumn(),
                TransferSpeedColumn(),
                TimeRemainingColumn(),
                console=stderr_console,
                transient=False,
            )
            reporter = _RichImportProgress(bar)
            # Keep the bar live for exactly as long as the import call runs.
            with bar:
                outcome = await trigger_repo_import(
                    repo_source,
                    accession,
                    filenames=filenames,
                    client=http,
                    on_progress=reporter.on_progress,
                )
            return outcome.dataset_id

    imported_id = asyncio.run(_run_import())

    return download_dataset(
        imported_id,
        force_download=force_download,
        show_progress=show_progress,
        max_workers=max_workers,
        filenames=filenames,
        store_as=store_as,
        output_dir=output_dir,
    )

load_dataset

load_dataset(dataset_id: str, *, force_download: bool = False, show_progress: bool = True, max_workers: int = 4, load_annotations: list[AnnotationFormat] | None = None) -> MSCompressDataset

Download a dataset and return an MSCompressDataset.

Convenience wrapper around download_dataset that loads the downloaded files into an mscompress.datasets.torch.MSCompressDataset ready for iteration. Requires PyTorch to be installed.

If dataset_id matches the pattern <source>/<accession> (e.g. pride/PXD075509 or massive/MSV000078787), the repository import flow is used instead. A specific filename subset may be specified in square brackets: pride/PXD000001[file1.raw,file2.mzML].

If dataset_id matches hf/<owner>/<repo> (e.g. hf/myorg/proteomics-bench), files are pulled directly from the HuggingFace dataset repo. Requires the hf extra.

Parameters:

Name Type Description Default
dataset_id str

Server-side dataset identifier, or a repository specifier like pride/PXD075509, massive/MSV000078787, or hf/<owner>/<repo>[file1.mszx,file2.mszx].

required
force_download bool

Re-download parts even if they already exist on disk.

False
show_progress bool

Show a rich progress bar during download.

True
max_workers int

Maximum number of parallel downloads.

4
load_annotations list[AnnotationFormat] | None

Annotation formats to load alongside spectra (forwarded to mscompress.datasets.torch.MSCompressDataset). When set, the returned dataset's __getitem__ yields (mz, intensity, annotations_dict) instead of (mz, intensity).

None
Source code in src/msdatasets/download.py
def load_dataset(
    dataset_id: str,
    *,
    force_download: bool = False,
    show_progress: bool = True,
    max_workers: int = 4,
    load_annotations: list[AnnotationFormat] | None = None,
) -> MSCompressDataset:
    """Fetch a dataset and wrap it in an `MSCompressDataset`.

    *dataset_id* is interpreted in this order:

    1. ``hf/<owner>/<repo>`` (e.g. ``hf/myorg/proteomics-bench``): files are
       pulled directly from the HuggingFace dataset repo via
       `load_hf_dataset`.  Requires the ``hf`` extra.
    2. ``<source>/<accession>`` (e.g. ``pride/PXD075509`` or
       ``massive/MSV000078787``): the repository import flow
       (`load_repo_dataset`) is used.  A filename subset may be given in
       square brackets: ``pride/PXD000001[file1.raw,file2.mzML]``.
    3. Anything else is treated as a server-side dataset identifier and
       downloaded with `download_dataset`; the downloaded files are then
       loaded into an `mscompress.datasets.torch.MSCompressDataset`.

    Requires PyTorch to be installed.

    Parameters
    ----------
    dataset_id:
        Server-side dataset identifier, or a repository specifier like
        ``pride/PXD075509``, ``massive/MSV000078787``, or
        ``hf/<owner>/<repo>[file1.mszx,file2.mszx]``.
    force_download:
        Re-download parts even if they already exist on disk.
    show_progress:
        Show a ``rich`` progress bar during download.
    max_workers:
        Maximum number of parallel downloads.  Not forwarded for ``hf/``
        specifiers.
    load_annotations:
        Annotation formats to load alongside spectra (forwarded to
        ``mscompress.datasets.torch.MSCompressDataset``).  When set, the
        returned dataset's ``__getitem__`` yields
        ``(mz, intensity, annotations_dict)`` instead of ``(mz, intensity)``.
    """
    if (hf_target := _parse_hf_spec(dataset_id)) is not None:
        # Imported only on this path, so msdatasets.hf is not needed for the
        # other two routes.
        from msdatasets.hf import load_hf_dataset

        hf_repo, hf_files = hf_target
        return load_hf_dataset(
            hf_repo,
            filenames=hf_files,
            force_download=force_download,
            show_progress=show_progress,
            load_annotations=load_annotations,
        )

    if (repo_target := _parse_repo_spec(dataset_id)) is not None:
        repo_source, repo_accession, repo_files = repo_target
        return load_repo_dataset(
            repo_source,
            repo_accession,
            filenames=repo_files,
            force_download=force_download,
            show_progress=show_progress,
            max_workers=max_workers,
            load_annotations=load_annotations,
        )

    # Resolve the torch dataset class before downloading anything.
    torch_dataset_cls = _import_torch_dataset()

    downloaded = download_dataset(
        dataset_id,
        force_download=force_download,
        show_progress=show_progress,
        max_workers=max_workers,
    )
    return torch_dataset_cls(downloaded.cache_dir, load_annotations=load_annotations)

load_repo_dataset

load_repo_dataset(source: RepoSource | str, accession: str, *, filenames: list[str] | None = None, force_download: bool = False, show_progress: bool = True, max_workers: int = 4, load_annotations: list[AnnotationFormat] | None = None) -> MSCompressDataset

Trigger a repository import and return an MSCompressDataset once ready.

Convenience wrapper around download_repo_dataset that loads the downloaded files into an mscompress.datasets.torch.MSCompressDataset. Requires PyTorch to be installed.

load_annotations is forwarded to MSCompressDataset. When set, the dataset's __getitem__ returns (mz, intensity, annotations_dict) instead of just (mz, intensity).

Source code in src/msdatasets/download.py
def load_repo_dataset(
    source: RepoSource | str,
    accession: str,
    *,
    filenames: list[str] | None = None,
    force_download: bool = False,
    show_progress: bool = True,
    max_workers: int = 4,
    load_annotations: list[AnnotationFormat] | None = None,
) -> MSCompressDataset:
    """Import a repository project and return it as an `MSCompressDataset`.

    Thin wrapper: `download_repo_dataset` does the import and download, and
    the resulting directory is opened with
    `mscompress.datasets.torch.MSCompressDataset`.  Requires PyTorch to be
    installed.

    Parameters
    ----------
    source, accession, filenames, force_download, show_progress, max_workers:
        Passed unchanged to `download_repo_dataset`.
    load_annotations:
        Forwarded to ``MSCompressDataset``.  When set, the dataset's
        ``__getitem__`` returns ``(mz, intensity, annotations_dict)``
        instead of just ``(mz, intensity)``.
    """
    # Resolve the torch dataset class before starting the import/download.
    torch_dataset_cls = _import_torch_dataset()

    downloaded = download_repo_dataset(
        source,
        accession,
        filenames=filenames,
        force_download=force_download,
        show_progress=show_progress,
        max_workers=max_workers,
    )

    # NOTE(review): the dataset is opened on the whole directory rather than
    # on `downloaded.files`; confirm how MSCompressDataset treats other files
    # already present there (e.g. from an earlier call with other filenames).
    return torch_dataset_cls(downloaded.cache_dir, load_annotations=load_annotations)

download_hf_dataset

download_hf_dataset(repo_id: str, *, filenames: list[str] | None = None, revision: str | None = None, token: str | None = None, force_download: bool = False, show_progress: bool = True, output_dir: Path | None = None) -> Dataset

Download a HuggingFace dataset repo of MS files.

Parameters:

Name Type Description Default
repo_id str

HuggingFace dataset repo ID in owner/name form.

required
filenames list[str] | None

Optional list of specific filenames to fetch. When provided, the names are passed through as allow_patterns to snapshot_download.

None
revision str | None

Optional branch, tag, or commit. Defaults to the repo's default branch.

None
token str | None

Optional HF auth token. Falls back to HF_TOKEN and to the token stored by huggingface-cli login.

None
force_download bool

Re-download files even if HF's cache already has them.

False
show_progress bool

When False, sets HF_HUB_DISABLE_PROGRESS_BARS=1 for the call.

True
output_dir Path | None

Optional destination directory. When set, files land here directly (no hf/owner/repo nesting). Otherwise the shared cache is used.

None
Notes

--store-as conversion (mszx → msz / mzml) is not supported in this version. MSCompressDataset reads .mszx natively, so the PyTorch path works end-to-end without conversion.

Source code in src/msdatasets/hf.py
def download_hf_dataset(
    repo_id: str,
    *,
    filenames: list[str] | None = None,
    revision: str | None = None,
    token: str | None = None,
    force_download: bool = False,
    show_progress: bool = True,
    output_dir: Path | None = None,
) -> Dataset:
    """Fetch the MS files of a HuggingFace dataset repo to a local directory.

    Parameters
    ----------
    repo_id:
        HuggingFace dataset repo ID in ``owner/name`` form.
    filenames:
        Optional list of specific filenames to fetch. When provided, the names
        are passed through as ``allow_patterns`` to ``snapshot_download``;
        when omitted or empty, the module's default MS file patterns are used.
    revision:
        Optional branch, tag, or commit. Defaults to the repo's default branch.
    token:
        Optional HF auth token. Falls back to ``HF_TOKEN`` and to the token
        stored by ``huggingface-cli login``.
    force_download:
        Re-download files even if HF's cache already has them.
    show_progress:
        When False, sets ``HF_HUB_DISABLE_PROGRESS_BARS=1`` for the call.
    output_dir:
        Optional destination directory. When set, files land here directly
        (no ``hf/owner/repo`` nesting). Otherwise the shared cache is used.

    Returns
    -------
    Dataset
        ``dataset_id`` and ``dataset_name`` are both set to *repo_id*;
        ``files`` is the result of scanning the target directory for MS files.

    Raises
    ------
    DatasetNotFoundError
        The repo could not be found.
    DownloadError
        The revision could not be found, or the hub reported an HTTP error.

    Notes
    -----
    `--store-as` conversion (mszx → msz / mzml) is not supported in this
    version. ``MSCompressDataset`` reads ``.mszx`` natively, so the PyTorch
    path works end-to-end without conversion.
    """
    snapshot_download, RepoNotFound, RevNotFound, HfHubHTTPError = (  # noqa: N806
        _import_hf_hub()
    )

    target_dir = _hf_dataset_dir(repo_id, output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # An empty or missing filename list means "use the default MS patterns".
    patterns = filenames or _MS_PATTERNS
    log.info("Downloading HuggingFace dataset %s", repo_id)
    log.debug("HF dataset dir: %s", target_dir)
    log.debug("HF allow_patterns: %s", patterns)

    # Hub errors are translated into this package's exceptions.  The two
    # not-found handlers stay ahead of the generic HTTP handler
    # (presumably they are subclasses of it — confirm in _import_hf_hub).
    try:
        with _hf_progress_disabled(show_progress):
            snapshot_download(
                repo_id=repo_id,
                repo_type="dataset",
                local_dir=target_dir,
                revision=revision,
                token=token,
                force_download=force_download,
                allow_patterns=patterns,
            )
    except RepoNotFound as err:
        raise DatasetNotFoundError(f"HuggingFace dataset not found: {repo_id}") from err
    except RevNotFound as err:
        raise DownloadError(f"Revision not found: {revision} in {repo_id}") from err
    except HfHubHTTPError as err:
        raise DownloadError(f"HuggingFace download failed: {err}") from err

    ms_files = _collect_ms_files(target_dir)
    log.info("HF dataset ready: %d file(s) in %s", len(ms_files), target_dir)
    return Dataset(
        dataset_id=repo_id,
        dataset_name=repo_id,
        cache_dir=target_dir,
        files=ms_files,
    )

load_hf_dataset

load_hf_dataset(repo_id: str, *, filenames: list[str] | None = None, revision: str | None = None, token: str | None = None, force_download: bool = False, show_progress: bool = True, output_dir: Path | None = None, load_annotations: list[AnnotationFormat] | None = None) -> MSCompressDataset

Download an HF dataset repo and return an MSCompressDataset.

Convenience wrapper around download_hf_dataset. Requires PyTorch.

load_annotations is forwarded to MSCompressDataset. When set, the dataset's __getitem__ returns (mz, intensity, annotations_dict) instead of just (mz, intensity).

Source code in src/msdatasets/hf.py
def load_hf_dataset(
    repo_id: str,
    *,
    filenames: list[str] | None = None,
    revision: str | None = None,
    token: str | None = None,
    force_download: bool = False,
    show_progress: bool = True,
    output_dir: Path | None = None,
    load_annotations: list[AnnotationFormat] | None = None,
) -> MSCompressDataset:
    """Fetch a HuggingFace dataset repo and open it as an `MSCompressDataset`.

    Thin wrapper: `download_hf_dataset` does the download, and the resulting
    directory is opened with ``MSCompressDataset``.  Requires PyTorch.

    Parameters
    ----------
    repo_id, filenames, revision, token, force_download, show_progress, output_dir:
        Passed unchanged to `download_hf_dataset`.
    load_annotations:
        Forwarded to ``MSCompressDataset``.  When set, the dataset's
        ``__getitem__`` returns ``(mz, intensity, annotations_dict)``
        instead of just ``(mz, intensity)``.
    """
    # Function-level import, as in the original listing.
    from msdatasets.download import _import_torch_dataset

    # Resolve the torch dataset class before starting the download.
    torch_dataset_cls = _import_torch_dataset()

    downloaded = download_hf_dataset(
        repo_id,
        filenames=filenames,
        revision=revision,
        token=token,
        force_download=force_download,
        show_progress=show_progress,
        output_dir=output_dir,
    )
    return torch_dataset_cls(downloaded.cache_dir, load_annotations=load_annotations)

Configuration

msdatasets.config

Configuration: paths, URLs, and environment variables.

get_api_url

get_api_url() -> str

Return the base API URL.

Resolution: MS_API_URL env var, or the default production URL.

Source code in src/msdatasets/config.py
def get_api_url() -> str:
    """Return the base URL of the API.

    The ``MS_API_URL`` environment variable wins when it is set (its value
    is returned as-is, even if empty); otherwise the default production URL
    is used.
    """
    api_url = os.getenv("MS_API_URL", _DEFAULT_API_URL)
    log.debug("API URL: %s", api_url)
    return api_url

get_cache_dir

get_cache_dir() -> Path

Return the root cache directory for downloaded datasets.

Resolution order:

1. MS_DATASETS_CACHE env var
2. MS_HOME env var + /datasets
3. ~/.ms/datasets

Source code in src/msdatasets/config.py
def get_cache_dir() -> Path:
    """Return the root directory under which downloaded datasets are cached.

    Resolution order (an unset or empty variable is skipped):

    1. ``MS_DATASETS_CACHE`` env var
    2. ``MS_HOME`` env var + ``/datasets``
    3. ``~/.ms/datasets``

    The directory is only computed here, not created.
    """
    explicit_cache = os.environ.get("MS_DATASETS_CACHE")
    if explicit_cache:
        log.debug("Cache dir from MS_DATASETS_CACHE: %s", explicit_cache)
        return Path(explicit_cache)

    ms_home = os.environ.get("MS_HOME")
    if ms_home:
        home_cache = Path(ms_home) / "datasets"
        log.debug("Cache dir from MS_HOME: %s", home_cache)
        return home_cache

    default_cache = Path.home() / ".ms" / "datasets"
    log.debug("Cache dir (default): %s", default_cache)
    return default_cache

get_dataset_dir

get_dataset_dir(dataset_id: str) -> Path

Return the cache directory for a specific dataset.

Source code in src/msdatasets/config.py
def get_dataset_dir(dataset_id: str) -> Path:
    """Return the per-dataset directory: ``<cache root>/<dataset_id>``.

    The cache root comes from `get_cache_dir`; nothing is created on disk.
    """
    cache_root = get_cache_dir()
    return cache_root.joinpath(dataset_id)