API Reference

Auto-generated reference for the public API, rendered by mkdocstrings.

For task-oriented examples see the Usage guide.

Package

A unified dataset framework for mass spectrometry.

DatasetNotFoundError

Bases: Exception

Raised when the server returns 404 for a dataset.

DownloadError

Bases: Exception

Raised on network or server failures during download.

ExtractionError

Bases: DownloadError

Raised when a server-side extraction task fails.
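
A minimal sketch of how these exceptions compose, assuming they are re-exported from the msdatasets package root as this page implies; the dataset id is a placeholder, not a real server entry. Because ExtractionError subclasses DownloadError, a single DownloadError handler covers both transfer and extraction failures.

from msdatasets import DatasetNotFoundError, DownloadError, download_dataset

try:
    ds = download_dataset("<dataset-uuid>")  # placeholder identifier
except DatasetNotFoundError:
    print("no dataset with that id on the server")
except DownloadError as exc:
    # ExtractionError subclasses DownloadError, so this branch also
    # covers failed server-side extraction tasks.
    print(f"download failed: {exc}")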

Dataset dataclass

Dataset(dataset_id: str, dataset_name: str | None, cache_dir: Path, files: list[Path] = list())

Result object returned by load_dataset.

Supports len(), indexing, and iteration over downloaded file paths.
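
A hedged sketch of the container behaviour described above; the import path and dataset id are assumptions for illustration.

from msdatasets import download_dataset

ds = download_dataset("<dataset-uuid>")  # placeholder identifier

print(len(ds))       # number of downloaded file paths
first = ds[0]        # indexing returns a pathlib.Path
for path in ds:      # iteration yields each downloaded path
    print(path.name)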

RepoSource

Bases: str, Enum

Supported repository sources for dataset imports.
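
Because RepoSource subclasses both str and Enum, its members behave like plain strings. A small sketch, assuming the member values "pride" and "massive" documented under download_repo_dataset below:

from msdatasets import RepoSource

src = RepoSource("pride")  # coerces a raw string to the enum
assert src == "pride"      # str-backed members compare equal to their value
assert src.value == "pride"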

download_dataset

download_dataset(dataset_id: str, *, force_download: bool = False, show_progress: bool = True, max_workers: int = 4, filenames: list[str] | None = None, store_as: StoreFormat = 'mszx', output_dir: Path | None = None) -> Dataset

Download a dataset and return a Dataset pointing to local files.

Parameters:

dataset_id : str, required
    Server-side dataset identifier (UUID).
force_download : bool, default False
    Re-download parts even if they already exist on disk.
show_progress : bool, default True
    Show a rich progress bar during download.
max_workers : int, default 4
    Maximum number of parallel downloads.
filenames : list[str] | None, default None
    Optional list of filenames to include. When provided, the server returns a manifest containing only matching parts.
store_as : StoreFormat, default 'mszx'
    On-disk format for downloaded parts. Defaults to "mszx" (the raw archive shipped by the server). Set to "msz" to extract the inner MSZ, or "mzml" to decompress fully to mzML. Conversion is handled by mstransfer.
output_dir : Path | None, default None
    Optional destination directory. When set, files are written directly here (no {dataset_id} subdir) and the cache root from get_dataset_dir is bypassed. Useful for one-off downloads outside the shared cache.
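
A hedged usage sketch combining the options above; the dataset id and part filenames are placeholders, not real server entries.

from pathlib import Path
from msdatasets import download_dataset

ds = download_dataset(
    "<dataset-uuid>",                        # placeholder identifier
    filenames=["run01.mzML", "run02.mzML"],  # hypothetical part names
    store_as="mzml",                         # decompress fully to mzML
    output_dir=Path("./one_off"),            # bypass the shared cache
    max_workers=8,
)
print(ds.cache_dir, len(ds))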
Source code in src/msdatasets/download.py
def download_dataset(
    dataset_id: str,
    *,
    force_download: bool = False,
    show_progress: bool = True,
    max_workers: int = 4,
    filenames: list[str] | None = None,
    store_as: StoreFormat = "mszx",
    output_dir: Path | None = None,
) -> Dataset:
    """Download a dataset and return a `Dataset` pointing to local files.

    Parameters
    ----------
    dataset_id:
        Server-side dataset identifier (UUID).
    force_download:
        Re-download parts even if they already exist on disk.
    show_progress:
        Show a ``rich`` progress bar during download.
    max_workers:
        Maximum number of parallel downloads.
    filenames:
        Optional list of filenames to include. When provided, the server
        returns a manifest containing only matching parts.
    store_as:
        On-disk format for downloaded parts.  Defaults to ``"mszx"`` (the
        raw archive shipped by the server).  Set to ``"msz"`` to extract
        the inner MSZ, or ``"mzml"`` to decompress fully to mzML.
        Conversion is handled by mstransfer.
    output_dir:
        Optional destination directory.  When set, files are written
        directly here (no ``{dataset_id}`` subdir) and the cache root
        from `get_dataset_dir` is bypassed.  Useful for one-off
        downloads outside the shared cache.
    """
    log.info("Downloading dataset %s", dataset_id)
    dataset_dir = output_dir if output_dir is not None else get_dataset_dir(dataset_id)
    dataset_dir.mkdir(parents=True, exist_ok=True)
    log.debug("Dataset directory: %s", dataset_dir)

    async def _fetch_and_extract() -> tuple[Manifest, list[Path], list[DatasetPart]]:
        timeout = httpx.Timeout(10.0, read=None)
        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client:
            manifest_ = await fetch_manifest(
                dataset_id, filenames=filenames, client=client
            )

            # Persist manifest locally for offline inspection
            manifest_path = dataset_dir / "manifest.json"
            manifest_path.write_text(manifest_.model_dump_json(indent=2))

            cached: list[Path] = []
            to_download: list[DatasetPart] = []

            # Skip files that are already on disk, unless *force_download* is True.
            # Cache is keyed by the target on-disk filename, so switching
            # --store-as triggers a re-download rather than using a stale
            # artifact in a different format.
            for part in manifest_.parts:
                dest = dataset_dir / _target_filename(part.filename, store_as)
                if dest.exists() and not force_download:
                    log.debug("Cached, skipping: %s", dest.name)
                    cached.append(dest)
                else:
                    to_download.append(part)

            # For files that need to be downloaded,
            # ensure they are extracted and ready on the server.
            if to_download:
                log.info(
                    "Downloading %d/%d part(s)",
                    len(to_download),
                    manifest_.total_parts,
                )
                await ensure_all_extracted(client, to_download)
            else:
                log.info("All %d part(s) already cached", manifest_.total_parts)

            return manifest_, cached, to_download

    manifest, files, parts_to_download = asyncio.run(_fetch_and_extract())

    # Download all ready files via mstransfer.
    if parts_to_download:
        base_url = get_api_url()
        requests = [
            DownloadRequest(
                url=f"{base_url}{part.download_url}",
                dest=dataset_dir / part.filename,
            )
            for part in parts_to_download
        ]

        progress_bar: Progress | None = None
        batch_progress: _RichBatchProgress | None = None
        if show_progress:
            progress_bar = Progress(
                TextColumn("[bold blue]{task.description}"),
                BarColumn(),
                DownloadColumn(),
                TransferSpeedColumn(),
            )
            batch_progress = _RichBatchProgress(progress_bar)

        ctx = progress_bar if progress_bar is not None else _NullContext()
        with ctx:
            downloaded = download_batch(
                requests,
                store_as=store_as,
                parallel=max_workers,
                progress=batch_progress,
            )
            files.extend(downloaded)

    # Ensure files are ordered by part_index.  `files` entries use the
    # target extension (set by mstransfer), so key the lookup by that name.
    file_map = {p.name: p for p in files}
    ordered_files = [
        file_map[target]
        for part in manifest.parts
        if (target := _target_filename(part.filename, store_as)) in file_map
    ]

    ds = Dataset(
        dataset_id=manifest.dataset_id,
        dataset_name=manifest.dataset_name,
        cache_dir=dataset_dir,
        files=ordered_files,
    )
    log.info("Dataset ready: %d file(s) in %s", len(ds), dataset_dir)
    return ds

download_repo_dataset

download_repo_dataset(source: RepoSource | str, accession: str, *, filenames: list[str] | None = None, force_download: bool = False, show_progress: bool = True, max_workers: int = 4, store_as: StoreFormat = 'mszx', output_dir: Path | None = None) -> Dataset

Trigger a repository import and download the resulting dataset.

Posts to /repositories/{source}/projects/{accession}/dataset to create a dataset from a PRIDE or MassIVE project. The endpoint is idempotent—calling it for an already-imported project returns the existing dataset and job statuses.

Parameters:

source : RepoSource | str, required
    Repository source ("pride" or "massive").
accession : str, required
    Project accession (e.g. PXD075509 for PRIDE, MSV000078787 for MassIVE).
filenames : list[str] | None, default None
    Optional list of specific filenames to import. When None, all files in the project are imported.
force_download : bool, default False
    Re-download parts even if they already exist on disk.
show_progress : bool, default True
    Show a rich progress bar during download.
max_workers : int, default 4
    Maximum number of parallel downloads.
store_as : StoreFormat, default 'mszx'
    On-disk format for downloaded parts. See download_dataset.
output_dir : Path | None, default None
    Optional destination directory. See download_dataset.
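
A short sketch using the PRIDE accession from the docstring example; re-running it for an already-imported project is safe because the import endpoint is idempotent.

from msdatasets import download_repo_dataset

ds = download_repo_dataset(
    "pride",
    "PXD075509",     # accession from the example above
    store_as="msz",  # extract the inner MSZ on download
)
print(ds.dataset_name, len(ds))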
Source code in src/msdatasets/download.py
def download_repo_dataset(
    source: RepoSource | str,
    accession: str,
    *,
    filenames: list[str] | None = None,
    force_download: bool = False,
    show_progress: bool = True,
    max_workers: int = 4,
    store_as: StoreFormat = "mszx",
    output_dir: Path | None = None,
) -> Dataset:
    """Trigger a repository import and download the resulting dataset.

    Posts to ``/repositories/{source}/projects/{accession}/dataset`` to create
    a dataset from a PRIDE or MassIVE project.  The endpoint is
    idempotent—calling it for an already-imported project returns the
    existing dataset and job statuses.

    Parameters
    ----------
    source:
        Repository source (``"pride"`` or ``"massive"``).
    accession:
        Project accession (e.g. ``PXD075509`` for PRIDE, ``MSV000078787`` for
        MassIVE).
    filenames:
        Optional list of specific filenames to import. When *None*, all
        files in the project are imported.
    force_download:
        Re-download parts even if they already exist on disk.
    show_progress:
        Show a ``rich`` progress bar during download.
    max_workers:
        Maximum number of parallel downloads.
    store_as:
        On-disk format for downloaded parts.  See `download_dataset`.
    output_dir:
        Optional destination directory.  See `download_dataset`.
    """
    source = RepoSource(source)
    console = Console(stderr=True)

    async def _import() -> str:
        timeout = httpx.Timeout(10.0, read=None)
        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client:
            if show_progress:
                with console.status(
                    f"[bold blue]{source.value} import {accession}: pending…",
                    spinner="dots",
                ) as spinner:

                    def _on_status(file_name: str, status: RepoImportStatus) -> None:
                        label = _STATUS_LABELS.get(status, status.value)
                        spinner.update(f"[bold blue]{file_name}: {label}…")

                    result = await trigger_repo_import(
                        source,
                        accession,
                        filenames=filenames,
                        client=client,
                        on_status=_on_status,
                    )
            else:
                result = await trigger_repo_import(
                    source, accession, filenames=filenames, client=client
                )
            return result.dataset_id

    dataset_id = asyncio.run(_import())

    return download_dataset(
        dataset_id,
        force_download=force_download,
        show_progress=show_progress,
        max_workers=max_workers,
        filenames=filenames,
        store_as=store_as,
        output_dir=output_dir,
    )

load_dataset

load_dataset(dataset_id: str, *, force_download: bool = False, show_progress: bool = True, max_workers: int = 4) -> MSCompressDataset

Download a dataset and return an MSCompressDataset.

Convenience wrapper around download_dataset that loads the downloaded files into an mscompress.datasets.torch.MSCompressDataset ready for iteration. Requires PyTorch to be installed.

If dataset_id matches the pattern {source}/<accession> (e.g. pride/PXD075509 or massive/MSV000078787), the repository import flow is used instead. A specific filename subset may be specified in square brackets: pride/PXD000001[file1.raw,file2.mzML].

Parameters:

dataset_id : str, required
    Server-side dataset identifier, or a repository specifier like pride/PXD075509 or massive/MSV000078787.
force_download : bool, default False
    Re-download parts even if they already exist on disk.
show_progress : bool, default True
    Show a rich progress bar during download.
max_workers : int, default 4
    Maximum number of parallel downloads.
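
A sketch of both invocation forms; PyTorch must be installed, and the plain id is a placeholder. The repository specifier with a filename subset follows the bracket pattern described above.

from msdatasets import load_dataset

# Plain server-side identifier (placeholder shown here).
torch_ds = load_dataset("<dataset-uuid>")

# Repository specifier with a filename subset, per the pattern above.
torch_ds = load_dataset("pride/PXD000001[file1.raw,file2.mzML]")

for sample in torch_ds:  # MSCompressDataset is ready for iteration
    break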
Source code in src/msdatasets/download.py
def load_dataset(
    dataset_id: str,
    *,
    force_download: bool = False,
    show_progress: bool = True,
    max_workers: int = 4,
) -> MSCompressDataset:
    """Download a dataset and return an `MSCompressDataset`.

    Convenience wrapper around `download_dataset` that loads the
    downloaded files into an `mscompress.datasets.torch.MSCompressDataset`
    ready for iteration.  Requires PyTorch to be installed.

    If *dataset_id* matches the pattern ``{source}/<accession>`` (e.g.
    ``pride/PXD075509`` or ``massive/MSV000078787``), the repository import
    flow is used instead.  A specific filename subset may be specified in
    square brackets: ``pride/PXD000001[file1.raw,file2.mzML]``.

    Parameters
    ----------
    dataset_id:
        Server-side dataset identifier, or a repository specifier like
        ``pride/PXD075509`` or ``massive/MSV000078787``.
    force_download:
        Re-download parts even if they already exist on disk.
    show_progress:
        Show a ``rich`` progress bar during download.
    max_workers:
        Maximum number of parallel downloads.
    """
    repo_spec = _parse_repo_spec(dataset_id)
    if repo_spec is not None:
        source, accession, filenames = repo_spec
        return load_repo_dataset(
            source,
            accession,
            filenames=filenames,
            force_download=force_download,
            show_progress=show_progress,
            max_workers=max_workers,
        )

    dataset_cls = _import_torch_dataset()

    ds = download_dataset(
        dataset_id,
        force_download=force_download,
        show_progress=show_progress,
        max_workers=max_workers,
    )
    return dataset_cls(ds.cache_dir)

load_repo_dataset

load_repo_dataset(source: RepoSource | str, accession: str, *, filenames: list[str] | None = None, force_download: bool = False, show_progress: bool = True, max_workers: int = 4) -> MSCompressDataset

Trigger a repository import and return an MSCompressDataset once ready.

Convenience wrapper around download_repo_dataset that loads the downloaded files into an mscompress.datasets.torch.MSCompressDataset. Requires PyTorch to be installed.
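
A minimal sketch, equivalent to download_repo_dataset followed by loading the cache directory into the torch dataset (PyTorch required):

from msdatasets import load_repo_dataset

torch_ds = load_repo_dataset("massive", "MSV000078787")
for sample in torch_ds:  # iterate over the loaded dataset
    break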

Source code in src/msdatasets/download.py
def load_repo_dataset(
    source: RepoSource | str,
    accession: str,
    *,
    filenames: list[str] | None = None,
    force_download: bool = False,
    show_progress: bool = True,
    max_workers: int = 4,
) -> MSCompressDataset:
    """Trigger a repository import and return an `MSCompressDataset` once ready.

    Convenience wrapper around `download_repo_dataset` that loads the
    downloaded files into an `mscompress.datasets.torch.MSCompressDataset`.
    Requires PyTorch to be installed.
    """
    dataset_cls = _import_torch_dataset()
    ds = download_repo_dataset(
        source,
        accession,
        filenames=filenames,
        force_download=force_download,
        show_progress=show_progress,
        max_workers=max_workers,
    )
    return dataset_cls(ds.cache_dir)

Configuration

msdatasets.config

Configuration: paths, URLs, and environment variables.

get_api_url

get_api_url() -> str

Return the base API URL.

Resolution: MS_API_URL env var, or the default production URL.
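
A quick sketch of the resolution rule; the local URL is a hypothetical override.

import os
from msdatasets.config import get_api_url

os.environ["MS_API_URL"] = "http://localhost:8000"  # hypothetical override
assert get_api_url() == "http://localhost:8000"

del os.environ["MS_API_URL"]
default_url = get_api_url()  # falls back to the default production URL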

Source code in src/msdatasets/config.py
def get_api_url() -> str:
    """Return the base API URL.

    Resolution: ``MS_API_URL`` env var, or the default production URL.
    """
    url = os.environ.get("MS_API_URL", _DEFAULT_API_URL)
    log.debug("API URL: %s", url)
    return url

get_cache_dir

get_cache_dir() -> Path

Return the root cache directory for downloaded datasets.

Resolution order:

1. MS_DATASETS_CACHE env var
2. MS_HOME env var + /datasets
3. ~/.ms/datasets
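
A small sketch walking the three branches with hypothetical paths, mirroring the source below:

import os
from pathlib import Path
from msdatasets.config import get_cache_dir

os.environ["MS_DATASETS_CACHE"] = "/tmp/ms-cache"   # hypothetical path
assert get_cache_dir() == Path("/tmp/ms-cache")     # 1. explicit cache dir wins

del os.environ["MS_DATASETS_CACHE"]
os.environ["MS_HOME"] = "/opt/ms"                   # hypothetical path
assert get_cache_dir() == Path("/opt/ms/datasets")  # 2. MS_HOME + /datasets

del os.environ["MS_HOME"]
assert get_cache_dir() == Path.home() / ".ms" / "datasets"  # 3. default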

Source code in src/msdatasets/config.py
def get_cache_dir() -> Path:
    """Return the root cache directory for downloaded datasets.

    Resolution order:
    1. ``MS_DATASETS_CACHE`` env var
    2. ``MS_HOME`` env var + ``/datasets``
    3. ``~/.ms/datasets``
    """
    if env := os.environ.get("MS_DATASETS_CACHE"):
        log.debug("Cache dir from MS_DATASETS_CACHE: %s", env)
        return Path(env)
    if env := os.environ.get("MS_HOME"):
        path = Path(env) / "datasets"
        log.debug("Cache dir from MS_HOME: %s", path)
        return path
    path = Path.home() / ".ms" / "datasets"
    log.debug("Cache dir (default): %s", path)
    return path

get_dataset_dir

get_dataset_dir(dataset_id: str) -> Path

Return the cache directory for a specific dataset.

Source code in src/msdatasets/config.py
def get_dataset_dir(dataset_id: str) -> Path:
    """Return the cache directory for a specific dataset."""
    return get_cache_dir() / dataset_id