API Reference¶

Auto-generated reference for the public API, rendered by mkdocstrings.

For task-oriented examples see the Usage guide.

Package¶

A unified dataset framework for mass spectrometry.

DatasetNotFoundError ¶

Bases: Exception

Raised when the server returns 404 for a dataset.

DownloadError ¶

Bases: Exception

Raised on network or server failures during download.

ExtractionError ¶

Bases: DownloadError

Raised when a server-side extraction task fails.

Dataset `dataclass` ¶

Dataset(dataset_id: str, dataset_name: str | None, cache_dir: Path, files: list[Path] = list())

Result object returned by load_dataset.

Supports len(), indexing, and iteration over downloaded file paths.

RepoSource ¶

Bases: str, Enum

Supported repository sources for dataset imports.

download_dataset ¶

download_dataset(dataset_id: str, *, force_download: bool = False, show_progress: bool = True, max_workers: int = 4, filenames: list[str] | None = None, store_as: StoreFormat = 'mszx', output_dir: Path | None = None) -> Dataset

Download a dataset and return a Dataset pointing to local files.

Parameters:

Name	Type	Description	Default
`dataset_id`	`str`	Server-side dataset identifier (UUID).	required
`force_download`	`bool`	Re-download parts even if they already exist on disk.	`False`
`show_progress`	`bool`	Show a `rich` progress bar during download.	`True`
`max_workers`	`int`	Maximum number of parallel downloads.	`4`
`filenames`	`list[str] \| None`	Optional list of filenames to include. When provided, the server returns a manifest containing only matching parts.	`None`
`store_as`	`StoreFormat`	On-disk format for downloaded parts. Defaults to `"mszx"` (the raw archive shipped by the server). Set to `"msz"` to extract the inner MSZ, or `"mzml"` to decompress fully to mzML. Conversion is handled by mstransfer.	`'mszx'`
`output_dir`	`Path \| None`	Optional destination directory. When set, files are written directly here (no `{dataset_id}` subdir) and the cache root from `get_dataset_dir` is bypassed. Useful for one-off downloads outside the shared cache.	`None`

Source code in src/msdatasets/download.py

def download_dataset(
    dataset_id: str,
    *,
    force_download: bool = False,
    show_progress: bool = True,
    max_workers: int = 4,
    filenames: list[str] | None = None,
    store_as: StoreFormat = "mszx",
    output_dir: Path | None = None,
) -> Dataset:
    """Download a dataset and return a `Dataset` pointing to local files.

    The manifest is fetched first and saved as ``manifest.json`` in the
    target directory.  Parts whose target file already exists are reused
    unless *force_download* is set; the remaining parts are extracted
    server-side and then downloaded as one batch.

    Parameters
    ----------
    dataset_id:
        Server-side dataset identifier (UUID).
    force_download:
        Re-download parts even if they already exist on disk.
    show_progress:
        Show a ``rich`` progress bar during download.
    max_workers:
        Maximum number of parallel downloads.
    filenames:
        Optional list of filenames to include. When provided, the server
        returns a manifest containing only matching parts.
    store_as:
        On-disk format for downloaded parts.  Defaults to ``"mszx"`` (the
        raw archive shipped by the server).  Set to ``"msz"`` to extract
        the inner MSZ, or ``"mzml"`` to decompress fully to mzML.
        Conversion is handled by mstransfer.
    output_dir:
        Optional destination directory.  When set, files are written
        directly here (no ``{dataset_id}`` subdir) and the cache root
        from `get_dataset_dir` is bypassed.  Useful for one-off
        downloads outside the shared cache.

    Returns
    -------
    Dataset
        Carries the manifest's ``dataset_id`` and ``dataset_name``, the
        directory the files live in, and the local file paths listed in
        the same order as the manifest's parts.
    """
    log.info("Downloading dataset %s", dataset_id)
    dataset_dir = output_dir if output_dir is not None else get_dataset_dir(dataset_id)
    dataset_dir.mkdir(parents=True, exist_ok=True)
    log.debug("Dataset directory: %s", dataset_dir)

    async def _fetch_and_extract() -> tuple[Manifest, list[Path], list[DatasetPart]]:
        # 10 s for connect/write/pool, but no read timeout.
        timeout = httpx.Timeout(10.0, read=None)
        async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client:
            manifest_ = await fetch_manifest(
                dataset_id, filenames=filenames, client=client
            )

            # Persist manifest locally for offline inspection.  The encoding
            # is pinned to UTF-8 so the file is valid JSON regardless of the
            # platform's locale encoding (dataset names and filenames may
            # contain non-ASCII characters).
            manifest_path = dataset_dir / "manifest.json"
            manifest_path.write_text(
                manifest_.model_dump_json(indent=2), encoding="utf-8"
            )

            cached: list[Path] = []
            to_download: list[DatasetPart] = []

            # Skip files that are already on disk, unless *force_download* is True.
            # Cache is keyed by the target on-disk filename, so switching
            # --store-as triggers a re-download rather than using a stale
            # artifact in a different format.
            for part in manifest_.parts:
                dest = dataset_dir / _target_filename(part.filename, store_as)
                if dest.exists() and not force_download:
                    log.debug("Cached, skipping: %s", dest.name)
                    cached.append(dest)
                else:
                    to_download.append(part)

            # For files that need to be downloaded,
            # ensure they are extracted and ready on the server.
            if to_download:
                log.info(
                    "Downloading %d/%d part(s)",
                    len(to_download),
                    manifest_.total_parts,
                )
                await ensure_all_extracted(client, to_download)
            else:
                log.info("All %d part(s) already cached", manifest_.total_parts)

            return manifest_, cached, to_download

    # `files` starts as the cached paths and is extended with fresh downloads.
    manifest, files, parts_to_download = asyncio.run(_fetch_and_extract())

    # Download all ready files via mstransfer.
    if parts_to_download:
        base_url = get_api_url()
        requests = [
            DownloadRequest(
                url=f"{base_url}{part.download_url}",
                dest=dataset_dir / part.filename,
            )
            for part in parts_to_download
        ]

        progress_bar: Progress | None = None
        batch_progress: _RichBatchProgress | None = None
        if show_progress:
            progress_bar = Progress(
                TextColumn("[bold blue]{task.description}"),
                BarColumn(),
                DownloadColumn(),
                TransferSpeedColumn(),
            )
            batch_progress = _RichBatchProgress(progress_bar)

        # Use the progress bar as the context manager when enabled, otherwise
        # a stand-in so the download call below is written only once.
        ctx = progress_bar if progress_bar is not None else _NullContext()
        with ctx:
            downloaded = download_batch(
                requests,
                store_as=store_as,
                parallel=max_workers,
                progress=batch_progress,
            )
            files.extend(downloaded)

    # Ensure files follow the manifest's part order.  `files` entries use the
    # target extension (set by mstransfer), so key the lookup by that name.
    # Parts with no matching local file are left out of the result.
    file_map = {p.name: p for p in files}
    ordered_files = [
        file_map[target]
        for part in manifest.parts
        if (target := _target_filename(part.filename, store_as)) in file_map
    ]

    ds = Dataset(
        dataset_id=manifest.dataset_id,
        dataset_name=manifest.dataset_name,
        cache_dir=dataset_dir,
        files=ordered_files,
    )
    log.info("Dataset ready: %d file(s) in %s", len(ds), dataset_dir)
    return ds

download_repo_dataset ¶

download_repo_dataset(source: RepoSource | str, accession: str, *, filenames: list[str] | None = None, force_download: bool = False, show_progress: bool = True, max_workers: int = 4, store_as: StoreFormat = 'mszx', output_dir: Path | None = None) -> Dataset

Trigger a repository import and download the resulting dataset.

Posts to /repositories/{source}/projects/{accession}/dataset to create a dataset from a PRIDE or MassIVE project. The endpoint is idempotent—calling it for an already-imported project returns the existing dataset and job statuses.

Parameters:

Name	Type	Description	Default
`source`	`RepoSource \| str`	Repository source (`"pride"` or `"massive"`).	required
`accession`	`str`	Project accession (e.g. `PXD075509` for PRIDE, `MSV000078787` for MassIVE).	required
`filenames`	`list[str] \| None`	Optional list of specific filenames to import. When None, all files in the project are imported.	`None`
`force_download`	`bool`	Re-download parts even if they already exist on disk.	`False`
`show_progress`	`bool`	Show a `rich` progress bar during download.	`True`
`max_workers`	`int`	Maximum number of parallel downloads.	`4`
`store_as`	`StoreFormat`	On-disk format for downloaded parts. See `download_dataset`.	`'mszx'`
`output_dir`	`Path \| None`	Optional destination directory. See `download_dataset`.	`None`

Source code in src/msdatasets/download.py

def download_repo_dataset(
    source: RepoSource | str,
    accession: str,
    *,
    filenames: list[str] | None = None,
    force_download: bool = False,
    show_progress: bool = True,
    max_workers: int = 4,
    store_as: StoreFormat = "mszx",
    output_dir: Path | None = None,
) -> Dataset:
    """Import a repository project server-side, then download the dataset.

    A POST to ``/repositories/{source}/projects/{accession}/dataset`` creates
    a dataset from a PRIDE or MassIVE project.  The endpoint is idempotent:
    repeating the call for a project that was already imported returns the
    existing dataset and job statuses.  The resulting dataset id is then
    handed to `download_dataset`.

    Parameters
    ----------
    source:
        Repository source (``"pride"`` or ``"massive"``).
    accession:
        Project accession (e.g. ``PXD075509`` for PRIDE, ``MSV000078787`` for
        MassIVE).
    filenames:
        Optional list of specific filenames to import. When *None*, all
        files in the project are imported.
    force_download:
        Re-download parts even if they already exist on disk.
    show_progress:
        Show a ``rich`` progress bar during download.
    max_workers:
        Maximum number of parallel downloads.
    store_as:
        On-disk format for downloaded parts.  See `download_dataset`.
    output_dir:
        Optional destination directory.  See `download_dataset`.
    """
    # Normalise a plain string into the enum before anything else runs.
    repo = RepoSource(source)
    status_console = Console(stderr=True)

    async def _run_import() -> str:
        http_timeout = httpx.Timeout(10.0, read=None)
        async with httpx.AsyncClient(
            follow_redirects=True, timeout=http_timeout
        ) as http:
            if not show_progress:
                # Quiet mode: no spinner and no status callback.
                outcome = await trigger_repo_import(
                    repo, accession, filenames=filenames, client=http
                )
            else:
                banner = f"[bold blue]{repo.value} import {accession}: pending…"
                with status_console.status(banner, spinner="dots") as live:

                    def _report(file_name: str, status: RepoImportStatus) -> None:
                        # Fall back to the raw enum value for unlabelled statuses.
                        text = _STATUS_LABELS.get(status, status.value)
                        live.update(f"[bold blue]{file_name}: {text}…")

                    outcome = await trigger_repo_import(
                        repo,
                        accession,
                        filenames=filenames,
                        client=http,
                        on_status=_report,
                    )
            return outcome.dataset_id

    imported_id = asyncio.run(_run_import())

    return download_dataset(
        imported_id,
        force_download=force_download,
        show_progress=show_progress,
        max_workers=max_workers,
        filenames=filenames,
        store_as=store_as,
        output_dir=output_dir,
    )

load_dataset ¶

load_dataset(dataset_id: str, *, force_download: bool = False, show_progress: bool = True, max_workers: int = 4) -> MSCompressDataset

Download a dataset and return an MSCompressDataset.

Convenience wrapper around download_dataset that loads the downloaded files into an mscompress.datasets.torch.MSCompressDataset ready for iteration. Requires PyTorch to be installed.

If dataset_id matches the pattern {source}/{accession} (e.g. pride/PXD075509 or massive/MSV000078787), the repository import flow is used instead. A specific filename subset may be specified in square brackets: pride/PXD000001[file1.raw,file2.mzML].

Parameters:

Name	Type	Description	Default
`dataset_id`	`str`	Server-side dataset identifier, or a repository specifier like `pride/PXD075509` or `massive/MSV000078787`.	required
`force_download`	`bool`	Re-download parts even if they already exist on disk.	`False`
`show_progress`	`bool`	Show a `rich` progress bar during download.	`True`
`max_workers`	`int`	Maximum number of parallel downloads.	`4`

Source code in src/msdatasets/download.py

def load_dataset(
    dataset_id: str,
    *,
    force_download: bool = False,
    show_progress: bool = True,
    max_workers: int = 4,
) -> MSCompressDataset:
    """Fetch a dataset and wrap it in an `MSCompressDataset`.

    Thin wrapper over `download_dataset`: the downloaded files are loaded
    into an `mscompress.datasets.torch.MSCompressDataset` ready for
    iteration.  Requires PyTorch to be installed.

    When *dataset_id* has the form ``{source}/{accession}`` (e.g.
    ``pride/PXD075509`` or ``massive/MSV000078787``), the call is routed
    through the repository import flow instead.  A filename subset can be
    given in square brackets: ``pride/PXD000001[file1.raw,file2.mzML]``.

    Parameters
    ----------
    dataset_id:
        Server-side dataset identifier, or a repository specifier like
        ``pride/PXD075509`` or ``massive/MSV000078787``.
    force_download:
        Re-download parts even if they already exist on disk.
    show_progress:
        Show a ``rich`` progress bar during download.
    max_workers:
        Maximum number of parallel downloads.
    """
    parsed = _parse_repo_spec(dataset_id)

    if parsed is None:
        # Plain server-side identifier: download it directly.
        torch_dataset_cls = _import_torch_dataset()
        local = download_dataset(
            dataset_id,
            force_download=force_download,
            show_progress=show_progress,
            max_workers=max_workers,
        )
        return torch_dataset_cls(local.cache_dir)

    # Repository specifier: delegate to the import-and-load flow.
    repo_source, repo_accession, repo_filenames = parsed
    return load_repo_dataset(
        repo_source,
        repo_accession,
        filenames=repo_filenames,
        force_download=force_download,
        show_progress=show_progress,
        max_workers=max_workers,
    )

load_repo_dataset ¶

load_repo_dataset(source: RepoSource | str, accession: str, *, filenames: list[str] | None = None, force_download: bool = False, show_progress: bool = True, max_workers: int = 4) -> MSCompressDataset

Trigger a repository import and return an MSCompressDataset once ready.

Convenience wrapper around download_repo_dataset that loads the downloaded files into an mscompress.datasets.torch.MSCompressDataset. Requires PyTorch to be installed.

Source code in src/msdatasets/download.py

def load_repo_dataset(
    source: RepoSource | str,
    accession: str,
    *,
    filenames: list[str] | None = None,
    force_download: bool = False,
    show_progress: bool = True,
    max_workers: int = 4,
) -> MSCompressDataset:
    """Import a repository project and return it as an `MSCompressDataset`.

    Thin wrapper over `download_repo_dataset`: the directory holding the
    downloaded files is passed to
    `mscompress.datasets.torch.MSCompressDataset`.  Requires PyTorch to be
    installed.
    """
    # Resolve the dataset class before starting the import and download.
    torch_dataset_cls = _import_torch_dataset()
    return torch_dataset_cls(
        download_repo_dataset(
            source,
            accession,
            filenames=filenames,
            force_download=force_download,
            show_progress=show_progress,
            max_workers=max_workers,
        ).cache_dir
    )

Configuration¶

msdatasets.config ¶

Configuration: paths, URLs, and environment variables.

get_api_url ¶

get_api_url() -> str

Return the base API URL.

Resolution: MS_API_URL env var, or the default production URL.

Source code in src/msdatasets/config.py

def get_api_url() -> str:
    """Return the base URL of the API.

    The ``MS_API_URL`` environment variable wins when it is set; otherwise
    the built-in default production URL is used.
    """
    api_url = os.getenv("MS_API_URL", _DEFAULT_API_URL)
    log.debug("API URL: %s", api_url)
    return api_url

get_cache_dir ¶

get_cache_dir() -> Path

Return the root cache directory for downloaded datasets.

Resolution order:

1. MS_DATASETS_CACHE env var
2. MS_HOME env var + /datasets
3. ~/.ms/datasets

Source code in src/msdatasets/config.py

def get_cache_dir() -> Path:
    """Return the root directory under which downloaded datasets are cached.

    The first match wins:
    1. ``MS_DATASETS_CACHE`` env var, used as-is
    2. ``MS_HOME`` env var with ``datasets`` appended
    3. ``~/.ms/datasets``

    An unset or empty variable is skipped.
    """
    explicit_cache = os.getenv("MS_DATASETS_CACHE")
    if explicit_cache:
        log.debug("Cache dir from MS_DATASETS_CACHE: %s", explicit_cache)
        return Path(explicit_cache)

    ms_home = os.getenv("MS_HOME")
    if ms_home:
        home_cache = Path(ms_home) / "datasets"
        log.debug("Cache dir from MS_HOME: %s", home_cache)
        return home_cache

    fallback = Path.home() / ".ms" / "datasets"
    log.debug("Cache dir (default): %s", fallback)
    return fallback

get_dataset_dir ¶

get_dataset_dir(dataset_id: str) -> Path

Return the cache directory for a specific dataset.

Source code in src/msdatasets/config.py

def get_dataset_dir(dataset_id: str) -> Path:
    """Return the directory for one dataset: the cache root joined with *dataset_id*."""
    cache_root = get_cache_dir()
    return cache_root / dataset_id

API Reference¶

Package¶

DatasetNotFoundError ¶

DownloadError ¶

ExtractionError ¶

Dataset dataclass ¶

RepoSource ¶

download_dataset ¶

download_repo_dataset ¶

load_dataset ¶

load_repo_dataset ¶

Configuration¶

msdatasets.config ¶

get_api_url ¶

get_cache_dir ¶

get_dataset_dir ¶

Dataset `dataclass` ¶