Source code for tiatoolbox.utils.magic
"""Detection of file type via magic numbers / signatures.
Checks here are based on the file signature, not the file extension.
They are all intended to be fast and lightweight, and should not require
parsing the entire file. There may occasionally be false positives which
should be caught when attempting to parse the file.
"""
from __future__ import annotations
import zipfile
from io import BytesIO
from pathlib import Path
from typing import BinaryIO
def _normalize_binaryio(
    file: str | Path | bytes | BinaryIO | BytesIO,
    *,
    must_exist: bool,
) -> BinaryIO:
    """Normalize the input to a binary file-like object.

    To be used in a context manager so that the io is closed after use.

    Args:
        file (str or Path or bytes or BinaryIO):
            The file to normalize. A str or Path is treated as a
            filesystem path and opened for binary reading, bytes are
            wrapped in a BytesIO, and an already open binary stream is
            returned unchanged.
        must_exist (bool):
            Whether a path input must exist. Required keyword-only
            argument. If False, a path which does not exist yields an
            empty BytesIO instead of raising.

    Returns:
        BinaryIO:
            The file as a binary file-like object.

    Raises:
        FileNotFoundError:
            If `file` is a path which does not exist and `must_exist`
            is True.
        TypeError:
            If `file` is not a str, Path, bytes, or binary stream
            (text-mode streams are rejected as well).

    """
    # Imported locally because the module only imports BytesIO from io.
    from io import BufferedIOBase, RawIOBase

    if isinstance(file, (str, Path)):
        path = Path(file)
        if not path.exists():
            if must_exist:
                msg = f"File {path} does not exist."
                raise FileNotFoundError(msg)
            # Missing but optional: hand back an empty stream so that
            # signature checks simply read nothing.
            return BytesIO()
        # Deliberately opened without `with`; the caller closes it.
        return Path.open(path, mode="rb")
    # typing.BinaryIO is not a base class of the objects returned by
    # open(..., "rb"), so checking against it alone rejects real file
    # handles. The io base classes cover buffered handles
    # (BufferedReader, BytesIO, ...) and unbuffered ones (FileIO).
    if isinstance(file, (BinaryIO, BufferedIOBase, RawIOBase)):
        return file
    if isinstance(file, bytes):
        return BytesIO(file)
    msg = (
        f"Input must be a str, Path, bytes, or BinaryIO. "
        f"Received {type(file).__name__}."
    )
    raise TypeError(
        msg,
    )
[docs]
def is_dir(file: str | Path | bytes | BinaryIO | BytesIO) -> bool:
    """Report whether the input refers to an existing directory.

    Only str and Path inputs are treated as filesystem paths and
    delegated to `Path.is_dir()`. Any other input type (bytes or a
    stream) cannot be a directory and yields False.

    Args:
        file (Union[str, Path, bytes]):
            The file to check.

    Returns:
        bool:
            True if `file` is a path to a directory, otherwise False.

    """
    # Guard clause: in-memory data and streams are never directories.
    if not isinstance(file, (str, Path)):
        return False
    candidate = Path(file)
    return candidate.is_dir()
[docs]
def is_sqlite3(file: str | Path | bytes | BinaryIO | BytesIO) -> bool:
    """Report whether the input starts with the SQLite 3 file signature.

    Reads 16 bytes from the input and compares them with the header
    string ``b"SQLite format 3\\x00"``. Directories are rejected before
    anything is opened. A BytesIO passed in is closed when the check
    finishes, because it is used as the context manager.

    Args:
        file (Union[str, Path, bytes]):
            The file to check.

    Returns:
        bool:
            True if the signature matches, otherwise False.

    """
    if is_dir(file):
        return False
    with _normalize_binaryio(file, must_exist=False) as stream:
        header = stream.read(16)
    return header == b"SQLite format 3\x00"
[docs]
def is_zip(file: str | Path | bytes | BytesIO) -> bool:
    """Report whether the input looks like a ZIP archive.

    Directories are rejected first; otherwise the decision is delegated
    to `zipfile.is_zipfile()` on the normalized binary stream. A BytesIO
    passed in is closed when the check finishes, because it is used as
    the context manager.

    Args:
        file (Union[str, Path, bytes]):
            The file to check.

    Returns:
        bool:
            The result of `zipfile.is_zipfile()`, or False for a
            directory.

    """
    if is_dir(file):
        return False
    with _normalize_binaryio(file, must_exist=False) as stream:
        looks_like_zip = zipfile.is_zipfile(stream)
    return looks_like_zip
[docs]
def is_dcm(file: str | Path | bytes | BytesIO) -> bool:
    """Report whether the input carries the DICOM 'DICM' prefix.

    Skips the 128-byte preamble and checks that the following four
    bytes are ``b"DICM"``. Directories are rejected before anything is
    opened.

    This intentionally does not parse the file with `pydicom.dcmread()`
    to avoid the overhead of parsing the entire file. Parsing .dcm files
    can be slow for VL Whole Slide Images in some cases (e.g. sparse
    tiling).

    A BytesIO passed in is closed when the check finishes, because it is
    used as the context manager.

    Args:
        file (Union[str, Path, bytes, BytesIO]):
            A string, Path, bytes, or BytesIO object representing the
            path or binary data of the file.

    Returns:
        bool:
            True if the 'DICM' prefix is found at byte offset 128,
            otherwise False.

    """
    if is_dir(file):
        return False
    with _normalize_binaryio(file, must_exist=False) as stream:
        # The preamble content is never inspected (ignored for
        # security reasons); jump straight past it.
        stream.seek(128)
        prefix = stream.read(4)
    return prefix == b"DICM"