diff --git a/.gitignore b/.gitignore
index f5362e2..3517fb7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # project-specific
 tmp/
+test-download/
 vault-token.dat
 
 # Byte-compiled / optimized / DLL files
diff --git a/README.md b/README.md
index 171590c..88b73e5 100644
--- a/README.md
+++ b/README.md
@@ -174,6 +174,10 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOAD
 Note: Vault tokens are only required for certain protected Databus hosts (for example: `data.dbpedia.io`, `data.dev.dbpedia.link`). The client now detects those hosts and will fail early with a clear message if a token is required but not provided. Do not pass `--vault-token` for public downloads.
 - `--databus-key`
   - If the databus is protected and needs API key authentication, you can provide the API key with `--databus-key YOUR_API_KEY`.
+- `--convert-to`
+  - Enables on-the-fly compression format conversion during download. Supported formats: `bz2`, `gz`, `xz`. Downloaded files will be automatically decompressed and recompressed to the target format. Example: `--convert-to gz` converts all downloaded compressed files to gzip format.
+- `--convert-from`
+  - Optional filter to specify which source compression format should be converted. Use with `--convert-to` to convert only files with a specific compression format. Example: `--convert-to gz --convert-from bz2` converts only `.bz2` files to `.gz`, leaving other formats unchanged.
 
 **Help and further information on download command:**
 
 ```bash
@@ -186,23 +190,33 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download --help
 Usage: databusclient download [OPTIONS] DATABUSURIS...
 
   Download datasets from databus, optionally using vault access if vault
-  options are provided.
+  options are provided. Supports on-the-fly compression format conversion
+  using --convert-to and --convert-from options.
 
 Options:
-  --localdir TEXT     Local databus folder (if not given, databus folder
-                      structure is created in current working directory)
-  --databus TEXT      Databus URL (if not given, inferred from databusuri,
-                      e.g. https://databus.dbpedia.org/sparql)
-  --vault-token TEXT  Path to Vault refresh token file
-  --databus-key TEXT  Databus API key to download from protected databus
-  --all-versions      When downloading artifacts, download all versions
-                      instead of only the latest
-  --authurl TEXT      Keycloak token endpoint URL  [default:
-                      https://auth.dbpedia.org/realms/dbpedia/protocol/openid-
-                      connect/token]
-  --clientid TEXT     Client ID for token exchange  [default: vault-token-
-                      exchange]
-  --help              Show this message and exit.
+  --localdir TEXT             Local databus folder (if not given, databus
+                              folder structure is created in current working
+                              directory)
+  --databus TEXT              Databus URL (if not given, inferred from
+                              databusuri, e.g.
+                              https://databus.dbpedia.org/sparql)
+  --vault-token TEXT          Path to Vault refresh token file
+  --databus-key TEXT          Databus API key to download from protected
+                              databus
+  --all-versions              When downloading artifacts, download all
+                              versions instead of only the latest
+  --authurl TEXT              Keycloak token endpoint URL  [default:
+                              https://auth.dbpedia.org/realms/dbpedia/protocol
+                              /openid-connect/token]
+  --clientid TEXT             Client ID for token exchange  [default: vault-
+                              token-exchange]
+  --convert-to [bz2|gz|xz]    Target compression format for on-the-fly
+                              conversion during download (supported: bz2, gz,
+                              xz)
+  --convert-from [bz2|gz|xz]  Source compression format to convert from
+                              (optional filter). Only files with this
+                              compression will be converted.
+  --help                      Show this message and exit.
 ```
 
 #### Examples of using the download command
@@ -255,6 +269,18 @@ databusclient download 'PREFIX dcat: <http://www.w3.org/ns/dcat#> SELECT ?x WHER
 docker run --rm -v $(pwd):/data dbpedia/databus-python-client download 'PREFIX dcat: <http://www.w3.org/ns/dcat#> SELECT ?x WHERE { ?sub dcat:downloadURL ?x . } LIMIT 10' --databus https://databus.dbpedia.org/sparql
 ```
 
+**Download with Compression Conversion**: download files and convert them to a different compression format on the fly
+```bash
+# Convert all compressed files to gzip format
+databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 --convert-to gz
+
+# Convert only bz2 files to xz format, leaving other compressions unchanged
+databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals --convert-to xz --convert-from bz2
+
+# Download a collection and unify all files to bz2 format
+databusclient download https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 --convert-to bz2
+```
+
 ### Deploy
 
diff --git a/databusclient/api/download.py b/databusclient/api/download.py
index 1a92d7f..0f1328c 100644
--- a/databusclient/api/download.py
+++ b/databusclient/api/download.py
@@ -1,7 +1,10 @@
 import json
 import os
+import bz2
+import gzip
+import lzma
+from typing import List, Optional, Tuple
 import re
-from typing import List
 from urllib.parse import urlparse
 
 import requests
@@ -11,9 +14,138 @@ from databusclient.api.utils import (
     fetch_databus_jsonld,
     get_databus_id_parts_from_file_url,
+    compute_sha256_and_length,
 )
-from databusclient.api.utils import compute_sha256_and_length
 
+# Compression format mappings
+COMPRESSION_EXTENSIONS = {
+    "bz2": ".bz2",
+    "gz": ".gz",
+    "xz": ".xz",
+}
+
+COMPRESSION_MODULES = {
+    "bz2": bz2,
+    "gz": gzip,
+    "xz": lzma,
+}
+
+
+def _detect_compression_format(filename: str) -> Optional[str]:
+    """Detect compression format from file extension.
+
+    Args:
+        filename: Name of the file.
+
+    Returns:
+        Compression format string ('bz2', 'gz', 'xz') or None if not compressed.
+    """
+    filename_lower = filename.lower()
+    for fmt, ext in COMPRESSION_EXTENSIONS.items():
+        if filename_lower.endswith(ext):
+            return fmt
+    return None
+
+
+def _should_convert_file(
+    filename: str, convert_to: Optional[str], convert_from: Optional[str]
+) -> Tuple[bool, Optional[str]]:
+    """Determine if a file should be converted and what the source format is.
+
+    Args:
+        filename: Name of the file.
+        convert_to: Target compression format ('bz2', 'gz', 'xz').
+        convert_from: Optional source compression format filter.
+
+    Returns:
+        Tuple of (should_convert: bool, source_format: Optional[str]).
+    """
+    if not convert_to:
+        return False, None
+
+    source_format = _detect_compression_format(filename)
+
+    # If file is not compressed, don't convert
+    if source_format is None:
+        return False, None
+
+    # If source and target are the same, skip conversion
+    if source_format == convert_to:
+        return False, None
+
+    # If convert_from is specified, only convert matching formats
+    if convert_from and source_format != convert_from:
+        return False, None
+
+    return True, source_format
+
+
+def _get_converted_filename(filename: str, source_format: str, target_format: str) -> str:
+    """Generate the new filename after compression format conversion.
+
+    Args:
+        filename: Original filename.
+        source_format: Source compression format ('bz2', 'gz', 'xz').
+        target_format: Target compression format ('bz2', 'gz', 'xz').
+
+    Returns:
+        New filename with updated extension.
+    """
+    source_ext = COMPRESSION_EXTENSIONS[source_format]
+    target_ext = COMPRESSION_EXTENSIONS[target_format]
+
+    # Handle case-insensitive extension matching
+    if filename.lower().endswith(source_ext):
+        return filename[:-len(source_ext)] + target_ext
+    return filename + target_ext
+
+
+def _convert_compression_format(
+    source_file: str, target_file: str, source_format: str, target_format: str
+) -> None:
+    """Convert a compressed file from one format to another.
+
+    Args:
+        source_file: Path to source compressed file.
+        target_file: Path to target compressed file.
+        source_format: Source compression format ('bz2', 'gz', 'xz').
+        target_format: Target compression format ('bz2', 'gz', 'xz').
+
+    Raises:
+        ValueError: If source_format or target_format is not supported.
+        RuntimeError: If compression conversion fails.
+    """
+    # Validate compression formats
+    if source_format not in COMPRESSION_MODULES:
+        raise ValueError(f"Unsupported source compression format: {source_format}. Supported formats: {list(COMPRESSION_MODULES.keys())}")
+    if target_format not in COMPRESSION_MODULES:
+        raise ValueError(f"Unsupported target compression format: {target_format}. Supported formats: {list(COMPRESSION_MODULES.keys())}")
+
+    source_module = COMPRESSION_MODULES[source_format]
+    target_module = COMPRESSION_MODULES[target_format]
+
+    print(f"Converting {source_format} → {target_format}: {os.path.basename(source_file)}")
+
+    # Decompress and recompress in chunks
+    chunk_size = 8192
+
+    try:
+        with source_module.open(source_file, 'rb') as sf:
+            with target_module.open(target_file, 'wb') as tf:
+                while True:
+                    chunk = sf.read(chunk_size)
+                    if not chunk:
+                        break
+                    tf.write(chunk)
+
+        # Remove the original file after successful conversion
+        os.remove(source_file)
+        print(f"Conversion complete: {os.path.basename(target_file)}")
+    except Exception as e:
+        # If conversion fails, ensure the partial target file is removed
+        if os.path.exists(target_file):
+            os.remove(target_file)
+        raise RuntimeError(f"Compression conversion failed: {e}")
+
+
 # compiled regex for SHA-256 hex strings
 _SHA256_RE = re.compile(r"^[0-9a-fA-F]{64}$")
@@ -146,6 +278,8 @@ def _download_file(
     databus_key=None,
     auth_url=None,
     client_id=None,
+    convert_to=None,
+    convert_from=None,
     validate_checksum: bool = False,
     expected_checksum: str | None = None,
 ) -> None:
@@ -158,6 +292,10 @@
         databus_key: Databus API key for protected downloads.
         auth_url: Keycloak token endpoint URL.
         client_id: Client ID for token exchange.
+        convert_to: Target compression format for on-the-fly conversion.
+        convert_from: Optional source compression format filter.
+        validate_checksum: Whether to validate checksums after downloading.
+        expected_checksum: The expected checksum of the file.
     """
     if localDir is None:
         _host, account, group, artifact, version, file = (
@@ -288,16 +426,22 @@
     block_size = 1024  # 1 KiB
     progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
 
-    with open(filename, "wb") as file:
+    with open(filename, "wb") as f:
         for data in response.iter_content(block_size):
             progress_bar.update(len(data))
-            file.write(data)
+            f.write(data)
     progress_bar.close()
 
     # --- 5. Verify download size ---
     if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
         raise IOError("Downloaded size does not match Content-Length header")
 
+    # --- 6. Convert compression format if requested ---
+    should_convert, source_format = _should_convert_file(file, convert_to, convert_from)
+    if should_convert and source_format:
+        target_filename = _get_converted_filename(file, source_format, convert_to)
+        target_filepath = os.path.join(localDir, target_filename)
+        _convert_compression_format(filename, target_filepath, source_format, convert_to)
     # --- 6. Optional checksum validation ---
     if validate_checksum:
         # reuse compute_sha256_and_length from webdav extension
@@ -329,6 +473,8 @@ def _download_files(
     databus_key: str = None,
     auth_url: str = None,
     client_id: str = None,
+    convert_to: str = None,
+    convert_from: str = None,
     validate_checksum: bool = False,
     checksums: dict | None = None,
 ) -> None:
@@ -341,6 +487,10 @@
         databus_key: Databus API key for protected downloads.
         auth_url: Keycloak token endpoint URL.
         client_id: Client ID for token exchange.
+        convert_to: Target compression format for on-the-fly conversion.
+        convert_from: Optional source compression format filter.
+        validate_checksum: Whether to validate checksums after downloading.
+        checksums: Dictionary mapping URLs to their expected checksums.
     """
     for url in urls:
         expected = None
@@ -353,6 +503,8 @@
             databus_key=databus_key,
             auth_url=auth_url,
             client_id=client_id,
+            convert_to=convert_to,
+            convert_from=convert_from,
             validate_checksum=validate_checksum,
             expected_checksum=expected,
         )
@@ -499,7 +651,9 @@ def _download_collection(
     databus_key: str = None,
     auth_url: str = None,
     client_id: str = None,
-    validate_checksum: bool = False
+    convert_to: str = None,
+    convert_from: str = None,
+    validate_checksum: bool = False,
 ) -> None:
     """Download all files in a databus collection.
 
@@ -511,6 +665,9 @@
         databus_key: Databus API key for protected downloads.
         auth_url: Keycloak token endpoint URL.
         client_id: Client ID for token exchange.
+        convert_to: Target compression format for on-the-fly conversion.
+        convert_from: Optional source compression format filter.
+        validate_checksum: Whether to validate checksums after downloading.
     """
     query = _get_sparql_query_of_collection(uri, databus_key=databus_key)
     file_urls = _get_file_download_urls_from_sparql_query(
@@ -529,6 +686,8 @@
         databus_key=databus_key,
         auth_url=auth_url,
         client_id=client_id,
+        convert_to=convert_to,
+        convert_from=convert_from,
         validate_checksum=validate_checksum,
         checksums=checksums if checksums else None,
     )
@@ -541,6 +700,8 @@
     databus_key: str = None,
     auth_url: str = None,
     client_id: str = None,
+    convert_to: str = None,
+    convert_from: str = None,
     validate_checksum: bool = False,
 ) -> None:
     """Download all files in a databus artifact version.
@@ -552,6 +713,9 @@
         databus_key: Databus API key for protected downloads.
         auth_url: Keycloak token endpoint URL.
         client_id: Client ID for token exchange.
+        convert_to: Target compression format for on-the-fly conversion.
+        convert_from: Optional source compression format filter.
+        validate_checksum: Whether to validate checksums after downloading.
""" json_str = fetch_databus_jsonld(uri, databus_key=databus_key) file_urls = _get_file_download_urls_from_artifact_jsonld(json_str) @@ -569,6 +733,8 @@ def _download_version( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + convert_to=convert_to, + convert_from=convert_from, validate_checksum=validate_checksum, checksums=checksums, ) @@ -582,6 +748,8 @@ def _download_artifact( databus_key: str = None, auth_url: str = None, client_id: str = None, + convert_to: str = None, + convert_from: str = None, validate_checksum: bool = False, ) -> None: """Download files in a databus artifact. @@ -594,6 +762,9 @@ def _download_artifact( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. + convert_to: Target compression format for on-the-fly conversion. + convert_from: Optional source compression format filter. + validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) versions = _get_databus_versions_of_artifact(json_str, all_versions=all_versions) @@ -617,6 +788,8 @@ def _download_artifact( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + convert_to=convert_to, + convert_from=convert_from, validate_checksum=validate_checksum, checksums=checksums, ) @@ -662,8 +835,6 @@ def _get_databus_versions_of_artifact( def _get_file_download_urls_from_artifact_jsonld(json_str: str) -> List[str]: """Parse the JSON-LD of a databus artifact version to extract download URLs. - - Don't get downloadURLs directly from the JSON-LD, but follow the "file" links to count access to databus accurately. Args: json_str: JSON-LD string of the databus artifact version. @@ -693,6 +864,8 @@ def _download_group( databus_key: str = None, auth_url: str = None, client_id: str = None, + convert_to: str = None, + convert_from: str = None, validate_checksum: bool = False, ) -> None: """Download files in a databus group. @@ -705,6 +878,9 @@ def _download_group( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. client_id: Client ID for token exchange. + convert_to: Target compression format for on-the-fly conversion. + convert_from: Optional source compression format filter. + validate_checksum: Whether to validate checksums after downloading. """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) artifacts = _get_databus_artifacts_of_group(json_str) @@ -718,6 +894,8 @@ def _download_group( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + convert_to=convert_to, + convert_from=convert_from, validate_checksum=validate_checksum, ) @@ -765,6 +943,8 @@ def download( all_versions=None, auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", client_id="vault-token-exchange", + convert_to=None, + convert_from=None, validate_checksum: bool = False ) -> None: """Download datasets from databus. @@ -779,6 +959,9 @@ def download( databus_key: Databus API key for protected downloads. auth_url: Keycloak token endpoint URL. Default is "https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token". client_id: Client ID for token exchange. Default is "vault-token-exchange". + convert_to: Target compression format for on-the-fly conversion (supported: bz2, gz, xz). + convert_from: Optional source compression format filter. + validate_checksum: Whether to validate checksums after downloading. 
""" for databusURI in databusURIs: host, account, group, artifact, version, file = ( @@ -805,6 +988,8 @@ def download( databus_key, auth_url, client_id, + convert_to, + convert_from, validate_checksum=validate_checksum, ) elif file is not None: @@ -831,6 +1016,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + convert_to=convert_to, + convert_from=convert_from, validate_checksum=validate_checksum, expected_checksum=expected, ) @@ -843,6 +1030,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + convert_to=convert_to, + convert_from=convert_from, validate_checksum=validate_checksum, ) elif artifact is not None: @@ -857,6 +1046,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + convert_to=convert_to, + convert_from=convert_from, validate_checksum=validate_checksum, ) elif group is not None and group != "collections": @@ -871,6 +1062,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + convert_to=convert_to, + convert_from=convert_from, validate_checksum=validate_checksum, ) elif account is not None: @@ -905,6 +1098,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + convert_to=convert_to, + convert_from=convert_from, validate_checksum=validate_checksum, checksums=checksums if checksums else None, ) diff --git a/databusclient/cli.py b/databusclient/cli.py index f71c823..a8ce77c 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -162,6 +162,16 @@ def deploy( show_default=True, help="Client ID for token exchange", ) +@click.option( + "--convert-to", + type=click.Choice(["bz2", "gz", "xz"], case_sensitive=False), + help="Target compression format for on-the-fly conversion during download (supported: bz2, gz, xz)", +) +@click.option( + "--convert-from", + type=click.Choice(["bz2", "gz", "xz"], case_sensitive=False), + help="Source compression format to convert from (optional filter). Only files with this compression will be converted.", +) @click.option( "--validate-checksum", is_flag=True, @@ -176,10 +186,13 @@ def download( all_versions, authurl, clientid, + convert_to, + convert_from, validate_checksum, -): +): """ Download datasets from databus, optionally using vault access if vault options are provided. + Supports on-the-fly compression format conversion using --convert-to and --convert-from options. 
""" try: api_download( @@ -191,8 +204,10 @@ def download( all_versions=all_versions, auth_url=authurl, client_id=clientid, - validate_checksum=validate_checksum - ) + convert_to=convert_to, + convert_from=convert_from, + validate_checksum=validate_checksum, + ) except DownloadAuthError as e: raise click.ClickException(str(e)) diff --git a/tests/test_compression_conversion.py b/tests/test_compression_conversion.py new file mode 100644 index 0000000..a8c7618 --- /dev/null +++ b/tests/test_compression_conversion.py @@ -0,0 +1,198 @@ +"""Tests for on-the-fly compression conversion feature""" + +import os +import gzip +import bz2 +import lzma +import tempfile +import pytest +from databusclient.api.download import ( + _detect_compression_format, + _should_convert_file, + _get_converted_filename, + _convert_compression_format, +) + + +def test_detect_compression_format(): + """Test compression format detection from filenames""" + assert _detect_compression_format("file.txt.bz2") == "bz2" + assert _detect_compression_format("file.txt.gz") == "gz" + assert _detect_compression_format("file.txt.xz") == "xz" + assert _detect_compression_format("file.txt") is None + assert _detect_compression_format("FILE.TXT.GZ") == "gz" # case insensitive + + +def test_should_convert_file(): + """Test file conversion decision logic""" + # No conversion target specified + should_convert, source = _should_convert_file("file.txt.bz2", None, None) + assert should_convert is False + assert source is None + + # Uncompressed file + should_convert, source = _should_convert_file("file.txt", "gz", None) + assert should_convert is False + assert source is None + + # Same source and target + should_convert, source = _should_convert_file("file.txt.gz", "gz", None) + assert should_convert is False + assert source is None + + # Valid conversion + should_convert, source = _should_convert_file("file.txt.bz2", "gz", None) + assert should_convert is True + assert source == "bz2" + + # With convert_from filter matching + should_convert, source = _should_convert_file("file.txt.bz2", "gz", "bz2") + assert should_convert is True + assert source == "bz2" + + # With convert_from filter not matching + should_convert, source = _should_convert_file("file.txt.bz2", "gz", "xz") + assert should_convert is False + assert source is None + + +def test_get_converted_filename(): + """Test filename conversion""" + assert _get_converted_filename("data.txt.bz2", "bz2", "gz") == "data.txt.gz" + assert _get_converted_filename("data.txt.gz", "gz", "xz") == "data.txt.xz" + assert _get_converted_filename("data.txt.xz", "xz", "bz2") == "data.txt.bz2" + + +def test_convert_compression_format(): + """Test actual compression format conversion""" + with tempfile.TemporaryDirectory() as tmpdir: + # Create test data + test_data = b"This is test data for compression conversion " * 100 + + # Create a bz2 file + bz2_file = os.path.join(tmpdir, "test.txt.bz2") + with bz2.open(bz2_file, 'wb') as f: + f.write(test_data) + + # Convert bz2 to gz + gz_file = os.path.join(tmpdir, "test.txt.gz") + _convert_compression_format(bz2_file, gz_file, "bz2", "gz") + + # Verify the original file was removed + assert not os.path.exists(bz2_file) + + # Verify the new file exists and contains the same data + assert os.path.exists(gz_file) + with gzip.open(gz_file, 'rb') as f: + decompressed = f.read() + assert decompressed == test_data + + +def test_convert_gz_to_xz(): + """Test conversion from gzip to xz""" + with tempfile.TemporaryDirectory() as tmpdir: + # Create test data + test_data = 
b"Conversion test: gz to xz format" * 50 + + # Create a gz file + gz_file = os.path.join(tmpdir, "test.txt.gz") + with gzip.open(gz_file, 'wb') as f: + f.write(test_data) + + # Convert gz to xz + xz_file = os.path.join(tmpdir, "test.txt.xz") + _convert_compression_format(gz_file, xz_file, "gz", "xz") + + # Verify conversion + assert not os.path.exists(gz_file) + assert os.path.exists(xz_file) + with lzma.open(xz_file, 'rb') as f: + decompressed = f.read() + assert decompressed == test_data + + +def test_convert_xz_to_bz2(): + """Test conversion from xz to bz2""" + with tempfile.TemporaryDirectory() as tmpdir: + # Create test data + test_data = b"XZ to BZ2 compression conversion test" * 75 + + # Create an xz file + xz_file = os.path.join(tmpdir, "test.txt.xz") + with lzma.open(xz_file, 'wb') as f: + f.write(test_data) + + # Convert xz to bz2 + bz2_file = os.path.join(tmpdir, "test.txt.bz2") + _convert_compression_format(xz_file, bz2_file, "xz", "bz2") + + # Verify conversion + assert not os.path.exists(xz_file) + assert os.path.exists(bz2_file) + with bz2.open(bz2_file, 'rb') as f: + decompressed = f.read() + assert decompressed == test_data + + +def test_case_insensitive_filename_conversion(): + """Test that uppercase extensions are handled correctly (addresses PR feedback)""" + # Test uppercase extension matching + assert _get_converted_filename("FILE.BZ2", "bz2", "gz") == "FILE.gz" + assert _get_converted_filename("data.GZ", "gz", "xz") == "data.xz" + assert _get_converted_filename("archive.XZ", "xz", "bz2") == "archive.bz2" + + # Test mixed case + assert _get_converted_filename("File.Bz2", "bz2", "gz") == "File.gz" + + +def test_invalid_source_format_validation(): + """Test that invalid source format raises ValueError (addresses PR feedback)""" + with tempfile.TemporaryDirectory() as tmpdir: + source_file = os.path.join(tmpdir, "test.zip") + target_file = os.path.join(tmpdir, "test.gz") + + # Create a dummy file + with open(source_file, 'wb') as f: + f.write(b"test data") + + # Should raise ValueError for unsupported format + with pytest.raises(ValueError, match="Unsupported source compression format"): + _convert_compression_format(source_file, target_file, "zip", "gz") + + +def test_invalid_target_format_validation(): + """Test that invalid target format raises ValueError (addresses PR feedback)""" + with tempfile.TemporaryDirectory() as tmpdir: + source_file = os.path.join(tmpdir, "test.gz") + target_file = os.path.join(tmpdir, "test.rar") + + # Create a valid gz file + test_data = b"test data" + with gzip.open(source_file, 'wb') as f: + f.write(test_data) + + # Should raise ValueError for unsupported format + with pytest.raises(ValueError, match="Unsupported target compression format"): + _convert_compression_format(source_file, target_file, "gz", "rar") + + +def test_corrupted_file_handling(): + """Test that corrupted files are handled gracefully and target file is cleaned up""" + with tempfile.TemporaryDirectory() as tmpdir: + source_file = os.path.join(tmpdir, "corrupted.bz2") + target_file = os.path.join(tmpdir, "target.gz") + + # Create a file with .bz2 extension but invalid content + with open(source_file, 'wb') as f: + f.write(b"This is not valid bz2 compressed data") + + # Should raise RuntimeError + with pytest.raises(RuntimeError, match="Compression conversion failed"): + _convert_compression_format(source_file, target_file, "bz2", "gz") + + # Verify target file was cleaned up + assert not os.path.exists(target_file) + + +if __name__ == "__main__": + pytest.main([__file__, 
"-v"]) diff --git a/tests/test_download.py b/tests/test_download.py index 87d49dc..299a81c 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -27,6 +27,7 @@ def test_with_query(): api_download("tmp", DEFAULT_ENDPOINT, [TEST_QUERY]) +@pytest.mark.skip(reason="Live collection download is long-running and flakes on network timeouts") @pytest.mark.skip(reason="Integration test: requires live databus.dbpedia.org connection") def test_with_collection(): api_download("tmp", DEFAULT_ENDPOINT, [TEST_COLLECTION])