diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index d4c20a402..874b71cab 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -45,7 +45,7 @@ all = [ "olefile", "pydub", "SpeechRecognition", - "youtube-transcript-api~=1.0.0", + "youtube-transcript-api>=1.0.0", "azure-ai-documentintelligence", "azure-ai-contentunderstanding>=1.2.0b1", "azure-identity", diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index ccb44b64b..d737723b1 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -264,12 +264,16 @@ def _handle_output(args, result: DocumentConverterResult): with open(args.output, "w", encoding="utf-8") as f: f.write(result.markdown) else: - # Handle stdout encoding errors more gracefully - print( - result.markdown.encode(sys.stdout.encoding, errors="replace").decode( - sys.stdout.encoding + # Handle stdout encoding errors more gracefully by attempting to set UTF-8 + try: + sys.stdout.reconfigure(encoding="utf-8") + print(result.markdown) + except Exception: + # Fallback to replacing unencodable characters + encoding = sys.stdout.encoding or "utf-8" + print( + result.markdown.encode(encoding, errors="replace").decode(encoding) ) - ) def _exit_with_error(message: str): diff --git a/packages/markitdown/tests/test_cli_misc.py b/packages/markitdown/tests/test_cli_misc.py index cf6c9ccc7..c194ee6ae 100644 --- a/packages/markitdown/tests/test_cli_misc.py +++ b/packages/markitdown/tests/test_cli_misc.py @@ -2,13 +2,15 @@ import subprocess from markitdown import __version__ +import sys + # This file contains CLI tests that are not directly tested by the FileTestVectors. # This includes things like help messages, version numbers, and invalid flags. def test_version() -> None: result = subprocess.run( - ["python", "-m", "markitdown", "--version"], capture_output=True, text=True + [sys.executable, "-m", "markitdown", "--version"], capture_output=True, text=True, encoding="utf-8" ) assert result.returncode == 0, f"CLI exited with error: {result.stderr}" @@ -17,7 +19,7 @@ def test_version() -> None: def test_invalid_flag() -> None: result = subprocess.run( - ["python", "-m", "markitdown", "--foobar"], capture_output=True, text=True + [sys.executable, "-m", "markitdown", "--foobar"], capture_output=True, text=True, encoding="utf-8" ) assert result.returncode != 0, f"CLI exited with error: {result.stderr}" diff --git a/packages/markitdown/tests/test_cli_vectors.py b/packages/markitdown/tests/test_cli_vectors.py index 5add530b1..a1b13c9b6 100644 --- a/packages/markitdown/tests/test_cli_vectors.py +++ b/packages/markitdown/tests/test_cli_vectors.py @@ -4,6 +4,7 @@ import pytest import subprocess import locale +import sys from typing import List if __name__ == "__main__": @@ -46,13 +47,14 @@ def test_output_to_stdout(shared_tmp_dir, test_vector) -> None: result = subprocess.run( [ - "python", + sys.executable, "-m", "markitdown", os.path.join(TEST_FILES_DIR, test_vector.filename), ], capture_output=True, text=True, + encoding="utf-8", ) assert result.returncode == 0, f"CLI exited with error: {result.stderr}" @@ -69,7 +71,7 @@ def test_output_to_file(shared_tmp_dir, test_vector) -> None: output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output") result = subprocess.run( [ - "python", + sys.executable, "-m", "markitdown", "-o", @@ -78,12 +80,13 @@ def test_output_to_file(shared_tmp_dir, test_vector) -> None: ], capture_output=True, text=True, + encoding="utf-8", ) assert result.returncode == 0, f"CLI exited with error: {result.stderr}" assert os.path.exists(output_file), f"Output file not created: {output_file}" - with open(output_file, "r") as f: + with open(output_file, "r", encoding="utf-8") as f: output_data = f.read() for test_string in test_vector.must_include: assert test_string in output_data @@ -104,7 +107,7 @@ def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None: result = subprocess.run( [ - "python", + sys.executable, "-m", "markitdown", os.path.join(TEST_FILES_DIR, test_vector.filename), @@ -114,7 +117,7 @@ def test_input_from_stdin_without_hints(shared_tmp_dir, test_vector) -> None: text=False, ) - stdout = result.stdout.decode(locale.getpreferredencoding()) + stdout = result.stdout.decode("utf-8") assert ( result.returncode == 0 ), f"CLI exited with error: {result.stderr.decode('utf-8')}" @@ -135,12 +138,12 @@ def test_convert_url(shared_tmp_dir, test_vector): time.sleep(1) # Ensure we don't hit rate limits result = subprocess.run( - ["python", "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename], + [sys.executable, "-m", "markitdown", TEST_FILES_URL + "/" + test_vector.filename], capture_output=True, text=False, ) - stdout = result.stdout.decode(locale.getpreferredencoding()) + stdout = result.stdout.decode("utf-8") assert result.returncode == 0, f"CLI exited with error: {result.stderr}" for test_string in test_vector.must_include: assert test_string in stdout @@ -155,7 +158,7 @@ def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None: output_file = os.path.join(shared_tmp_dir, test_vector.filename + ".output") result = subprocess.run( [ - "python", + sys.executable, "-m", "markitdown", "--keep-data-uris", @@ -165,12 +168,13 @@ def test_output_to_file_with_data_uris(shared_tmp_dir, test_vector) -> None: ], capture_output=True, text=True, + encoding="utf-8", ) assert result.returncode == 0, f"CLI exited with error: {result.stderr}" assert os.path.exists(output_file), f"Output file not created: {output_file}" - with open(output_file, "r") as f: + with open(output_file, "r", encoding="utf-8") as f: output_data = f.read() for test_string in test_vector.must_include: assert test_string in output_data diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..2cc94ea27 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -221,35 +221,38 @@ def test_data_uris() -> None: def test_file_uris() -> None: + from urllib.request import url2pathname + expected_path = os.path.abspath(url2pathname("/path/to/file.txt")) + # Test file URI with an empty host file_uri = "file:///path/to/file.txt" netloc, path = file_uri_to_path(file_uri) assert netloc is None - assert path == "/path/to/file.txt" + assert path == expected_path # Test file URI with no host file_uri = "file:/path/to/file.txt" netloc, path = file_uri_to_path(file_uri) assert netloc is None - assert path == "/path/to/file.txt" + assert path == expected_path # Test file URI with localhost file_uri = "file://localhost/path/to/file.txt" netloc, path = file_uri_to_path(file_uri) assert netloc == "localhost" - assert path == "/path/to/file.txt" + assert path == expected_path # Test file URI with query parameters file_uri = "file:///path/to/file.txt?param=value" netloc, path = file_uri_to_path(file_uri) assert netloc is None - assert path == "/path/to/file.txt" + assert path == expected_path # Test file URI with fragment file_uri = "file:///path/to/file.txt#fragment" netloc, path = file_uri_to_path(file_uri) assert netloc is None - assert path == "/path/to/file.txt" + assert path == expected_path def test_docx_comments() -> None: