diff --git a/tests/test_test_perplexity_paths.py b/tests/test_test_perplexity_paths.py
new file mode 100644
index 000000000..45e9e7851
--- /dev/null
+++ b/tests/test_test_perplexity_paths.py
@@ -0,0 +1,83 @@
+import importlib.util
+import tempfile
+import unittest
+from pathlib import Path
+
+
+MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "test_perplexity.py"
+
+
+def load_test_perplexity_module():
+    """Import utils/test_perplexity.py from MODULE_PATH as a fresh module."""
+    name = "bitnet_test_perplexity_under_test"
+    spec = importlib.util.spec_from_file_location(name, MODULE_PATH)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"Unable to load {MODULE_PATH}")
+    loaded = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(loaded)
+    return loaded
+
+
+class TestPerplexityPathTests(unittest.TestCase):
+    """Path-resolution checks for utils/test_perplexity.py defaults."""
+
+    def test_resolve_default_path_uses_script_dir_for_built_in_defaults(self):
+        """The untouched built-in default resolves next to the script."""
+        module = load_test_perplexity_module()
+        default = "../build/bin/llama-perplexity"
+
+        self.assertEqual(
+            module.resolve_default_path(default, default),
+            (MODULE_PATH.parent / default).resolve(),
+        )
+
+    def test_resolve_default_path_keeps_custom_paths(self):
+        """Absolute and non-default relative paths come back unchanged."""
+        module = load_test_perplexity_module()
+        default = "../build/bin/llama-perplexity"
+
+        for custom in ("/tmp/custom-bin", "models/custom.gguf"):
+            with self.subTest(custom=custom):
+                self.assertEqual(
+                    module.resolve_default_path(custom, default), Path(custom)
+                )
+
+    def test_perplexity_tester_uses_resolved_default_paths(self):
+        """PerplexityTester stores script-relative defaults when given none."""
+        module = load_test_perplexity_module()
+        script_dir = MODULE_PATH.parent
+        expected_bin = (script_dir / "../build/bin/llama-perplexity").resolve()
+        expected_quant = (script_dir / "../build/bin/llama-quantize").resolve()
+        expected_data = (script_dir / "../data").resolve()
+        pretend_present = {expected_bin, expected_quant, expected_data}
+        original_exists = module.Path.exists
+
+        def fake_exists(self, *args, **kwargs):
+            # Report the default locations as present (they may not be built
+            # here) and defer to the real check, with any keyword arguments
+            # such as follow_symlinks, for every other path.
+            if self in pretend_present:
+                return True
+            return original_exists(self, *args, **kwargs)
+
+        with tempfile.TemporaryDirectory() as tmp:
+            model_path = Path(tmp) / "model.gguf"
+            model_path.write_text("stub")
+
+            module.Path.exists = fake_exists
+            try:
+                # Keep output_dir inside the temp dir: the constructor creates
+                # it, and the default would litter the working directory.
+                tester = module.PerplexityTester(
+                    str(model_path), output_dir=Path(tmp) / "results"
+                )
+            finally:
+                module.Path.exists = original_exists
+
+        self.assertEqual(tester.llama_perplexity_bin, expected_bin)
+        self.assertEqual(tester.quantize_bin, expected_quant)
+        self.assertEqual(tester.data_dir, expected_data)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/utils/test_perplexity.py b/utils/test_perplexity.py
index f2d9788c0..cfa723672 100644
--- a/utils/test_perplexity.py
+++ b/utils/test_perplexity.py
@@ -17,14 +17,37 @@
 import statistics
 
 
+SCRIPT_DIR = Path(__file__).resolve().parent
+
+
+def resolve_default_path(raw_path, default_relative):
+    """Anchor an unchanged built-in default to SCRIPT_DIR; keep other paths."""
+    candidate, builtin = Path(raw_path), Path(default_relative)
+    if candidate.is_absolute() or candidate != builtin:
+        return candidate
+    return (SCRIPT_DIR / builtin).resolve()
+
+
 class PerplexityTester:
-    def __init__(self, model_path, llama_perplexity_bin="../build/bin/llama-perplexity", 
-                 data_dir="../data", output_dir="perplexity_results", quick_mode=False,
-                 quantize_bin="../build/bin/llama-quantize", test_embeddings=False, csv_output=None):
+    def __init__(
+        self,
+        model_path,
+        llama_perplexity_bin="../build/bin/llama-perplexity",
+        data_dir="../data",
+        output_dir="perplexity_results",
+        quick_mode=False,
+        quantize_bin="../build/bin/llama-quantize",
+        test_embeddings=False,
+        csv_output=None,
+    ):
         self.model_path = Path(model_path)
-        self.llama_perplexity_bin = Path(llama_perplexity_bin)
-        self.quantize_bin = Path(quantize_bin)
-        self.data_dir = Path(data_dir)
+        self.llama_perplexity_bin = resolve_default_path(
+            llama_perplexity_bin, "../build/bin/llama-perplexity"
+        )
+        self.quantize_bin = resolve_default_path(
+            quantize_bin, "../build/bin/llama-quantize"
+        )
+        self.data_dir = resolve_default_path(data_dir, "../data")
         self.output_dir = Path(output_dir)
         self.quick_mode = quick_mode
         self.test_embeddings = test_embeddings
@@ -32,69 +55,77 @@ def __init__(self, model_path, llama_perplexity_bin="../build/bin/llama-perplexi
         self.results = []
         self.created_models = set()  # Track newly created model files
         self.temp_files = []  # Track temporary files for cleanup
-        
+
         # Embedding types to test
         self.embedding_types = [
-            ('F32', 'f32'),
-            ('F16', 'f16'),
-            ('Q8_0', 'q8_0'),
-            ('Q6_K', 'q6_k'),
-            ('Q5_0', 'q5_0'),
-            ('Q4_0', 'q4_0'),
-            ('Q3_K', 'q3_k'),
-            ('TQ2_0', 'tq2_0'),
+            ("F32", "f32"),
+            ("F16", "f16"),
+            ("Q8_0", "q8_0"),
+            ("Q6_K", "q6_k"),
+            ("Q5_0", "q5_0"),
+            ("Q4_0", "q4_0"),
+            ("Q3_K", "q3_k"),
+            ("TQ2_0", "tq2_0"),
         ]
-        
+
         # Create output directory
         self.output_dir.mkdir(parents=True, exist_ok=True)
-        
+
         # Verify llama-perplexity binary exists
         if not self.llama_perplexity_bin.exists():
-            raise FileNotFoundError(f"llama-perplexity binary not found: {self.llama_perplexity_bin}")
-        
+            raise FileNotFoundError(
+                f"llama-perplexity binary not found: {self.llama_perplexity_bin}"
+            )
+
         # Verify quantize binary exists if testing embeddings
         if self.test_embeddings and not self.quantize_bin.exists():
-            raise FileNotFoundError(f"llama-quantize binary not found: {self.quantize_bin}")
-        
+            raise FileNotFoundError(
+                f"llama-quantize binary not found: {self.quantize_bin}"
+            )
+
         # Verify model file exists
         if not self.model_path.exists():
             raise FileNotFoundError(f"Model file not found: {self.model_path}")
-    
+
     def find_datasets(self):
         """Find all test.txt files in dataset directories."""
         datasets = []
-        
+
         if not self.data_dir.exists():
             print(f"❌ Data directory not found: {self.data_dir}")
             return datasets
-        
+
         print(f"\n🔍 Searching for datasets in {self.data_dir}...")
-        
+
         # Look for test.txt files in subdirectories
         for dataset_dir in sorted(self.data_dir.iterdir()):
             if dataset_dir.is_dir():
                 test_file = dataset_dir / "test.txt"
                 if test_file.exists():
                     size_mb = test_file.stat().st_size / (1024 * 1024)
-                    datasets.append({
-                        'name': dataset_dir.name,
-                        'path': test_file,
-                        'size': test_file.stat().st_size,
-                        'size_mb': size_mb
-                    })
+                    datasets.append(
+                        {
+                            "name": dataset_dir.name,
+                            "path": test_file,
+                            "size": test_file.stat().st_size,
+                            "size_mb": size_mb,
+                        }
+                    )
                     print(f"   ✅ {dataset_dir.name:<20} ({size_mb:.2f} MB)")
                 else:
                     print(f"   ⚠️  {dataset_dir.name:<20} (no test.txt found)")
-        
+
         return datasets
-    
+
     def create_quick_dataset(self, dataset_path, num_chars=4096):
         """Create a temporary dataset with only the first N characters for quick testing."""
-        temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8')
+        temp_file = tempfile.NamedTemporaryFile(
+            mode="w", delete=False, suffix=".txt", encoding="utf-8"
+        )
         self.temp_files.append(temp_file.name)
-        
+
         try:
-            with open(dataset_path, 'r', encoding='utf-8', errors='ignore') as f:
+            with open(dataset_path, "r", encoding="utf-8", errors="ignore") as f:
                 content = f.read(num_chars)
                 temp_file.write(content)
             temp_file.close()
@@ -103,7 +134,7 @@ def create_quick_dataset(self, dataset_path, num_chars=4096):
             print(f"⚠️  Failed to create quick dataset: {e}")
             temp_file.close()
             return dataset_path
-    
+
     def cleanup_temp_files(self):
         """Clean up temporary files."""
         for temp_file in self.temp_files:
@@ -112,50 +143,57 @@ def cleanup_temp_files(self):
             except:
                 pass
         self.temp_files = []
-    
-    def run_perplexity_test(self, dataset_name, dataset_path, threads=16, ctx_size=512, model_override=None):
+
+    def run_perplexity_test(
+        self, dataset_name, dataset_path, threads=16, ctx_size=512, model_override=None
+    ):
         """Run perplexity test on a single dataset."""
         test_model = model_override if model_override else self.model_path
-        
-        print(f"\n{'='*80}")
+
+        print(f"\n{'=' * 80}")
         print(f"📊 Testing on dataset: {dataset_name}")
         print(f"   File: {dataset_path}")
         print(f"   Model: {test_model.name}")
-        print(f"{'='*80}")
-        
+        print(f"{'=' * 80}")
+
         cmd = [
             str(self.llama_perplexity_bin),
-            "-m", str(test_model),
-            "-f", str(dataset_path),
-            "-t", str(threads),
-            "-c", str(ctx_size),
-            "-ngl", "0"  # CPU only
+            "-m",
+            str(test_model),
+            "-f",
+            str(dataset_path),
+            "-t",
+            str(threads),
+            "-c",
+            str(ctx_size),
+            "-ngl",
+            "0",  # CPU only
         ]
-        
+
         print(f"💻 Command: {' '.join(cmd)}")
         print(f"⏱️  Starting test...\n")
-        
+
         start_time = time.time()
-        
+
         try:
             result = subprocess.run(
                 cmd,
                 capture_output=True,
                 text=True,
                 timeout=3600,  # 1 hour timeout
-                cwd=os.getcwd()
+                cwd=os.getcwd(),
             )
-            
+
             elapsed_time = time.time() - start_time
-            
+
             if result.returncode == 0:
                 # Parse perplexity from output (check both stdout and stderr)
                 combined_output = result.stdout + "\n" + result.stderr
                 ppl = self.parse_perplexity(combined_output)
-                
+
                 if ppl is not None:
                     print(f"\n✅ Perplexity: {ppl}")
-                    print(f"⏱️  Time: {elapsed_time:.2f}s ({elapsed_time/60:.2f} min)")
+                    print(f"⏱️  Time: {elapsed_time:.2f}s ({elapsed_time / 60:.2f} min)")
                     status = "success"
                 else:
                     print(f"\n⚠️  Test completed but could not parse perplexity")
@@ -171,43 +209,43 @@ def run_perplexity_test(self, dataset_name, dataset_path, threads=16, ctx_size=5
                 status = "failed"
                 ppl = None
                 elapsed_time = time.time() - start_time
-            
+
             return {
-                'dataset': dataset_name,
-                'perplexity': ppl,
-                'time': elapsed_time,
-                'status': status,
-                'stdout': result.stdout,
-                'stderr': result.stderr
+                "dataset": dataset_name,
+                "perplexity": ppl,
+                "time": elapsed_time,
+                "status": status,
+                "stdout": result.stdout,
+                "stderr": result.stderr,
             }
-            
+
         except subprocess.TimeoutExpired:
             elapsed_time = time.time() - start_time
             print(f"\n❌ Timeout after {elapsed_time:.2f}s")
             return {
-                'dataset': dataset_name,
-                'perplexity': None,
-                'time': elapsed_time,
-                'status': 'timeout',
-                'stdout': '',
-                'stderr': 'Test exceeded 1 hour timeout'
+                "dataset": dataset_name,
+                "perplexity": None,
+                "time": elapsed_time,
+                "status": "timeout",
+                "stdout": "",
+                "stderr": "Test exceeded 1 hour timeout",
             }
         except Exception as e:
             elapsed_time = time.time() - start_time
             print(f"\n❌ Error: {e}")
             return {
-                'dataset': dataset_name,
-                'perplexity': None,
-                'time': elapsed_time,
-                'status': 'error',
-                'stdout': '',
-                'stderr': str(e)
+                "dataset": dataset_name,
+                "perplexity": None,
+                "time": elapsed_time,
+                "status": "error",
+                "stdout": "",
+                "stderr": str(e),
             }
-    
+
     def parse_perplexity(self, output):
         """Parse perplexity value (mean±std format) from llama-perplexity output."""
         # First try to match "PPL = mean +/- std" format
-        pattern_with_std = r'PPL\s*=\s*(\d+\.?\d*)\s*\+/-\s*(\d+\.?\d*)'
+        pattern_with_std = r"PPL\s*=\s*(\d+\.?\d*)\s*\+/-\s*(\d+\.?\d*)"
         match = re.search(pattern_with_std, output, re.IGNORECASE | re.MULTILINE)
         if match:
             try:
@@ -216,18 +254,18 @@ def parse_perplexity(self, output):
                 return f"{mean:.4f}±{std:.4f}"
             except ValueError:
                 pass
-        
+
         # Fallback to patterns without std
         patterns = [
-            r'Final estimate:\s*PPL\s*=\s*(\d+\.?\d*)',
-            r'Final perplexity:\s*(\d+\.?\d*)',
-            r'PPL\s*=\s*(\d+\.?\d*)',
-            r'PPL:\s*(\d+\.?\d*)',
-            r'perplexity:\s*(\d+\.?\d*)',
-            r'ppl\s*=\s*(\d+\.?\d*)',
-            r'Perplexity:\s*(\d+\.?\d*)',
+            r"Final estimate:\s*PPL\s*=\s*(\d+\.?\d*)",
+            r"Final perplexity:\s*(\d+\.?\d*)",
+            r"PPL\s*=\s*(\d+\.?\d*)",
+            r"PPL:\s*(\d+\.?\d*)",
+            r"perplexity:\s*(\d+\.?\d*)",
+            r"ppl\s*=\s*(\d+\.?\d*)",
+            r"Perplexity:\s*(\d+\.?\d*)",
         ]
-        
+
         for pattern in patterns:
             match = re.search(pattern, output, re.IGNORECASE | re.MULTILINE)
             if match:
@@ -235,67 +273,68 @@ def parse_perplexity(self, output):
                     return f"{float(match.group(1)):.4f}"
                 except ValueError:
                     continue
-        
+
         return None
-    
+
     def quantize_embedding(self, embedding_type, output_suffix):
         """
         Quantize model with specific embedding type.
-        
+
         Args:
             embedding_type: Token embedding type (uppercase, e.g., 'Q6_K')
             output_suffix: Output file suffix (lowercase, e.g., 'q6_k')
-        
+
         Returns:
             Path to quantized model or None if failed
         """
         # Construct output path
         model_dir = self.model_path.parent
         output_path = model_dir / f"ggml-model-i2_s-embed-{output_suffix}.gguf"
-        
+
         # Check if file already exists
         file_existed = output_path.exists()
-        
+
         if file_existed:
             print(f"ℹ️  Model already exists: {output_path.name}")
             return output_path
-        
+
         cmd = [
             str(self.quantize_bin),
-            "--token-embedding-type", embedding_type,
+            "--token-embedding-type",
+            embedding_type,
             str(self.model_path),
             str(output_path),
             "I2_S",
             "1",
-            "1"
+            "1",
         ]
-        
-        print(f"\n{'='*80}")
+
+        print(f"\n{'=' * 80}")
         print(f"🔄 Quantizing with embedding type: {embedding_type}")
         print(f"📥 Input:  {self.model_path.name}")
         print(f"📤 Output: {output_path.name}")
         print(f"💻 Command: {' '.join(cmd)}")
-        print(f"{'='*80}\n")
-        
+        print(f"{'=' * 80}\n")
+
         start_time = time.time()
-        
+
         try:
             result = subprocess.run(
                 cmd,
                 capture_output=True,
                 text=True,
                 cwd=os.getcwd(),
-                timeout=600  # 10 minutes timeout
+                timeout=600,  # 10 minutes timeout
             )
-            
+
             duration = time.time() - start_time
-            
+
             if result.returncode == 0:
                 file_size_mb = output_path.stat().st_size / (1024 * 1024)
                 print(f"✅ Quantization successful!")
                 print(f"   Duration: {duration:.2f}s")
                 print(f"   Size: {file_size_mb:.2f} MB")
-                
+
                 # Mark as newly created
                 self.created_models.add(output_path)
                 return output_path
@@ -303,14 +342,14 @@ def quantize_embedding(self, embedding_type, output_suffix):
                 print(f"❌ Quantization failed with return code {result.returncode}")
                 print(f"Error: {result.stderr[:500]}")
                 return None
-                
+
         except subprocess.TimeoutExpired:
             print(f"❌ Quantization timeout (exceeded 10 minutes)")
             return None
         except Exception as e:
             print(f"❌ Quantization error: {e}")
             return None
-    
+
     def cleanup_model(self, model_path):
         """Delete model file if it was created during this session."""
         if model_path in self.created_models:
@@ -322,127 +361,136 @@ def cleanup_model(self, model_path):
                 print(f"⚠️  Failed to delete {model_path.name}: {e}")
         else:
             print(f"ℹ️  Keeping existing file: {model_path.name}")
-    
+
     def run_all_tests(self, threads=16, ctx_size=512):
         """Run perplexity tests on all datasets."""
         datasets = self.find_datasets()
-        
+
         if not datasets:
             print(f"\n❌ No datasets found in {self.data_dir}")
             print(f"   Make sure each dataset directory has a test.txt file")
             return
-        
+
         # Quick mode: test all datasets but only first 4096 chars with smaller context
         if self.quick_mode:
             ctx_size = min(ctx_size, 128)  # Use smaller context in quick mode
             print(f"\n⚡ QUICK TEST MODE ENABLED")
             print(f"   - Testing all datasets with first 4096 characters only")
             print(f"   - Using reduced context size: {ctx_size}")
-        
+
         # Determine models to test
         if self.test_embeddings:
-            print(f"\n{'='*80}")
+            print(f"\n{'=' * 80}")
             print(f"🧪 EMBEDDING QUANTIZATION TEST MODE")
-            print(f"{'='*80}")
+            print(f"{'=' * 80}")
             print(f"📦 Base model: {self.model_path.name}")
             print(f"🔢 Embedding types to test: {len(self.embedding_types)}")
             print(f"📊 Datasets: {len(datasets)}")
             print(f"🧵 Threads: {threads}")
             print(f"📏 Context size: {ctx_size}")
-            print(f"{'='*80}")
-            
+            print(f"{'=' * 80}")
+
             total_start = time.time()
-            
+
             # Test each embedding type
-            for i, (embedding_type, output_suffix) in enumerate(self.embedding_types, 1):
-                print(f"\n\n{'#'*80}")
-                print(f"[{i}/{len(self.embedding_types)}] Testing embedding type: {output_suffix} ({embedding_type})")
-                print(f"{'#'*80}")
-                
+            for i, (embedding_type, output_suffix) in enumerate(
+                self.embedding_types, 1
+            ):
+                print(f"\n\n{'#' * 80}")
+                print(
+                    f"[{i}/{len(self.embedding_types)}] Testing embedding type: {output_suffix} ({embedding_type})"
+                )
+                print(f"{'#' * 80}")
+
                 # Quantize model
                 quantized_model = self.quantize_embedding(embedding_type, output_suffix)
-                
+
                 if quantized_model is None:
-                    print(f"⚠️  Skipping tests for {output_suffix} due to quantization failure")
+                    print(
+                        f"⚠️  Skipping tests for {output_suffix} due to quantization failure"
+                    )
                     continue
-                
+
                 # Test on all datasets
                 for j, dataset in enumerate(datasets, 1):
-                    print(f"\n[{j}/{len(datasets)}] Testing {dataset['name']} with {output_suffix}...")
-                    
+                    print(
+                        f"\n[{j}/{len(datasets)}] Testing {dataset['name']} with {output_suffix}..."
+                    )
+
                     # Use quick dataset if in quick mode
-                    test_path = dataset['path']
+                    test_path = dataset["path"]
                     if self.quick_mode:
-                        test_path = self.create_quick_dataset(dataset['path'])
-                    
+                        test_path = self.create_quick_dataset(dataset["path"])
+
                     result = self.run_perplexity_test(
                         f"{dataset['name']}_embed-{output_suffix}",
                         test_path,
                         threads,
                         ctx_size,
-                        model_override=quantized_model
+                        model_override=quantized_model,
                     )
                     self.results.append(result)
-                
+
                 # Cleanup model after testing
                 print(f"\n🧹 Cleaning up {output_suffix} model...")
                 self.cleanup_model(quantized_model)
-                
-                print(f"\n{'#'*80}")
+
+                print(f"\n{'#' * 80}")
                 print(f"✅ Completed {output_suffix}")
-                print(f"{'#'*80}")
-            
+                print(f"{'#' * 80}")
+
             total_time = time.time() - total_start
-            
+
         else:
             # Regular single model test
-            print(f"\n{'='*80}")
-            print(f"🚀 PERPLEXITY TEST SESSION{' (QUICK MODE)' if self.quick_mode else ''}")
-            print(f"{'='*80}")
+            print(f"\n{'=' * 80}")
+            print(
+                f"🚀 PERPLEXITY TEST SESSION{' (QUICK MODE)' if self.quick_mode else ''}"
+            )
+            print(f"{'=' * 80}")
             print(f"📦 Model: {self.model_path.name}")
             print(f"📁 Model path: {self.model_path}")
-            print(f"📊 Datasets {'to test' if self.quick_mode else 'found'}: {len(datasets)}")
+            print(
+                f"📊 Datasets {'to test' if self.quick_mode else 'found'}: {len(datasets)}"
+            )
             print(f"🧵 Threads: {threads}")
             print(f"📏 Context size: {ctx_size}")
-            print(f"{'='*80}")
-            
+            print(f"{'=' * 80}")
+
             total_start = time.time()
-            
+
             # Run tests
             for i, dataset in enumerate(datasets, 1):
                 print(f"\n\n[{i}/{len(datasets)}] Processing {dataset['name']}...")
-                
+
                 # Use quick dataset if in quick mode
-                test_path = dataset['path']
+                test_path = dataset["path"]
                 if self.quick_mode:
-                    test_path = self.create_quick_dataset(dataset['path'])
-                
+                    test_path = self.create_quick_dataset(dataset["path"])
+
                 result = self.run_perplexity_test(
-                    dataset['name'],
-                    test_path,
-                    threads,
-                    ctx_size
+                    dataset["name"], test_path, threads, ctx_size
                 )
                 self.results.append(result)
-            
+
             total_time = time.time() - total_start
-        
+
         # Clean up temporary files
         if self.quick_mode:
             print(f"\n🧹 Cleaning up temporary files...")
             self.cleanup_temp_files()
-        
+
         # Save results
         self.save_results()
-        
+
         # Print summary
         self.print_summary(total_time)
-    
+
     def save_results(self):
         """Save results to CSV file."""
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         model_name = self.model_path.stem
-        
+
         # Use custom CSV path if provided
         if self.csv_output:
             csv_file = self.csv_output
@@ -450,128 +498,157 @@ def save_results(self):
             csv_file.parent.mkdir(parents=True, exist_ok=True)
         else:
             csv_file = self.output_dir / f"ppl_{model_name}_{timestamp}.csv"
-        
+
         print(f"\n💾 Saving results...")
-        
-        with open(csv_file, 'w', newline='') as f:
-            writer = csv.DictWriter(f, fieldnames=['dataset', 'perplexity', 'time_seconds', 'status'])
+
+        with open(csv_file, "w", newline="") as f:
+            writer = csv.DictWriter(
+                f, fieldnames=["dataset", "perplexity", "time_seconds", "status"]
+            )
             writer.writeheader()
             for result in self.results:
-                writer.writerow({
-                    'dataset': result['dataset'],
-                    'perplexity': result['perplexity'] if result['perplexity'] is not None else 'N/A',
-                    'time_seconds': f"{result['time']:.2f}",
-                    'status': result['status']
-                })
-        
+                writer.writerow(
+                    {
+                        "dataset": result["dataset"],
+                        "perplexity": result["perplexity"]
+                        if result["perplexity"] is not None
+                        else "N/A",
+                        "time_seconds": f"{result['time']:.2f}",
+                        "status": result["status"],
+                    }
+                )
+
         print(f"   ✅ CSV saved: {csv_file}")
-        
+
         # Save detailed log
         log_file = self.output_dir / f"ppl_{model_name}_{timestamp}.log"
-        with open(log_file, 'w') as f:
+        with open(log_file, "w") as f:
             f.write(f"Perplexity Test Results\n")
-            f.write(f"{'='*80}\n")
+            f.write(f"{'=' * 80}\n")
             f.write(f"Model: {self.model_path}\n")
             f.write(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
-            f.write(f"{'='*80}\n\n")
-            
+            f.write(f"{'=' * 80}\n\n")
+
             for result in self.results:
-                f.write(f"\n{'='*80}\n")
+                f.write(f"\n{'=' * 80}\n")
                 f.write(f"Dataset: {result['dataset']}\n")
                 f.write(f"Perplexity: {result['perplexity']}\n")
                 f.write(f"Time: {result['time']:.2f}s\n")
                 f.write(f"Status: {result['status']}\n")
                 f.write(f"\nOutput:\n{result['stdout']}\n")
-                if result['stderr']:
+                if result["stderr"]:
                     f.write(f"\nErrors:\n{result['stderr']}\n")
-        
+
         print(f"   ✅ Log saved: {log_file}")
-    
+
     def print_summary(self, total_time):
         """Print summary of all tests."""
-        print(f"\n\n{'='*80}")
+        print(f"\n\n{'=' * 80}")
         print(f"📊 TEST SUMMARY")
-        print(f"{'='*80}\n")
-        
+        print(f"{'=' * 80}\n")
+
         # Sort results by perplexity (lower is better)
-        successful = [r for r in self.results if r['perplexity'] is not None]
-        failed = [r for r in self.results if r['perplexity'] is None]
-        
+        successful = [r for r in self.results if r["perplexity"] is not None]
+        failed = [r for r in self.results if r["perplexity"] is None]
+
         if successful:
             # Extract numeric value from "mean±std" format for sorting
             def get_ppl_value(result):
-                ppl = result['perplexity']
-                if isinstance(ppl, str) and '±' in ppl:
-                    return float(ppl.split('±')[0])
+                ppl = result["perplexity"]
+                if isinstance(ppl, str) and "±" in ppl:
+                    return float(ppl.split("±")[0])
                 elif isinstance(ppl, str):
                     try:
                         return float(ppl)
                     except ValueError:
-                        return float('inf')
+                        return float("inf")
                 return ppl
-            
+
             successful_sorted = sorted(successful, key=get_ppl_value)
-            
+
             print(f"{'Dataset':<20} {'Perplexity':>20} {'Time (s)':>12} {'Status':<15}")
-            print(f"{'-'*80}")
-            
+            print(f"{'-' * 80}")
+
             for result in successful_sorted:
-                ppl_str = str(result['perplexity']) if result['perplexity'] is not None else 'N/A'
-                print(f"{result['dataset']:<20} {ppl_str:>20} "
-                      f"{result['time']:>12.2f} {result['status']:<15}")
-            
-            best_ppl = str(successful_sorted[0]['perplexity'])
-            print(f"\n🏆 Best result: {successful_sorted[0]['dataset']} "
-                  f"(PPL: {best_ppl})")
-        
+                ppl_str = (
+                    str(result["perplexity"])
+                    if result["perplexity"] is not None
+                    else "N/A"
+                )
+                print(
+                    f"{result['dataset']:<20} {ppl_str:>20} "
+                    f"{result['time']:>12.2f} {result['status']:<15}"
+                )
+
+            best_ppl = str(successful_sorted[0]["perplexity"])
+            print(
+                f"\n🏆 Best result: {successful_sorted[0]['dataset']} (PPL: {best_ppl})"
+            )
+
         if failed:
             print(f"\n❌ Failed tests ({len(failed)}):")
             for result in failed:
                 print(f"   - {result['dataset']}: {result['status']}")
-        
-        print(f"\n{'='*80}")
+
+        print(f"\n{'=' * 80}")
         print(f"✅ Completed: {len(successful)}/{len(self.results)}")
-        print(f"⏱️  Total time: {total_time:.2f}s ({total_time/60:.2f} min)")
+        print(f"⏱️  Total time: {total_time:.2f}s ({total_time / 60:.2f} min)")
         print(f"📁 Results saved in: {self.output_dir}")
-        print(f"{'='*80}\n")
+        print(f"{'=' * 80}\n")
 
 
 def main():
-    parser = argparse.ArgumentParser(description='Test model perplexity on multiple datasets')
-    parser.add_argument('--model', '-m',
-                        required=True,
-                        help='Path to GGUF model file')
-    parser.add_argument('--data-dir', '-d',
-                        default='data',
-                        help='Directory containing dataset folders (default: data)')
-    parser.add_argument('--threads', '-t',
-                        type=int,
-                        default=16,
-                        help='Number of threads (default: 16)')
-    parser.add_argument('--ctx-size', '-c',
-                        type=int,
-                        default=512,
-                        help='Context size (default: 512)')
-    parser.add_argument('--output-dir', '-o',
-                        default='perplexity_results',
-                        help='Output directory for results (default: perplexity_results)')
-    parser.add_argument('--llama-perplexity',
-                        default='./build/bin/llama-perplexity',
-                        help='Path to llama-perplexity binary (default: ./build/bin/llama-perplexity)')
-    parser.add_argument('--quick', '-q',
-                        action='store_true',
-                        help='Quick test mode: test all datasets with first 4096 characters and reduced context size (128)')
-    parser.add_argument('--test-embeddings', '-e',
-                        action='store_true',
-                        help='Test different embedding quantization types (f32, f16, q8_0, q6_k, q5_0, q4_0, q3_k, tq2_0)')
-    parser.add_argument('--csv-output',
-                        help='Custom path for CSV output file (e.g., results/my_ppl_results.csv)')
-    parser.add_argument('--quantize-bin',
-                        default='./build/bin/llama-quantize',
-                        help='Path to llama-quantize binary (default: ./build/bin/llama-quantize)')
-    
+    parser = argparse.ArgumentParser(
+        description="Test model perplexity on multiple datasets"
+    )
+    parser.add_argument("--model", "-m", required=True, help="Path to GGUF model file")
+    parser.add_argument(
+        "--data-dir",
+        "-d",
+        default="data",
+        help="Directory containing dataset folders (default: data)",
+    )
+    parser.add_argument(
+        "--threads", "-t", type=int, default=16, help="Number of threads (default: 16)"
+    )
+    parser.add_argument(
+        "--ctx-size", "-c", type=int, default=512, help="Context size (default: 512)"
+    )
+    parser.add_argument(
+        "--output-dir",
+        "-o",
+        default="perplexity_results",
+        help="Output directory for results (default: perplexity_results)",
+    )
+    parser.add_argument(
+        "--llama-perplexity",
+        default="./build/bin/llama-perplexity",
+        help="Path to llama-perplexity binary (default: ./build/bin/llama-perplexity)",
+    )
+    parser.add_argument(
+        "--quick",
+        "-q",
+        action="store_true",
+        help="Quick test mode: test all datasets with first 4096 characters and reduced context size (128)",
+    )
+    parser.add_argument(
+        "--test-embeddings",
+        "-e",
+        action="store_true",
+        help="Test different embedding quantization types (f32, f16, q8_0, q6_k, q5_0, q4_0, q3_k, tq2_0)",
+    )
+    parser.add_argument(
+        "--csv-output",
+        help="Custom path for CSV output file (e.g., results/my_ppl_results.csv)",
+    )
+    parser.add_argument(
+        "--quantize-bin",
+        default="./build/bin/llama-quantize",
+        help="Path to llama-quantize binary (default: ./build/bin/llama-quantize)",
+    )
+
     args = parser.parse_args()
-    
+
     try:
         tester = PerplexityTester(
             model_path=args.model,
@@ -581,14 +658,11 @@ def main():
             quick_mode=args.quick,
             quantize_bin=args.quantize_bin,
             test_embeddings=args.test_embeddings,
-            csv_output=args.csv_output
-        )
-        
-        tester.run_all_tests(
-            threads=args.threads,
-            ctx_size=args.ctx_size
+            csv_output=args.csv_output,
         )
-        
+
+        tester.run_all_tests(threads=args.threads, ctx_size=args.ctx_size)
+
     except FileNotFoundError as e:
         print(f"❌ Error: {e}")
         return 1
@@ -598,9 +672,10 @@ def main():
     except Exception as e:
         print(f"\n❌ Unexpected error: {e}")
         import traceback
+
         traceback.print_exc()
         return 1
-    
+
     return 0