diff --git a/.DS_Store b/.DS_Store
index 96bed27..8fbf2d4 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/README.md b/README.md
index e436279..78e6e91 100644
--- a/README.md
+++ b/README.md
@@ -78,13 +78,19 @@ The tool currently tests models on these coding challenges:
 3. Initial syntax validation is performed
 4. Code that fails validation is passed to Together API for advanced code analysis
 5. Code that passes validation is executed and validated with given data and compared to expected results
-6. Each test is run 4 times for consistency and only the last 3 results are used for metrics
 
 ### Test Validation
 For each test case:
 - Input values are provided to the function
 - Output is compared with expected results
 - Test results are marked as ✅ (pass) or ❌ (fail)
+- Each test is run 4 times; only the last 3 results are used for metrics
+- An online 70B model, meta-llama/Llama-3.3-70B-Instruct-Turbo-Free (via the free Together API), is used to give structured explanations for failed code
+
+### Metrics
+- Success Rate: Percentage of successful test cases
+- Tokens per Second (tk/sec): Number of tokens processed per second
+- Response Time: Time taken to generate a response
 
 Example test cases:
 ```plaintext
diff --git a/main.py b/main.py
index 1b02210..bc4f7fe 100755
--- a/main.py
+++ b/main.py
@@ -111,8 +111,12 @@ Model: {model}"""
         analysis = response.choices[0].message.content
         should_pass = "SHOULD PASS" in analysis.upper()
 
-        if verbose: print(f"\n{BLUE}[{model}] Together Analysis:{ENDC}")
-        if verbose: print(f"{GREEN if should_pass else RED}{analysis}{ENDC}")
+        if verbose:
+            print(f"\n{BLUE}{'='*50}{ENDC}")
+            print(f"{BLUE}[{model}] TOGETHER API CODE ANALYSIS:{ENDC}")
+            print(f"{BLUE}{'='*50}{ENDC}")
+            print(f"{GREEN if should_pass else RED}{analysis}{ENDC}")
+            print(f"{BLUE}{'='*50}{ENDC}")
         return should_pass
     except Exception as e:
         print(f"\n{RED}Error getting Together API analysis: {e}{ENDC}")
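
Reviewer note: the analysis step these hunks wire up amounts to one chat-completion call against the free 70B model named in the README. A minimal sketch, assuming the official `together` Python client; the helper name `analyze_failed_code` and the prompt wording are illustrative, not the repo's actual code (only the `response.choices[0].message.content` access and the `"SHOULD PASS"` check are taken from the diff above):

```python
# Minimal sketch of the Together API analysis step, assuming the official
# `together` Python client. Helper name and prompt text are illustrative.
from together import Together

client = Together()  # reads TOGETHER_API_KEY from the environment

def analyze_failed_code(code: str, error: str) -> bool:
    """Ask the free 70B model whether failed code should actually pass."""
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
        messages=[{
            "role": "user",
            "content": f"Analyze this code and its error, then reply with "
                       f"SHOULD PASS or SHOULD FAIL plus a structured "
                       f"explanation.\n\nCode:\n{code}\n\nError:\n{error}",
        }],
    )
    analysis = response.choices[0].message.content
    # Same verdict check the main.py hunk above performs.
    return "SHOULD PASS" in analysis.upper()
```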
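
Likewise, the new Metrics section reduces to simple arithmetic over the last 3 of 4 runs. An illustrative sketch; the per-run fields (`passed`, `tokens`, `seconds`) are assumptions, not the repo's actual schema:

```python
# Illustrative metrics computation; per-run field names are assumptions.
def summarize(runs: list[dict]) -> dict:
    """Compute the README's metrics from per-run records.

    Each record is assumed to hold: passed (bool), tokens (int), seconds (float).
    """
    scored = runs[-3:]  # only the last 3 of 4 runs count, per the README
    total_seconds = sum(r["seconds"] for r in scored)
    return {
        "success_rate": 100.0 * sum(r["passed"] for r in scored) / len(scored),  # % passing
        "tokens_per_second": sum(r["tokens"] for r in scored) / total_seconds,   # tk/sec
        "response_time": total_seconds / len(scored),                            # avg seconds
    }
```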