70b analysis doc
This commit is contained in:
parent
81dc8bdcbe
commit
cddbbf844d
@ -78,13 +78,19 @@ The tool currently tests models on these coding challenges:
|
|||||||
3. Initial syntax validation is performed
|
3. Initial syntax validation is performed
|
||||||
4. Code that fails validation is passed to Together API for advanced code analysis
|
4. Code that fails validation is passed to Together API for advanced code analysis
|
||||||
5. Code that passes validation is executed and validated with given data and compared to expected results
|
5. Code that passes validation is executed and validated with given data and compared to expected results
|
||||||
6. Each test is run 4 times for consistency and only the last 3 results are used for metrics
|
|
||||||
|
|
||||||
### Test Validation
|
### Test Validation
|
||||||
For each test case:
|
For each test case:
|
||||||
- Input values are provided to the function
|
- Input values are provided to the function
|
||||||
- Output is compared with expected results
|
- Output is compared with expected results
|
||||||
- Test results are marked as ✅ (pass) or ❌ (fail)
|
- Test results are marked as ✅ (pass) or ❌ (fail)
|
||||||
|
- The last 3 results of 4 are used for metrics
|
||||||
|
- An online 70B model (meta-llama/Llama-3.3-70B-Instruct-Turbo-Free, via the free Together API) is used to give structured explanations for failed code
|
||||||
|
|
||||||
|
### Metrics
|
||||||
|
- Success Rate: Percentage of successful test cases
|
||||||
|
- Tokens per Second (tk/sec): Number of tokens processed per second
|
||||||
|
- Response Time: Time taken to generate a response
|
||||||
|
|
||||||
Example test cases:
|
Example test cases:
|
||||||
```plaintext
|
```plaintext
|
||||||
|
8
main.py
8
main.py
@ -111,8 +111,12 @@ Model: {model}"""
|
|||||||
|
|
||||||
analysis = response.choices[0].message.content
|
analysis = response.choices[0].message.content
|
||||||
should_pass = "SHOULD PASS" in analysis.upper()
|
should_pass = "SHOULD PASS" in analysis.upper()
|
||||||
if verbose: print(f"\n{BLUE}[{model}] Together Analysis:{ENDC}")
|
if verbose:
|
||||||
if verbose: print(f"{GREEN if should_pass else RED}{analysis}{ENDC}")
|
print(f"\n{BLUE}{'='*50}{ENDC}")
|
||||||
|
print(f"{BLUE}[{model}] TOGETHER API CODE ANALYSIS:{ENDC}")
|
||||||
|
print(f"{BLUE}{'='*50}{ENDC}")
|
||||||
|
print(f"{GREEN if should_pass else RED}{analysis}{ENDC}")
|
||||||
|
print(f"{BLUE}{'='*50}{ENDC}")
|
||||||
return should_pass
|
return should_pass
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"\n{RED}Error getting Together API analysis: {e}{ENDC}")
|
print(f"\n{RED}Error getting Together API analysis: {e}{ENDC}")
|
||||||
|
Loading…
Reference in New Issue
Block a user