qualitative.py
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Helper functions for qualitative evaluation of solutions."""

import os
import re
import ast

from openai import OpenAI

from aiopslab.session import SessionItem
from aiopslab.utils.cache import LLMCache
from aiopslab.orchestrator.evaluators.prompts import SCORER_PROMPTS


class LLMJudge:
    """An LLM-as-a-judge that evaluates the quality of a solution."""

    def __init__(self, trace: list[SessionItem]):
        self.trace = trace
        self.llm = GPT4Turbo()
        self.prompt = None
        self._format_trace()

    def reasoning_score(self) -> tuple[int, str]:
        """Generate a 1-10 score based on the agent's response to a task."""
        # Copy the template so the shared SCORER_PROMPTS dict is not
        # mutated in place across LLMJudge instances.
        self.prompt = dict(SCORER_PROMPTS)
        self.prompt["user"] = self.prompt["user"].format(trace=self.trace)
        judgement = self.llm.inference(self._get_payload())[0]
        score = self._parse_score(judgement)
        return score, judgement

    # helper functions

    def _get_payload(self):
        """Prepare the payload for the LLM."""
        # Maps the prompt dict ({"system": ..., "user": ...}) to the chat
        # message format: [{"role": ..., "content": ...}, ...].
        payload = []
        for role, content in self.prompt.items():
            payload.append({"role": role, "content": content})
        return payload

    def _format_trace(self):
        """Format the trace (a list of SessionItems) into one string for the LLM."""
        item2str = lambda item: f"###{item.role}:\n{item.content}\n\n"
        self.trace = "".join([item2str(item) for item in self.trace])

    def _parse_score(self, judgement: str) -> int:
        """Parse the score from the judgement."""
        one_score_pattern = re.compile(r"\[\[(\d+\.?\d*)\]\]")
        one_score_pattern_backup = re.compile(r"\[(\d+\.?\d*)\]")

        match = re.search(one_score_pattern, judgement)
        if not match:
            match = re.search(one_score_pattern_backup, judgement)

        if match:
            score = ast.literal_eval(match.groups()[0])
        else:
            score = -1

        return score
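
# A minimal sketch of the scoring convention assumed by _parse_score above
# (inferred from the two regexes, not from the SCORER_PROMPTS text itself):
# the judge is expected to wrap its verdict as [[N]] or, as a fallback, [N],
# and anything else yields -1. Illustrative judgement strings and results:
#
#   "Reasoning looks sound. Rating: [[8]]"    -> 8
#   "Partial fix only. Rating: [7.5]"         -> 7.5
#   "The agent never diagnosed the fault."    -> -1
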
class GPT4Turbo:
    """An abstraction of the GPT-4 Turbo model (default judge)."""

    def __init__(self):
        self.cache = LLMCache()

    def inference(self, payload: list[dict[str, str]]) -> list[str]:
        if self.cache is not None:
            cache_result = self.cache.get_from_cache(payload)
            if cache_result is not None:
                return cache_result

        client = OpenAI(api_key=os.getenv("OPENAI_KEY"))
        try:
            response = client.chat.completions.create(
                messages=payload,  # type: ignore
                model="gpt-4-turbo-2024-04-09",
                max_tokens=1024,
                temperature=0.0,
                top_p=0.95,
                frequency_penalty=0.0,
                presence_penalty=0.0,
                n=1,
                timeout=60,
                stop=[],
            )
        except Exception as e:
            print(f"Exception: {repr(e)}")
            raise e

        response = [c.message.content for c in response.choices]  # type: ignore

        if self.cache is not None:
            self.cache.add_to_cache(payload, response)
            self.cache.save_cache()

        return response
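
# A minimal usage sketch. The trace below is hypothetical, and the
# SessionItem(role=..., content=...) constructor call is an assumption;
# all that _format_trace actually requires is that each item expose
# `role` and `content` attributes. Also assumes OPENAI_KEY is set in the
# environment, since GPT4Turbo reads it via os.getenv.
#
#   trace = [
#       SessionItem(role="assistant", content="kubectl get pods -n test"),
#       SessionItem(role="env", content="geo-7c64f  0/1  CrashLoopBackOff"),
#   ]
#   judge = LLMJudge(trace)
#   score, judgement = judge.reasoning_score()  # e.g. (8, "... [[8]]")
#   print(score, judgement)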