import asyncio
from typing import Dict, List

import numpy as np
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
# ============== Parser ==============
# Result for a single evaluation metric.
class Metric_Score(BaseModel):
    """Evaluation result for one metric: its name, the grader's reasoning, and a score.

    Instances are produced by the LLM via ``PydanticOutputParser``; the Field
    descriptions below are rendered into the parser's format instructions.
    """

    # Name of the metric being evaluated (one of the keys of metric2description).
    # Bug fix: "evaluted" -> "evaluated" in the description text.
    metric_name: str = Field(description="This is the name of the metric currently evaluated. ")
    # Free-text chain-of-thought justifying the score for this metric.
    reasoning: str = Field(description="This is the reasoning process of evaluating the triplet according to the metric. ")
    # Final numeric rating derived from the reasoning above.
    score: float = Field(description="This is a decisive score based on the reasoning process above. ")
# Combine the per-metric results into a single model.
class Multi_Metric_Score(BaseModel):
    """Container for the list of ``Metric_Score`` results returned in one rating call."""

    # One entry per metric in metric2description.
    result: List[Metric_Score] = Field(description="A list of results.")


# Structured-output parser that coerces the LLM response into Multi_Metric_Score.
output_parser = PydanticOutputParser(pydantic_object=Multi_Metric_Score)
# NOTE(review): ThinkContentParser is not imported here — presumably defined
# elsewhere in this file (likely strips reasoning/"think" content before
# delegating to output_parser); confirm its origin.
wrap_output_parser = ThinkContentParser(output_parser)
# ============== Prompt ==============
# Mapping of metric name -> the rating criterion shown to the model.
# Rendered into the prompt as one "- name:description" bullet per metric.
metric2description = {
    "accuracy": "Ensure that the output is logically correct and factually accurate, especially for questions with clear answers, such as knowledge-based Q&A.",
    "effectiveness": "Evaluate whether the content of the answer meets the user's needs, not just superficial relevance.",
    "readability": "Ensure that the answer is easy for the user to read and understand, avoiding excessive jargon or complex sentence structures.",
    "relevance": "Evaluate whether the model accurately understands the question and maintains contextual coherence in the answer.",
}

# NOTE(review): no space after ':' is kept deliberately to preserve the exact
# prompt text produced by the original code.
metric_str = "\n".join(
    f"- {name}:{criterion}" for name, criterion in metric2description.items()
)
# Prompt template for rating an (instruction, input, output) triplet.
# Placeholders filled at call time: {metric_str}, {score_scale}, {format},
# {instruction}, {input}, {output}, {language}.
rating_prompt = (
    # Bug fix: the original first fragment ended at "data." with no trailing
    # space, so the concatenated prompt read "...the data.You will be given...".
    "You are a helpful and precise assistant for checking the quality of the data. "
    "You will be given a triplet, including the instruction, the optional input and the output. "
    "Note that the input will be blank if the instruction needs no input to generate the output. \n"
    "Please rate the quality of the triplet according to the following standards:\n"
    "\n{metric_str}\n"
    "You will rate on a scale of 0 to {score_scale}, where a higher score indicates higher level of the accuracy. \n"
    "Output format: {format}\n\n"
    "[START of instruction]:\n{instruction}\n[END of instruction]\n\n"
    "[START of input]:\n{input}\n[END of input]\n\n"
    "[START of output]:\n{output}\n[END of output]\n\n"
    "Present the response in {language} language. \n"
)