99from pathlib import Path
1010from typing import Dict , Any , List , Tuple
1111from datetime import datetime
12-
13- # 按照 4:4:2 的比例定义权重(调整后)
# Weights of the three top-level dimensions, in a 4:4:2 ratio
# (consistency : physicality : aesthetic). They are applied to the
# per-dimension scores when the overall score is computed.
DIMENSION_WEIGHTS = {
    "consistency": 0.4,  # 40% (4/10)
    "physicality": 0.4,  # 40% (4/10)
    "aesthetic": 0.2,  # 20% (2/10)
}
1917
20- # 各子维度在父维度内的权重(等权重,调整后)
2118SUB_DIMENSION_WEIGHTS = {
2219 "consistency" : {
2320 "semantic_consistency" : 0.33 ,
@@ -83,22 +80,18 @@ def load_json(path: str) -> Dict[str, Dict]:
8380def extract_scores (txt : str ) -> Dict [str , float ]:
8481 """Extract scores from evaluation text using comprehensive patterns."""
8582 patterns = [
86- # 一致性维度
8783 (r"\*{0,2}Semantic Consistency\*{0,2}\s*[::]?\s*(\d)" , "semantic_consistency" ),
8884 (r"\*{0,2}Factual Consistency\*{0,2}\s*[::]?\s*(\d)" , "factual_consistency" ),
8985 (r"\*{0,2}Spatial-Temporal Consistency\*{0,2}\s*[::]?\s*(\d)" , "spatial_temporal_consistency" ),
9086
91- # 美学维度(更新)
9287 (r"\*{0,2}Expressiveness\*{0,2}\s*[::]?\s*(\d)" , "expressiveness" ),
9388 (r"\*{0,2}Artistic Quality\*{0,2}\s*[::]?\s*(\d)" , "artistic_quality" ),
9489 (r"\*{0,2}Authenticity\*{0,2}\s*[::]?\s*(\d)" , "authenticity" ),
9590
96- # 物理性维度
9791 (r"\*{0,2}Basic Properties\*{0,2}\s*[::]?\s*(\d)" , "basic_properties" ),
9892 (r"\*{0,2}Dynamics and Interactivity\*{0,2}\s*[::]?\s*(\d)" , "dynamics_interactivity" ),
9993 (r"\*{0,2}Physical Reliability\*{0,2}\s*[::]?\s*(\d)" , "physical_reliability" ),
10094
101- # 宽松匹配模式(更新)
10295 (r"(?i)(Semantic Consistency|Factual Consistency|Spatial-Temporal Consistency|Expressiveness|Artistic Quality|Authenticity|Basic Properties|Dynamics and Interactivity|Physical Reliability)\s*[::]?\s*(\d)" , "flexible_match" )
10396 ]
10497
@@ -152,12 +145,11 @@ def find_image_paths(index: str, image_dir: str, steps: List[int]) -> Dict[int,
152145 # Convert index to string and handle zero-padding
153146 index_str = str (index )
154147
155- # 尝试多种可能的目录结构
156148 possible_dirs = [
157149 os .path .join (image_dir , f"index_{ index_str .zfill (4 )} " ),
158150 os .path .join (image_dir , f"index_{ index_str } " ),
159151 os .path .join (image_dir , index_str ),
160- image_dir # 直接在当前目录查找
152+ image_dir
161153 ]
162154
163155 target_dir = None
@@ -171,7 +163,6 @@ def find_image_paths(index: str, image_dir: str, steps: List[int]) -> Dict[int,
171163 return image_paths
172164
173165 for step in steps :
174- # 尝试多种可能的文件名模式
175166 possible_filenames = [
176167 f"index_{ index_str .zfill (4 )} _step_{ step } .png" ,
177168 f"index_{ index_str } _step_{ step } .png" ,
@@ -194,7 +185,6 @@ def find_image_paths(index: str, image_dir: str, steps: List[int]) -> Dict[int,
194185 return image_paths
195186
196187def get_grade (score : float ) -> str :
197- """根据分数返回等级"""
198188 if score >= 4.5 :
199189 return "Excellent"
200190 elif score >= 4.0 :
@@ -209,9 +199,6 @@ def get_grade(score: float) -> str:
209199 return "Very Poor"
210200
211201def calculate_comprehensive_scores (individual_scores : Dict ) -> Dict :
212- """计算综合评分 - 按照4:4:2权重"""
213-
214- # 提取各维度分数
215202 consistency_scores = {
216203 "semantic_consistency" : individual_scores .get ("semantic_consistency" , 0 ),
217204 "factual_consistency" : individual_scores .get ("factual_consistency" , 0 ),
@@ -230,7 +217,6 @@ def calculate_comprehensive_scores(individual_scores: Dict) -> Dict:
230217 "physical_reliability" : individual_scores .get ("physical_reliability" , 0 )
231218 }
232219
233- # 计算维度平均分(加权)
234220 consistency_avg = sum (
235221 consistency_scores [dim ] * SUB_DIMENSION_WEIGHTS ["consistency" ][dim ]
236222 for dim in consistency_scores
@@ -246,42 +232,35 @@ def calculate_comprehensive_scores(individual_scores: Dict) -> Dict:
246232 for dim in physicality_scores
247233 )
248234
249- # 计算总体分数(按照4:4:2权重)
250235 overall_score = (
251236 consistency_avg * DIMENSION_WEIGHTS ["consistency" ] +
252237 physicality_avg * DIMENSION_WEIGHTS ["physicality" ] +
253238 aesthetic_avg * DIMENSION_WEIGHTS ["aesthetic" ]
254239 )
255240
256241 return {
257- # 原始分数
258242 ** individual_scores ,
259243
260- # 维度平均分
261244 "consistency_score" : round (consistency_avg , 2 ),
262245 "aesthetic_score" : round (aesthetic_avg , 2 ),
263246 "physicality_score" : round (physicality_avg , 2 ),
264247 "overall_score" : round (overall_score , 2 ),
265248
266- # 权重信息
267249 "weight_info" : {
268250 "consistency_weight" : DIMENSION_WEIGHTS ["consistency" ],
269251 "physicality_weight" : DIMENSION_WEIGHTS ["physicality" ],
270252 "aesthetic_weight" : DIMENSION_WEIGHTS ["aesthetic" ],
271253 "total_weight" : sum (DIMENSION_WEIGHTS .values ())
272254 },
273255
274- # 简单平均分(不加权,用于对比)
275256 "consistency_avg_simple" : round (sum (consistency_scores .values ()) / len (consistency_scores ), 2 ),
276257 "aesthetic_avg_simple" : round (sum (aesthetic_scores .values ()) / len (aesthetic_scores ), 2 ),
277258 "physicality_avg_simple" : round (sum (physicality_scores .values ()) / len (physicality_scores ), 2 ),
278259 "overall_avg_simple" : round (sum (individual_scores .values ()) / len (individual_scores ), 2 ),
279260
280- # 通过率统计
281261 "pass_rate_3" : round (sum (1 for score in individual_scores .values () if score >= 3 ) / len (individual_scores ), 2 ),
282262 "pass_rate_4" : round (sum (1 for score in individual_scores .values () if score >= 4 ) / len (individual_scores ), 2 ),
283263
284- # 等级评定
285264 "overall_grade" : get_grade (overall_score ),
286265 "consistency_grade" : get_grade (consistency_avg ),
287266 "aesthetic_grade" : get_grade (aesthetic_avg ),
@@ -305,7 +284,7 @@ def build_sequence_evaluation_messages(sequence_data: Dict, image_base64_list: L
305284 # Build image content with proper formatting
306285 image_contents = []
307286 for i , image_base64 in enumerate (image_base64_list ):
308- if image_base64 : # 只添加成功编码的图像
287+ if image_base64 :
309288 image_contents .append ({
310289 "type" : "image_url" ,
311290 "image_url" : {
@@ -531,13 +510,11 @@ def evaluate_sequence(index: str, sequence_data: Dict, cfg: Dict) -> Tuple[Dict,
531510 "individual_scores" : scores ,
532511 "comprehensive_scores" : comprehensive_scores
533512 },
534- { # score record (简化版,用于分析)
513+ {
535514 "index" : index ,
536515 "category" : sequence_data ["category" ],
537516 "process_type" : sequence_data ["process_type" ],
538- # 原始分数
539517 ** scores ,
540- # 综合分数
541518 "consistency_score" : comprehensive_scores ["consistency_score" ],
542519 "aesthetic_score" : comprehensive_scores ["aesthetic_score" ],
543520 "physicality_score" : comprehensive_scores ["physicality_score" ],
@@ -554,15 +531,13 @@ def evaluate_sequence(index: str, sequence_data: Dict, cfg: Dict) -> Tuple[Dict,
554531 return None
555532
def calculate_std(scores: List[float]) -> float:
    """Return the sample standard deviation of *scores*.

    The variance is computed with an ``n - 1`` denominator.

    Args:
        scores: Numeric values to summarize.

    Returns:
        ``0.0`` when fewer than two values are supplied, otherwise the
        square root of the sample variance.
    """
    count = len(scores)
    # With zero or one value the n - 1 denominator would be <= 0, so the
    # spread is reported as 0.0 instead of raising.
    if count <= 1:
        return 0.0
    mean = sum(scores) / count
    variance = sum((x - mean) ** 2 for x in scores) / (count - 1)
    return math.sqrt(variance)
563539
564540def analyze_comprehensive_results (all_scores : List [Dict ]) -> Dict :
565- """分析综合评分结果"""
566541
567542 if not all_scores :
568543 return {}
@@ -573,14 +548,12 @@ def analyze_comprehensive_results(all_scores: List[Dict]) -> Dict:
573548 "ranking" : {},
574549 "summary" : {}
575550 }
576-
577- # 收集所有综合分数
551+
578552 consistency_scores = [s .get ("consistency_score" , 0 ) for s in all_scores ]
579553 aesthetic_scores = [s .get ("aesthetic_score" , 0 ) for s in all_scores ]
580554 physicality_scores = [s .get ("physicality_score" , 0 ) for s in all_scores ]
581555 overall_scores = [s .get ("overall_score" , 0 ) for s in all_scores ]
582556
583- # 维度性能分析
584557 for dim_name , scores in [
585558 ("consistency" , consistency_scores ),
586559 ("aesthetic" , aesthetic_scores ),
@@ -594,7 +567,6 @@ def analyze_comprehensive_results(all_scores: List[Dict]) -> Dict:
594567 "std" : round (calculate_std (scores ), 2 )
595568 }
596569
597- # 按总体分数排序
598570 sorted_indices = sorted (
599571 [(i , s ["overall_score" ]) for i , s in enumerate (all_scores )],
600572 key = lambda x : x [1 ],
@@ -606,7 +578,6 @@ def analyze_comprehensive_results(all_scores: List[Dict]) -> Dict:
606578 "bottom_5" : [{"index" : all_scores [i ]["index" ], "score" : all_scores [i ]["overall_score" ]} for i , _ in sorted_indices [- 5 :]]
607579 }
608580
609- # 总体统计
610581 analysis ["summary" ] = {
611582 "total_sequences" : len (all_scores ),
612583 "weight_ratio" : "Consistency:Physicality:Aesthetic = 4:4:2" ,
@@ -704,7 +675,6 @@ def main():
704675 save_results (full_sorted , cfg ["result_files" ]["full" ], cfg )
705676 save_results (score_sorted , cfg ["result_files" ]["scores" ], cfg )
706677
707- # 生成分析报告
708678 if score_sorted :
709679 analysis = analyze_comprehensive_results (score_sorted )
710680 analysis_path = os .path .join (cfg ["output_dir" ], "analysis_report.json" )
@@ -716,7 +686,6 @@ def main():
716686 }, f , ensure_ascii = False , indent = 2 )
717687 print (f"Analysis report saved to: { analysis_path } " )
718688
719- # 打印简要报告
720689 print ("\n === EVALUATION SUMMARY ===" )
721690 print (f"Total sequences evaluated: { analysis ['summary' ]['total_sequences' ]} " )
722691 print (f"Overall average score: { analysis ['summary' ]['average_overall_score' ]} " )
@@ -729,4 +698,4 @@ def main():
729698 print (f"Evaluation completed. Total sequences: { len (full_sorted )} " )
730699
# Run the evaluation pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments