Skip to content

Commit 951c566

Browse files
authored
Update eval.py
1 parent 05be960 commit 951c566

1 file changed

Lines changed: 5 additions & 36 deletions

File tree

eval.py

Lines changed: 5 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,12 @@
99
from pathlib import Path
1010
from typing import Dict, Any, List, Tuple
1111
from datetime import datetime
12-
13-
# 按照 4:4:2 的比例定义权重(调整后)
# Top-level dimension weights in a 4:4:2 ratio
# (consistency : physicality : aesthetic).
DIMENSION_WEIGHTS = {
    "consistency": 0.4,  # 40% (4/10)
    "physicality": 0.4,  # 40% (4/10)
    "aesthetic": 0.2,  # 20% (2/10)
}
1917

20-
# 各子维度在父维度内的权重(等权重,调整后)
2118
SUB_DIMENSION_WEIGHTS = {
2219
"consistency": {
2320
"semantic_consistency": 0.33,
@@ -83,22 +80,18 @@ def load_json(path: str) -> Dict[str, Dict]:
8380
def extract_scores(txt: str) -> Dict[str, float]:
8481
"""Extract scores from evaluation text using comprehensive patterns."""
8582
patterns = [
86-
# 一致性维度
8783
(r"\*{0,2}Semantic Consistency\*{0,2}\s*[::]?\s*(\d)", "semantic_consistency"),
8884
(r"\*{0,2}Factual Consistency\*{0,2}\s*[::]?\s*(\d)", "factual_consistency"),
8985
(r"\*{0,2}Spatial-Temporal Consistency\*{0,2}\s*[::]?\s*(\d)", "spatial_temporal_consistency"),
9086

91-
# 美学维度(更新)
9287
(r"\*{0,2}Expressiveness\*{0,2}\s*[::]?\s*(\d)", "expressiveness"),
9388
(r"\*{0,2}Artistic Quality\*{0,2}\s*[::]?\s*(\d)", "artistic_quality"),
9489
(r"\*{0,2}Authenticity\*{0,2}\s*[::]?\s*(\d)", "authenticity"),
9590

96-
# 物理性维度
9791
(r"\*{0,2}Basic Properties\*{0,2}\s*[::]?\s*(\d)", "basic_properties"),
9892
(r"\*{0,2}Dynamics and Interactivity\*{0,2}\s*[::]?\s*(\d)", "dynamics_interactivity"),
9993
(r"\*{0,2}Physical Reliability\*{0,2}\s*[::]?\s*(\d)", "physical_reliability"),
10094

101-
# 宽松匹配模式(更新)
10295
(r"(?i)(Semantic Consistency|Factual Consistency|Spatial-Temporal Consistency|Expressiveness|Artistic Quality|Authenticity|Basic Properties|Dynamics and Interactivity|Physical Reliability)\s*[::]?\s*(\d)", "flexible_match")
10396
]
10497

@@ -152,12 +145,11 @@ def find_image_paths(index: str, image_dir: str, steps: List[int]) -> Dict[int,
152145
# Convert index to string and handle zero-padding
153146
index_str = str(index)
154147

155-
# 尝试多种可能的目录结构
156148
possible_dirs = [
157149
os.path.join(image_dir, f"index_{index_str.zfill(4)}"),
158150
os.path.join(image_dir, f"index_{index_str}"),
159151
os.path.join(image_dir, index_str),
160-
image_dir # 直接在当前目录查找
152+
image_dir
161153
]
162154

163155
target_dir = None
@@ -171,7 +163,6 @@ def find_image_paths(index: str, image_dir: str, steps: List[int]) -> Dict[int,
171163
return image_paths
172164

173165
for step in steps:
174-
# 尝试多种可能的文件名模式
175166
possible_filenames = [
176167
f"index_{index_str.zfill(4)}_step_{step}.png",
177168
f"index_{index_str}_step_{step}.png",
@@ -194,7 +185,6 @@ def find_image_paths(index: str, image_dir: str, steps: List[int]) -> Dict[int,
194185
return image_paths
195186

196187
def get_grade(score: float) -> str:
197-
"""根据分数返回等级"""
198188
if score >= 4.5:
199189
return "Excellent"
200190
elif score >= 4.0:
@@ -209,9 +199,6 @@ def get_grade(score: float) -> str:
209199
return "Very Poor"
210200

211201
def calculate_comprehensive_scores(individual_scores: Dict) -> Dict:
212-
"""计算综合评分 - 按照4:4:2权重"""
213-
214-
# 提取各维度分数
215202
consistency_scores = {
216203
"semantic_consistency": individual_scores.get("semantic_consistency", 0),
217204
"factual_consistency": individual_scores.get("factual_consistency", 0),
@@ -230,7 +217,6 @@ def calculate_comprehensive_scores(individual_scores: Dict) -> Dict:
230217
"physical_reliability": individual_scores.get("physical_reliability", 0)
231218
}
232219

233-
# 计算维度平均分(加权)
234220
consistency_avg = sum(
235221
consistency_scores[dim] * SUB_DIMENSION_WEIGHTS["consistency"][dim]
236222
for dim in consistency_scores
@@ -246,42 +232,35 @@ def calculate_comprehensive_scores(individual_scores: Dict) -> Dict:
246232
for dim in physicality_scores
247233
)
248234

249-
# 计算总体分数(按照4:4:2权重)
250235
overall_score = (
251236
consistency_avg * DIMENSION_WEIGHTS["consistency"] +
252237
physicality_avg * DIMENSION_WEIGHTS["physicality"] +
253238
aesthetic_avg * DIMENSION_WEIGHTS["aesthetic"]
254239
)
255240

256241
return {
257-
# 原始分数
258242
**individual_scores,
259243

260-
# 维度平均分
261244
"consistency_score": round(consistency_avg, 2),
262245
"aesthetic_score": round(aesthetic_avg, 2),
263246
"physicality_score": round(physicality_avg, 2),
264247
"overall_score": round(overall_score, 2),
265248

266-
# 权重信息
267249
"weight_info": {
268250
"consistency_weight": DIMENSION_WEIGHTS["consistency"],
269251
"physicality_weight": DIMENSION_WEIGHTS["physicality"],
270252
"aesthetic_weight": DIMENSION_WEIGHTS["aesthetic"],
271253
"total_weight": sum(DIMENSION_WEIGHTS.values())
272254
},
273255

274-
# 简单平均分(不加权,用于对比)
275256
"consistency_avg_simple": round(sum(consistency_scores.values()) / len(consistency_scores), 2),
276257
"aesthetic_avg_simple": round(sum(aesthetic_scores.values()) / len(aesthetic_scores), 2),
277258
"physicality_avg_simple": round(sum(physicality_scores.values()) / len(physicality_scores), 2),
278259
"overall_avg_simple": round(sum(individual_scores.values()) / len(individual_scores), 2),
279260

280-
# 通过率统计
281261
"pass_rate_3": round(sum(1 for score in individual_scores.values() if score >= 3) / len(individual_scores), 2),
282262
"pass_rate_4": round(sum(1 for score in individual_scores.values() if score >= 4) / len(individual_scores), 2),
283263

284-
# 等级评定
285264
"overall_grade": get_grade(overall_score),
286265
"consistency_grade": get_grade(consistency_avg),
287266
"aesthetic_grade": get_grade(aesthetic_avg),
@@ -305,7 +284,7 @@ def build_sequence_evaluation_messages(sequence_data: Dict, image_base64_list: L
305284
# Build image content with proper formatting
306285
image_contents = []
307286
for i, image_base64 in enumerate(image_base64_list):
308-
if image_base64: # 只添加成功编码的图像
287+
if image_base64:
309288
image_contents.append({
310289
"type": "image_url",
311290
"image_url": {
@@ -531,13 +510,11 @@ def evaluate_sequence(index: str, sequence_data: Dict, cfg: Dict) -> Tuple[Dict,
531510
"individual_scores": scores,
532511
"comprehensive_scores": comprehensive_scores
533512
},
534-
{ # score record (简化版,用于分析)
513+
{
535514
"index": index,
536515
"category": sequence_data["category"],
537516
"process_type": sequence_data["process_type"],
538-
# 原始分数
539517
**scores,
540-
# 综合分数
541518
"consistency_score": comprehensive_scores["consistency_score"],
542519
"aesthetic_score": comprehensive_scores["aesthetic_score"],
543520
"physicality_score": comprehensive_scores["physicality_score"],
@@ -554,15 +531,13 @@ def evaluate_sequence(index: str, sequence_data: Dict, cfg: Dict) -> Tuple[Dict,
554531
return None
555532

def calculate_std(scores: List[float]) -> float:
    """Return the sample standard deviation of *scores*.

    The variance is divided by ``len(scores) - 1``. When fewer than two
    values are supplied that denominator would be zero (or negative), so
    0.0 is returned instead.
    """
    if len(scores) <= 1:
        return 0.0
    mean = sum(scores) / len(scores)
    variance = sum((x - mean) ** 2 for x in scores) / (len(scores) - 1)
    return math.sqrt(variance)
563539

564540
def analyze_comprehensive_results(all_scores: List[Dict]) -> Dict:
565-
"""分析综合评分结果"""
566541

567542
if not all_scores:
568543
return {}
@@ -573,14 +548,12 @@ def analyze_comprehensive_results(all_scores: List[Dict]) -> Dict:
573548
"ranking": {},
574549
"summary": {}
575550
}
576-
577-
# 收集所有综合分数
551+
578552
consistency_scores = [s.get("consistency_score", 0) for s in all_scores]
579553
aesthetic_scores = [s.get("aesthetic_score", 0) for s in all_scores]
580554
physicality_scores = [s.get("physicality_score", 0) for s in all_scores]
581555
overall_scores = [s.get("overall_score", 0) for s in all_scores]
582556

583-
# 维度性能分析
584557
for dim_name, scores in [
585558
("consistency", consistency_scores),
586559
("aesthetic", aesthetic_scores),
@@ -594,7 +567,6 @@ def analyze_comprehensive_results(all_scores: List[Dict]) -> Dict:
594567
"std": round(calculate_std(scores), 2)
595568
}
596569

597-
# 按总体分数排序
598570
sorted_indices = sorted(
599571
[(i, s["overall_score"]) for i, s in enumerate(all_scores)],
600572
key=lambda x: x[1],
@@ -606,7 +578,6 @@ def analyze_comprehensive_results(all_scores: List[Dict]) -> Dict:
606578
"bottom_5": [{"index": all_scores[i]["index"], "score": all_scores[i]["overall_score"]} for i, _ in sorted_indices[-5:]]
607579
}
608580

609-
# 总体统计
610581
analysis["summary"] = {
611582
"total_sequences": len(all_scores),
612583
"weight_ratio": "Consistency:Physicality:Aesthetic = 4:4:2",
@@ -704,7 +675,6 @@ def main():
704675
save_results(full_sorted, cfg["result_files"]["full"], cfg)
705676
save_results(score_sorted, cfg["result_files"]["scores"], cfg)
706677

707-
# 生成分析报告
708678
if score_sorted:
709679
analysis = analyze_comprehensive_results(score_sorted)
710680
analysis_path = os.path.join(cfg["output_dir"], "analysis_report.json")
@@ -716,7 +686,6 @@ def main():
716686
}, f, ensure_ascii=False, indent=2)
717687
print(f"Analysis report saved to: {analysis_path}")
718688

719-
# 打印简要报告
720689
print("\n=== EVALUATION SUMMARY ===")
721690
print(f"Total sequences evaluated: {analysis['summary']['total_sequences']}")
722691
print(f"Overall average score: {analysis['summary']['average_overall_score']}")
@@ -729,4 +698,4 @@ def main():
729698
print(f"Evaluation completed. Total sequences: {len(full_sorted)}")
730699

# Run the evaluation pipeline only when executed as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)