# sync_config_from_gf.py
import os
import json
import pgf
import sys
# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
# Compiled grammar read by the sync, relative to the working directory.
PGF_PATH = os.path.join("gf", "Wiki.pgf")
# JSON file holding the language profiles, keyed by ISO code.
PROFILES_PATH = os.path.join("language_profiles", "profiles.json")

# Schema concept -> candidate GF abstract function names, tried in list order.
# Keep these names in step with what the .gf sources actually define.
CONCEPT_MAP = {
    "copula": [
        "lex_copula_V",
        "be_V",
        "is_V",
    ],
}

# ISO code -> GF concrete syntax name, for languages whose concrete name
# cannot be derived from the ISO code by the "Wiki" + code heuristic.
# Add further manual exceptions here.
ISO_TO_GF_MANUAL = {
    # Example: the ISO code is zho, while the grammar might use Chi/Zho.
    "zho": "WikiChi",
    # Persian is often named Pes in standard libraries.
    "fas": "WikiPes",
}
def load_json(path):
    """Read the JSON document stored at ``path``.

    Returns an empty dict when nothing exists at ``path``; otherwise returns
    whatever ``json.load`` decodes from the file, which is opened as UTF-8.
    Decoding errors from ``json.load`` are not caught here.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return {}
def save_json(path, data):
    """Write ``data`` to ``path`` as JSON, replacing any existing content.

    The file is written as UTF-8 with a two-space indent, and non-ASCII
    characters are emitted as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(path, mode="w", encoding="utf-8") as out:
        json.dump(data, out, ensure_ascii=False, indent=2)
def get_linearization(concrete, function_name):
    """Linearize a single argument-less (lexical) function, best-effort.

    ``function_name`` is parsed with ``pgf.readExpr`` and the resulting
    expression is passed to ``concrete.linearize``. Any exception raised by
    either step is swallowed and reported as ``None``, so callers can probe
    function names without guarding each call themselves.
    """
    try:
        # A lexical item takes no arguments, so its bare name is already a
        # complete expression tree.
        tree = pgf.readExpr(function_name)
        text = concrete.linearize(tree)
    except Exception:
        return None
    return text
def find_copula(concrete):
    """Return the copula string for ``concrete``, or ``None`` if unavailable.

    The candidate function names in ``CONCEPT_MAP["copula"]`` are tried in
    order. The first one that the concrete syntax has a linearization for,
    and that linearizes to a non-empty string, wins.
    """
    for fn in CONCEPT_MAP["copula"]:
        # ``hasLinearization`` is the membership test the pgf concrete-syntax
        # object provides; the ``hasFunctions`` call used here previously is
        # not part of that API and raised AttributeError on the first
        # candidate.
        if not concrete.hasLinearization(fn):
            continue
        text = get_linearization(concrete, fn)
        if text:
            return text
    return None
# -----------------------------------------------------------------------------
# MAIN LOGIC
# -----------------------------------------------------------------------------
def run():
    """Sync the per-language JSON cards under ``data/`` from the PGF grammar.

    For every profile in ``PROFILES_PATH`` the card at
    ``data/<family>/<iso_code>.json`` is loaded (or started empty), the
    matching GF concrete syntax is located, and the copula found by
    ``find_copula`` is stored under the ``"copula"`` key before the card is
    written back.

    Exits the process with status 1 when the PGF file is missing or when no
    profiles could be loaded.
    """
    print(f"🚀 Syncing Language Cards from GF: {PGF_PATH}")

    if not os.path.exists(PGF_PATH):
        print("❌ PGF file missing. Please build your grammar first.")
        sys.exit(1)

    grammar = pgf.readPGF(PGF_PATH)

    profiles = load_json(PROFILES_PATH)
    if not profiles:
        print("❌ No profiles found in language_profiles/profiles.json")
        sys.exit(1)

    synced = 0
    for iso_code, profile in profiles.items():
        # 1. Locate the card on disk; "analytic" is the default family folder.
        card_dir = os.path.join("data", profile.get("family", "analytic"))
        card_path = os.path.join(card_dir, f"{iso_code}.json")
        os.makedirs(card_dir, exist_ok=True)

        # Start from the existing card when there is one, else from {}.
        card = load_json(card_path)

        # 2. Resolve the concrete syntax name: manual table first, then
        #    guesses built from the ISO code.
        concrete_name = ISO_TO_GF_MANUAL.get(iso_code)
        if not concrete_name:
            guesses = (
                f"Wiki{iso_code.capitalize()}",  # eng -> WikiEng
                f"Wiki{iso_code.upper()}",  # eng -> WikiENG
                f"Food{iso_code.capitalize()}",  # Food example grammar
            )
            concrete_name = next(
                (guess for guess in guesses if guess in grammar.languages),
                None,
            )

        if not (concrete_name and concrete_name in grammar.languages):
            print(f"⚠️ [{iso_code}] No matching GF concrete grammar found. Skipping GF sync.")
            # Still create the card if it is absent, so later readers do not
            # hit a missing file.
            if not os.path.exists(card_path):
                save_json(card_path, card)
            continue

        concrete = grammar.languages[concrete_name]
        print(f" [{iso_code}] Found Grammar: {concrete_name}")

        # 3. Extract data from GF.
        # Copula: this stores the default linearization, which for a verb
        # such as "be_V" is often the infinitive rather than a present-tense
        # form.
        lemma = find_copula(concrete)
        if lemma:
            card["copula"] = {"lemma": lemma, "source": "gf_extraction"}
            print(f" - Copula: {lemma}")

        # Articles are deliberately not extracted: doing so generically needs
        # knowledge of the grammar's inflection tables, so they are left to
        # manual entry or populate_data.py instead of risking bad data.

        # 4. Persist the card.
        save_json(card_path, card)
        synced += 1

    print(f"\n✅ Sync Complete. Updated {synced} language cards.")


if __name__ == "__main__":
    run()