# translate-codelists/translate.py at master · codeforIATI/translate-codelists · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import click
from lxml import etree
import xlrd, xlwt, mmap
from xml.dom import minidom

# Prefix-to-URI map handed to the ``find`` calls in this file (``namespaces=nsmap``)
# so the ``xml:`` prefix in their ``@xml:lang`` predicates can be resolved.
nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}

def XLSDictReader(f, sheet_index=0):
    """Return a generator of ``{header: value}`` dicts, one per data row.

    The workbook is loaded from a read-only memory map of ``f`` (an open file
    object that exposes ``fileno()``). The first row of the sheet selected by
    ``sheet_index`` supplies the dictionary keys; every following row becomes
    one dictionary. The result is a generator, so it can be consumed once.
    """
    mapped = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
    workbook = xlrd.open_workbook(file_contents=mapped)
    worksheet = workbook.sheet_by_index(sheet_index)
    # Column index -> header text taken from row 0.
    column_names = {
        col: worksheet.cell_value(0, col) for col in range(worksheet.ncols)
    }
    return (
        {name: worksheet.cell_value(row, col) for col, name in column_names.items()}
        for row in range(1, worksheet.nrows)
    )


def get_text(element):
    """Return ``element.text``, or an empty string when ``element`` is ``None``."""
    return "" if element is None else element.text


# Thanks to https://infix.se/2007/02/06/gentlemen-indent-your-xml
def indent(elem, level=0):
    """Re-indent ``elem`` and its descendants in place, 4 spaces per level.

    Only ``text``/``tail`` values that are empty or whitespace-only are
    rewritten, so real character data is left untouched. ``level`` is the
    nesting depth of ``elem``; the tail of the element the caller starts
    from (level 0) is never modified.
    """
    own_pad = "\n" + "    " * level
    child_pad = own_pad + "    "

    def is_blank(value):
        # True for None, "" and whitespace-only strings.
        return not value or not value.strip()

    if not len(elem):
        # Leaf element: only its tail needs aligning, and not at the top level.
        if level and is_blank(elem.tail):
            elem.tail = own_pad
        return

    if is_blank(elem.text):
        elem.text = child_pad

    last_child = None
    for child in elem:
        indent(child, level + 1)
        if is_blank(child.tail):
            child.tail = child_pad
        last_child = child

    # The final child is followed by this element's closing tag, which sits
    # one level shallower than the children themselves.
    if is_blank(last_child.tail):
        last_child.tail = own_pad


@click.group()
def main():
    """Move codelist translations between XML codelist files and Excel spreadsheets."""


def _normalise_code(value):
    """Return a spreadsheet cell value as the string form of a codelist code.

    xlrd reports numeric cells as floats, so a code stored as the number
    ``110`` arrives as ``110.0``. Whole-number floats are rendered without
    the fractional part so they can match the text of an XML ``<code>``.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _set_narrative(item, tag, lang, text, code):
    """Set the ``lang`` narrative under ``item/<tag>`` to ``text``, adding it if absent.

    ``code`` is only used to identify the item in the error raised when the
    item has no ``<tag>`` child to attach a new narrative to.
    """
    narrative = item.find('{}/narrative[@xml:lang="{}"]'.format(tag, lang), namespaces=nsmap)
    if narrative is None:
        parent = item.find(tag)
        if parent is None:
            raise click.ClickException(
                "Codelist item '{}' has no <{}> element to add a translation to.".format(code, tag))
        narrative = etree.SubElement(parent, "narrative")
        narrative.set('{http://www.w3.org/XML/1998/namespace}lang', lang)
    narrative.text = text


@main.command()
@click.option('--existing_codelist_filename', help="The filename of the existing XML codelists file, e.g. 'Sector.xml'.")
@click.option('--output_filename', help="The output filename of the new XML codelists file incorporating the translations.")
@click.option('--new_translation_filename', help="The filename of the Excel translation of the existing codelists file.")
@click.option('--lang', help="The language of the translations file in lowercase, e.g. 'fr' for French.")
def merge_translations(existing_codelist_filename, output_filename, new_translation_filename, lang):
    """Merge translations from a provided Excel translations
    file into an existing XML file."""
    # NOTE: the docstring above is also the command's --help text, so further
    # documentation lives in comments.
    #
    # For every spreadsheet row, the codelist item whose <code> matches the
    # row's "code" cell gets its ``lang`` narrative for name and description
    # created or overwritten. Empty name/description cells leave the XML as is.
    # Raises click.ClickException when a row's code is not in the codelist or
    # the item lacks the <name>/<description> element a translation needs.
    parser = etree.XMLParser(remove_blank_text=True)
    # Binary mode: the parser receives the raw bytes instead of text decoded
    # with the platform's default encoding.
    with open(existing_codelist_filename, "rb") as codelist_xml_file:
        codelist_xml = etree.parse(codelist_xml_file, parser)

    with open(new_translation_filename, "rb") as translated_file:
        # XLSDictReader returns a lazy generator, so consume it while the
        # file is still open.
        for one_code in XLSDictReader(translated_file):
            if one_code["code"] == "":
                # Blank spreadsheet row: nothing to merge.
                continue
            code = _normalise_code(one_code["code"])
            # Pass the code as an XPath variable so quotes inside it cannot
            # break (or alter) the expression.
            matches = codelist_xml.xpath(
                "/codelist/codelist-items/codelist-item[code/text()=$code]", code=code)
            if not matches:
                raise click.ClickException(
                    "Code '{}' from '{}' was not found in '{}'.".format(
                        code, new_translation_filename, existing_codelist_filename))
            the_code = matches[0]

            if one_code["name"]:
                _set_narrative(the_code, "name", lang, one_code["name"], code)

            if one_code["description"]:
                _set_narrative(the_code, "description", lang, one_code["description"], code)

    indent(codelist_xml.getroot())
    # The serialised document carries no XML declaration, so it must be
    # written as UTF-8 regardless of the platform's default encoding.
    with open(output_filename, 'w', encoding="utf-8") as outf:
        outf.write("{}\n".format(etree.tostring(codelist_xml, encoding="unicode")))


@main.command()
@click.option('--existing_codelist_filename', help="The filename of the existing XML codelists file, e.g. 'Sector.xml'.")
@click.option('--output_filename', help="The output filename of the Excel translations into the desired language.")
@click.option('--lang', help="The language of the translations file in lowercase, e.g. 'fr' for French.")
def generate_translations(existing_codelist_filename, output_filename, lang):
    """Generate an Excel translations file from an existing
    XML file"""
    # NOTE: the docstring above is also the command's --help text, so further
    # documentation lives in comments.
    #
    # The workbook gets one sheet with a header row (code, name, description)
    # followed by one row per codelist item; name and description hold the
    # item's existing ``lang`` narrative, or "" when there is none.
    parser = etree.XMLParser(remove_blank_text=True)
    # Binary mode: the parser receives the raw bytes instead of text decoded
    # with the platform's default encoding.
    with open(existing_codelist_filename, "rb") as codelist_xml_file:
        codelist_xml = etree.parse(codelist_xml_file, parser)

    # The encoding is a Workbook constructor option; save() takes only the
    # destination.
    wb = xlwt.Workbook(encoding="utf-8")
    sheet = wb.add_sheet('Sheet 1')
    for column, header in enumerate(('code', 'name', 'description')):
        sheet.write(0, column, header)

    # These paths depend only on ``lang``, so build them once.
    name_path = 'name/narrative[@xml:lang="{}"]'.format(lang)
    description_path = 'description/narrative[@xml:lang="{}"]'.format(lang)

    items = codelist_xml.xpath("/codelist/codelist-items/codelist-item")
    # Row 0 holds the headers, so data rows start at 1.
    for row, code in enumerate(items, start=1):
        sheet.write(row, 0, get_text(code.find('code')))
        sheet.write(row, 1, get_text(code.find(name_path, namespaces=nsmap)))
        sheet.write(row, 2, get_text(code.find(description_path, namespaces=nsmap)))
    wb.save(output_filename)


# Run the click command group when this file is executed as a script.
if __name__ == '__main__':
    main()