LIFT Utils documentation

LIFT Utils is a Python library for manipulating linguistic lexicon data in the XML-based LIFT format.

Get basic lexicon details:

>>> from lift_utils import Lexicon
>>> sg_lex = Lexicon("~/lift/sango LIFT export/sango LIFT export.lift")
>>> print(len(sg_lex.entry_items))
3479

Get details about entries and their senses:

>>> print(sg_lex.entry_items[0])
kêtê ngû sô asua pë…    kêtê ngû sô asua pëpe_000d9e27-d103-4601-a4b1-6c3ee5ef4c01
>>> print(sg_lex.entry_items[0].sense_items[0])
pool                    Noun            27bb9896-e413-4c14-8cdc-cbe26ecbc148
>>> sg_lex.show()  # list all lexicon entries, sorted by lexical-unit
# ...
zo wa (sg)              zo wa_be18403e-c45d-4fb9-9d5a-35d26bf0b883
zôâ (sg)                zôâ_d10bd14f-eed6-42c9-bbfb-e0f22c5acd8f
zonga (sg)              zonga_0c9f35f3-3a3f-4660-aa2b-55a701cf2c68
zonyön (sg)             zonyön_8fbfa01d-c5e8-4a4f-b848-013f601de4d4
zöröndö (sg)            zöröndö_5d26b3de-26cb-42d2-86ee-33e018ec88c7
zovokö (sg)             zovokö_3e959588-8541-411d-aba5-4a00d39d66ba
zovukö (sg)             zovukö_cc8ad26d-3d79-4249-9c80-1af9fd2185c6
zozo (sg)               zozo_72b4fcfe-3112-4f92-aee7-248d708b6bd4
zû (sg)                 zû_ed13b0f7-24b4-4233-a6ae-a1e6f0060416
zû na sêse (sg)         zû na sêse_8e32a3f1-1db7-4fdc-a0f1-5c5ccb7c7ca8
zûâ (sg)                zûâ_115eccf6-19af-439f-b134-07ae1b11b51f
zûku (sg)               zûku_51a7b1be-cb53-40c7-8b0e-fde1ab9cc3f0
zûku li (sg)            zûku li_1a258482-1d61-44f4-b7a9-24f4bad575cc
zûsuka (sg)             zûsuka_8c5e481c-8811-482e-bff0-126282c829ab
zûu (sg)                zûu_137d7f37-ceff-464a-9d7f-00febbcfd439
>>> item = sg_lex.get_item_by_id('zo wa_be18403e-c45d-4fb9-9d5a-35d26bf0b883')
>>> type(item)
<class 'lift_utils.lexicon.Entry'>
>>> item.show()
xml_tag: entry
date_created: 2017-02-25T11:03:30Z
date_modified: 2022-05-07T14:15:56Z
field_items: None
trait_items: [<lift_utils.base.Trait object at 0x7fd777f4cbb0>]
annotation_items: None
id: zo wa_be18403e-c45d-4fb9-9d5a-35d26bf0b883
guid: be18403e-c45d-4fb9-9d5a-35d26bf0b883
order: None
date_deleted: None
lexical_unit: zo wa (sg)
citation: None
pronunciation_items: None
variant_items: None
sense_items: [<lift_utils.lexicon.Sense object at 0x7fd777f48fd0>]
note_items: None
relation_items: [<lift_utils.lexicon.Relation object at 0x7fd777f4c130>]
etymology_items: None

Search the lexicon:

>>> sg_lex.find('house')  # 'find' returns first item whose glosses contain the search term
<lift_utils.lexicon.Sense object at 0x7f3689d9e5b0>
>>> len(sg_lex.find_all('house'))  # 'find_all' returns a list of all matching items
13
>>> len(sg_lex.find_all('Noun', field='grammatical-info'))  # search other fields as well
2357

Edit the lexicon:

>>> new_entry = sg_lex.add_entry()
>>> new_entry.show()
date_created: 2024-06-14T12:00:47Z
date_modified: None
field_items: None
trait_items: None
annotation_items: None
id: bb11d621-8b09-4c2f-b572-2d2f3e90f9e1
guid: None
order: None
date_deleted: None
lexical_unit: None
citation: None
pronunciation_items: None
variant_items: None
sense_items: None
note_items: None
relation_items: None
etymology_items: None
>>> new_entry.set_lexical_unit({'sg': 'fîni yê'})
>>> new_entry.show()
date_created: 2024-06-14T12:00:47Z
date_modified: 2024-06-14T12:08:04Z
field_items: None
trait_items: None
annotation_items: None
id: bb11d621-8b09-4c2f-b572-2d2f3e90f9e1
guid: None
order: None
date_deleted: None
lexical_unit: fîni yê (sg)  # <-- updated lexical_unit
citation: None
pronunciation_items: None
variant_items: None
sense_items: None
note_items: None
relation_items: None
etymology_items: None
>>> new_sense = new_entry.add_sense()
>>> new_sense.show()
date_created: 2024-06-14T14:32:36Z
date_modified: 2024-06-14T14:33:51Z
field_items: None
trait_items: None
annotation_items: None
id: 0dc5ccdb-01c3-4af0-879f-73c7fd79bb35
order: None
grammatical_info: None
gloss_items: None
definition: None
relation_items: None
note_items: None
example_items: None
reversal_items: None
illustration_items: None
subsense_items: None
>>> for e in sg_lex.get_range_elements('grammatical-info'):
...     print(e)
...
Adverb
Noun
# ...
>>> new_sense.set_grammatical_info('Noun')
>>> new_sense.show()
date_created: 2024-06-14T14:32:36Z
date_modified: 2024-06-14T14:33:51Z
field_items: None
trait_items: None
annotation_items: None
id: 0dc5ccdb-01c3-4af0-879f-73c7fd79bb35
order: None
grammatical_info: Noun  # <-- updated grammatical_info
gloss_items: None
definition: None
relation_items: None
note_items: None
example_items: None
reversal_items: None
illustration_items: None
subsense_items: None

Compare data across multiple lexicons:

>>> import multiprocessing as mp
>>> from tabulate import tabulate  # pip install tabulate
>>> cawls = [f"{n:04d}" for n in range(1, 1701)]  # CAWL numbers, 0001 to 1700
>>> def lus_from_lift(lift, cawls):  # prepare multiprocess function
...     lex = Lexicon(lift)
...     lexical_units = []
...     for cawl in cawls:
...         lexical_unit = None
...         item = lex.find(cawl, field='CAWL', match_type='exact')
...         if item and item.id:
...             parent = item.parent_item
...             if parent:
...                 lexical_unit = parent.lexical_unit
...         lexical_units.append(lexical_unit)
...     return lexical_units
...
>>> lifts = [
...     '~/lift/Bhogoto FLEx LIFT export/Bhogoto FLEx LIFT export.lift',
...     '~/lift/Gbagiri FLEx LIFT export/FLEx LIFT export.lift',
...     '~/lift/Gbanu FLEx LIFT export/FLEx LIFT export.lift',
... ]
>>> with mp.Pool(3) as p:  # use multiprocessing to handle 3 files at once
...     lus_by_lex = p.starmap(lus_from_lift, ((lift, cawls) for lift in lifts))
...
>>> table = zip(cawls, *lus_by_lex)  # convert "columns" to "rows" for table
>>> print(tabulate(table))  # shows 4 columns: CAWL#, Bhogoto, Gbagiri, Gbanu
----  -----------------------------------------  ---------------  ---------------------------
0001  tɛɛ (bdt-CF)                                                tɛ (gbv)
0002  ndara (bdt-CF) (3 forms)                   ndara (gbv)      ndara te wire (gbv)
0003  zu (bdt-CF) (3 forms)                      zu (gbv)         zu (gbv)
0004  baŋge (bdt-CF)                                              ngɔri (gbv)
0005  dɔɔti ri (bdt-CF)                          ri (gbv)         ri (gbv)
# ...
1696  demɔ (bdt-CF) (3 forms)
1697  mɔ nɛ de nɛ (bdt-CF)                                        dɛ ɗãa tom (gbv)
1698  eyɛ (bdt-CF)
1699  ɛ̧ɛ (bdt-CF)
1700  hoo hoo (bdt-CF)
----  -----------------------------------------  ---------------  ---------------------------

Roadmap

Release

Feature

v0.1

read support for LIFT files

v0.2

write support for LIFT files

v0.3

add helper methods to facilitate lexicon searching and manipulation

Indexes