Published

January 16, 2025

Postprocessing raw extracts

using LexiconMining
# Gather every entry under `trancheroot` whose name begins with "tranche",
# turn each into a full path, and hand the list of paths to `readdata`.
# NOTE(review): judging from the output shown below, `data` is a vector of
# named tuples (seq, urn, lemma, definition, pos, morphology) and `errs` is a
# vector of .cex file paths, presumably the files that could not be parsed;
# confirm against the LexiconMining documentation.
tranchedirs = [d for d in readdir(trancheroot) if startswith(d, "tranche")]
tranchepaths = joinpath.(trancheroot, tranchedirs)
(data, errs) = readdata(tranchepaths)
(Any[(seq = 1, urn = "urn:cite2:hmt:ls.markdown:n0", lemma = "a", definition = "first letter of the latin alphabet", pos = "uninflected", morphology = "uninflected"), (seq = 2, urn = "urn:cite2:hmt:ls.markdown:n1", lemma = "a", definition = "from, by", pos = "preposition", morphology = "ablative"), (seq = 3, urn = "urn:cite2:hmt:ls.markdown:n2", lemma = "ā", definition = "ah, an exclamation", pos = "interjection", morphology = "uninflected"), (seq = 4, urn = "urn:cite2:hmt:ls.markdown:n3", lemma = "ăărōn", definition = "aaron, brother of moses", pos = "noun", morphology = "ăărōn,ăărōnis,m"), (seq = 5, urn = "urn:cite2:hmt:ls.markdown:n4", lemma = "ăb", definition = "from; by", pos = "preposition", morphology = "ablative"), (seq = 6, urn = "urn:cite2:hmt:ls.markdown:n5", lemma = "aba", definition = "see aga", pos = "uninflected", morphology = "uninflected"), (seq = 7, urn = "urn:cite2:hmt:ls.markdown:n6", lemma = "ababus", definition = "false reading in inscriptions", pos = "noun", morphology = "ababus, abavi, masculine"), (seq = 8, urn = "urn:cite2:hmt:ls.markdown:n7", lemma = "ăbactor", definition = "one who drives off, thief", pos = "noun", morphology = "abactor, abactoris, masculine"), (seq = 9, urn = "urn:cite2:hmt:ls.markdown:n8", lemma = "ăbactus", definition = "driven away", pos = "adjective", morphology = "abactus, abacta, abactum"), (seq = 10, urn = "urn:cite2:hmt:ls.markdown:n9", lemma = "ăbactus", definition = "a driving away, robbing", pos = "noun", morphology = "ăbactus, ăbactūs, masculine")  …  (seq = 51587, urn = "urn:cite2:hmt:ls.markdown:n51584", lemma = "zoster", definition = "a promontory, town, and harbor in attica", pos = "noun", morphology = "zoster, zosteris, masculine"), (seq = 51588, urn = "urn:cite2:hmt:ls.markdown:n51585", lemma = "zōthēca", definition = "a small private chamber or recess", pos = "noun", morphology = "zōthēca, zōthēcae, feminine"), (seq = 51589, urn = "urn:cite2:hmt:ls.markdown:n51586", lemma = "zōthēcŭla", definition = 
"a little closet or cabinet", pos = "noun", morphology = "zōthēcŭla, zōthēcŭlae, feminine"), (seq = 51590, urn = "urn:cite2:hmt:ls.markdown:n51587", lemma = "zura", definition = "the seed of the christ's-thorn", pos = "noun", morphology = "zura, zurae, f"), (seq = 51591, urn = "urn:cite2:hmt:ls.markdown:n51588", lemma = "zŭgĭa", definition = "a tree (horn-beam); a nuptial flute", pos = "noun", morphology = "zŭgĭa, zŭgĭae, f."), (seq = 51592, urn = "urn:cite2:hmt:ls.markdown:n51589", lemma = "zygia", definition = "a name of juno, goddess of marriage", pos = "noun", morphology = "zygia,zygiae,f."), (seq = 51593, urn = "urn:cite2:hmt:ls.markdown:n51590", lemma = "zŭgis", definition = "wild thyme", pos = "noun", morphology = "zŭgis, zŭgidis, feminine"), (seq = 51594, urn = "urn:cite2:hmt:ls.markdown:n51591", lemma = "zŭgostăsĭum", definition = "the office of a weigh-master", pos = "noun", morphology = "zŭgostăsĭum, zŭgostăsĭi, neuter"), (seq = 51595, urn = "urn:cite2:hmt:ls.markdown:n51592", lemma = "zŭgostătes", definition = "a master of the weights, weigh-master", pos = "noun", morphology = "zugostates, zugostatis, masculine"), (seq = 51596, urn = "urn:cite2:hmt:ls.markdown:n51593", lemma = "zythum", definition = "a kind of malt-liquor", pos = "noun", morphology = "zythum, zythi, neuter")], Any["/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche0/n154.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche0/n173.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche0/n298.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche0/n478.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche0/n59.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche0/n605.cex", 
"/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche0/n662.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche0/n940.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche0/n974.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche0/n982.cex"  …  "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche8/n8587.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche8/n8604.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche8/n8615.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche8/n8651.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche8/n8681.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche8/n8772.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche9/n9154.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche9/n9203.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche9/n9393.cex", "/Users/nsmith/Desktop/s2025/LexiconMining.jl/suarez/lewisshort-extracts/extracts-cycle2/tranche9/n9931.cex"])

What fraction of the extracts were parseable?

# Share of parsed records among everything attempted: the number of entries
# in `data` divided by the combined size of `data` and `errs`.
let nparsed = length(data)
    nparsed / (nparsed + length(errs))
end
0.988599737376149

Cleaning up syntax

  • globally replace stray “`” characters (they turn up inside values, e.g. the pos value “`preposition`”)

Normalizing values

using StatsBase, OrderedCollections

# Pull the `pos` field out of every record, tally how often each distinct
# label occurs, and order the tally from most to least frequent.
posvalues = [tpl.pos for tpl in data]
poscounts = sort(OrderedDict(countmap(posvalues)); byvalue=true, rev=true)
OrderedDict{String, Int64} with 153 entries:
  "noun"                     => 25287
  "adjective"                => 10769
  "verb (compound)"          => 7242
  "adverb"                   => 2388
  "participle"               => 1420
  "crossreference"           => 890
  "verb"                     => 549
  "uninflected"              => 504
  "interjection"             => 60
  "pronoun"                  => 54
  "participle and adjective" => 48
  "conjunction"              => 43
  "preposition"              => 40
  "adjective and noun"       => 40
  "adv."                     => 27
  "n/a"                      => 24
  "participle (compound)"    => 21
  "adverb and preposition"   => 17
  ""                         => 14
  ⋮                          => ⋮
# List every distinct pos label, one per line, in descending order of frequency
# (the key order of `poscounts`).
println(join(keys(poscounts), "\n"))
noun
adjective
verb (compound)
adverb
participle
crossreference
verb
uninflected
interjection
pronoun
participle and adjective
conjunction
preposition
adjective and noun
adv.
n/a
participle (compound)
adverb and preposition

adv
adjective, noun
verb (frequentative)
verb (inchoative)
-
adj
cross-reference
numeral
adjective; noun
numeral adverb
v.
verb impersonal
verb (impersonal)
adverb/conjunction
adjective,noun
verb(compound)
verb (defective)
reference
part. and p. a.
part.
verb, (compound)
noun and adjective
adverb/preposition
adjective/noun
numeral adjective
interj.
v
participle, adjective
participle, verb (compound)
—
cardinal numeral
adj.
proper noun
adverb, preposition
cardinal number
verb, deponent (compound)
verb (intransitive)
adv. and prep.
see aegilops
gnĭdus
adjective, noun, adverb
participle and past participle
adv. and conj.
participial adjective
maxilla
participle subst.
quercerus
brāchīle
pronoun demonstrative
de and meio
participle (from figo)
indeclinable noun
dŭ
participle (substantive)
participle (adjective)
turpio
verb (intransitive), (compound)
numeral distributive
verb (neutral pass.)
part./p. a.
part. and adj.
verb (intransitive, compound)
particle
insicium
participle/substantive
bătĕnim
participle (from inflecto)
hirquitallus
part. and p. a. (compound)
"uninflected"
pronoun reflexive
spegma
--
participle and perfect adjective
servaculum
conjunction and adverb
adj., noun
adjective (comparative)
lampsăna
adj., adv., prep.
indecl. noun
participle and pseudo-adjective
referenced
verb (irregular)
baritus
participle (verb-derived)
trĭvĭa
trībŭla
sŭperscendo
prefix
participle (from verb)
i
nītēla
participle and noun
participle (from defungor)
xref
verb, frequentative (compound)
participle and substantive
concessive conditional particle
participle (from flecto)
spintrĭa
distributive numeral
nablia
adjective & noun
ăquīlīcĭum
adv., prep.
root
septĭtĭānus
adverb, conjunction
verb (imperative, compound)
crīmessus
participle, compound
"verb (compound)"
onomatopoeia
personal pronoun
verb, transitive (compound)
adjective (numeral)
conivola
aegyptīni
participle (derived from verb)
adv./prep.
pronoun, adverb
adj. and adv.
crossreference to
`preposition`
adjective and substantive
adjective or noun
pronoun possessive
pŭrum
pronoun (demonstrative)
verb, 3rd conjugation (compound)
participle/adjective
adj./n.
adj, subst

Unify various forms of terminology

E.g., “adv.” -> “adverb”

What fraction had immediately usable values for pos (i.e., one of the ten most frequent labels)?

# Combined count of the ten most frequent pos labels (`poscounts` is sorted by
# descending count), divided by the combined size of `data` and `errs`.
# NOTE(review): the cutoff of 10 is positional; in the listing above the top
# ten include "crossreference" and "participle" — confirm these are meant to
# count as usable values.
let counts = collect(values(poscounts))
    sum(counts[1:10]) / (length(data) + length(errs))
end
0.9781345748279018

Remove articles not referring to lexemes


# Keep only the records whose pos is neither "crossreference" nor "participle",
# then collect the pos label of each remaining record.
lexical = filter(tpl -> tpl.pos ∉ ("crossreference", "participle"), data)
lexposvalues = [lex.pos for lex in lexical]
47379-element Vector{String}:
 "uninflected"
 "preposition"
 "interjection"
 "noun"
 "preposition"
 "uninflected"
 "noun"
 "noun"
 "adjective"
 "noun"
 ⋮
 "noun"
 "noun"
 "noun"
 "noun"
 "noun"
 "noun"
 "noun"
 "noun"
 "noun"
# Frequency table of pos labels for the retained records, most frequent first.
lexposcounts = sort(
    countmap(lexposvalues) |> OrderedDict;
    byvalue=true,
    rev=true,
)
OrderedDict{String, Int64} with 151 entries:
  "noun"                     => 25287
  "adjective"                => 10769
  "verb (compound)"          => 7242
  "adverb"                   => 2388
  "verb"                     => 549
  "uninflected"              => 504
  "interjection"             => 60
  "pronoun"                  => 54
  "participle and adjective" => 48
  "conjunction"              => 43
  "preposition"              => 40
  "adjective and noun"       => 40
  "adv."                     => 27
  "n/a"                      => 24
  "participle (compound)"    => 21
  "adverb and preposition"   => 17
  ""                         => 14
  "adv"                      => 12
  "adjective, noun"          => 12
  ⋮                          => ⋮

What fraction of these have good values (i.e., one of the eight most frequent labels)?


# Counts per pos label, in the descending order established by `lexposcounts`.
lexcounts = collect(values(lexposcounts))

# Share of the retained records that carry one of the eight most frequent labels.
# NOTE(review): the cutoff of 8 is positional; in the listing above,
# "conjunction" and "preposition" rank below "participle and adjective" and are
# therefore not counted — confirm that this is intended.
sum(@view lexcounts[1:8]) / sum(lexcounts)
0.9888980349944068