Module:syllables

Revision as of 04:06, 22 August 2023 by TheNightAvl (talk | contribs)

Documentation for this module may be created at Module:syllables/documentation

local export = {}
local getArgs = require('Module:Arguments').getArgs

-- DATA --

-- Set of recognised graphemes (single letters, digraphs and trigraphs), keyed
-- by their lowercase spelling. graphemise() looks lowercased text up here to
-- decide whether a run of letters forms one grapheme.
local grapheme = {
	["a"] = true, ["á"] = true, ["à"] = true, ["â"] = true, ["ả"] = true, ["ai"] = true, ["âi"] = true, ["ao"] = true, ["ào"] = true, ["aoi"] = true, ["au"] = true, ["b"] = true, ["c"] = true, ["d"] = true, ["ð"] = true, ["dx"] = true, ["dz"] = true, ["e"] = true, ["é"] = true, ["è"] = true, ["ea"] = true, ["ei"] = true, ["f"] = true, ["g"] = true, ["h"] = true, ["ħ"] = true, ["i"] = true, ["í"] = true, ["ỉ"] = true, ["ie"] = true, ["ìe"] = true, ["ỉe"] = true, ["iea"] = true, ["j"] = true, ["ĵ"] = true, ["k"] = true, ["ķ"] = true, ["l"] = true, ["m"] = true, ["n"] = true, ["ņ"] = true, ["o"] = true, ["ó"] = true, ["ò"] = true, ["ø"] = true, ["oa"] = true, ["øa"] = true, ["øi"] = true, ["p"] = true, ["q"] = true, ["r"] = true, ["s"] = true, ["ș"] = true, ["t"] = true, ["ts"] = true, ["tș"] = true, ["u"] = true, ["ú"] = true, ["ù"] = true, ["û"] = true, ["ū"] = true, ["ủ"] = true, ["ui"] = true, ["uo"] = true, ["ủo"] = true, ["uoa"] = true, ["v"] = true, ["w"] = true, ["x"] = true, ["y"] = true, ["ỳ"] = true, ["ý"] = true, ["ỷ"] = true, ["ye"] = true, ["ỷe"] = true, ["yea"] = true, ["z"] = true, ["þ"] = true,
	
	["gj"] = true, ["lj"] = true,
}

-- Subset of the graphemes that syllabify() treats as vowels, i.e. as syllable
-- nuclei that receive a "σ" marker. Note that <y> and its variants are listed
-- here, not under the consonants.
local vowel = {
	["a"] = true, ["á"] = true, ["à"] = true, ["â"] = true, ["ả"] = true, ["ai"] = true, ["âi"] = true, ["ao"] = true, ["ào"] = true, ["aoi"] = true, ["au"] = true, ["e"] = true, ["é"] = true, ["è"] = true, ["ea"] = true, ["ei"] = true, ["i"] = true, ["í"] = true, ["ỉ"] = true, ["ie"] = true, ["ìe"] = true, ["ỉe"] = true, ["iea"] = true, ["o"] = true, ["ó"] = true, ["ò"] = true, ["ø"] = true, ["oa"] = true, ["øa"] = true, ["øi"] = true, ["u"] = true, ["ú"] = true, ["ù"] = true, ["û"] = true, ["ū"] = true, ["ủ"] = true, ["ui"] = true, ["uo"] = true, ["ủo"] = true, ["uoa"] = true, ["y"] = true, ["ỳ"] = true, ["ý"] = true, ["ỷ"] = true, ["ye"] = true, ["ỷe"] = true, ["yea"] = true,
}

-- Subset of the graphemes that syllabify() treats as consonants. The nasals
-- and the glides (including <gj> and <lj>) are listed here as well as in their
-- own sets below.
local consonant = {
	["b"] = true, ["c"] = true, ["d"] = true, ["ð"] = true, ["dx"] = true, ["dz"] = true, ["f"] = true, ["g"] = true, ["h"] = true, ["ħ"] = true, ["j"] = true, ["ĵ"] = true, ["k"] = true, ["ķ"] = true, ["l"] = true, ["m"] = true, ["n"] = true, ["ņ"] = true, ["p"] = true, ["q"] = true, ["r"] = true, ["s"] = true, ["ș"] = true, ["t"] = true, ["ts"] = true, ["tș"] = true, ["v"] = true, ["w"] = true, ["x"] = true, ["z"] = true, ["þ"] = true,
	
	["gj"] = true, ["lj"] = true,
}

-- Boundary marks that may stand between graphemes inside a word. When one of
-- these sits directly before a "σ" marker, syllabify() moves the marker over
-- both the mark and the consonant/glide in front of it.
local affix = {
	["·"] = true, ["-"] = true, ["’"] = true, ["‘"] = true,
}

-- Nasal consonants; used by syllabify() for the "C Cₙ ↔ σ N" onset rule.
local nasal = {
	["m"] = true, ["n"] = true, ["ņ"] = true, 
}
		
-- Glides; syllabify() pulls a glide that directly precedes a vowel into that
-- vowel's syllable, and lets a consonant + glide pair start a syllable.
local glide = {
	["j"] = true, ["ĵ"] = true, ["w"] = true,
	
	["gj"] = true, ["lj"] = true,
}

-- The sets below list which consonant may open a two-consonant onset before a
-- given second consonant. syllabify() consults them when deciding whether the
-- "σ" marker may move one consonant further to the left.

-- Unlike the other sets, Cv_fixed is applied without requiring another
-- consonant in front of the pair (the "Bv ↔ σ v" rule).
local Cv_fixed = { -- <Cv> combinations that are uniformly pronounced without /v/ --
	["b"] = true,
	["f"] = true,
	["p"] = true, 
}

local Cr = { -- consonants that can precede /r/ in a medial onset --
	["p"] = true, ["k"] = true, ["b"] = true, ["g"] = true, 
	
	["ð"] = true,
	["f"] = true, ["ħ"] = true, ["h"] = true,
	["v"] = true,
	["þ"] = true, 
	
	["t"] = true, ["d"] = true,
}

local Cl = { -- consonants that can precede /l/ in a medial onset --
	["p"] = true, ["k"] = true, ["b"] = true, ["g"] = true, 
	
	["ð"] = true,
	["f"] = true, ["ħ"] = true, ["h"] = true,
	["v"] = true,
	["þ"] = true, 
	
	["s"] = true, ["ș"] = true, ["z"] = true, ["x"] = true, 
	
}

-- NOTE(review): "dʒ" has no entry in the `grapheme` table, so graphemise()
-- would not emit it as a single grapheme and this key presumably never
-- matches — confirm whether <dx> (or another spelling) was intended.
local Cv = { -- consonants that can precede /v/ in a medial onset --
	["d"] = true, ["ð"] = true, ["dz"] = true,
	["dʒ"] = true, ["g"] = true, ["ħ"] = true, ["h"] = true,
	["k"] = true, ["l"] = true, ["r"] = true, ["s"] = true, ["ș"] = true,
	["t"] = true, ["c"] = true, ["ķ"] = true, ["ts"] = true, ["tș"] = true, 
	["z"] = true, ["x"] = true, ["þ"] = true, ["m"] = true, ["n"] = true, 
}

local CN = { -- consonants that can precede nasals in a medial onset -- (non-glide continuants)
	["ð"] = true,
	["f"] = true, ["ħ"] = true, ["h"] = true,
	["l"] = true, ["r"] = true, ["s"] = true, ["ș"] = true,
	["v"] = true,
	["z"] = true, ["x"] = true, ["þ"] = true, 
}

-- Irregular spellings and the grapheme split to use for them. Keys are
-- lowercase. graphemise() looks up the lowercased text running from some
-- position to the end of the still-unprocessed word, so an entry matches only
-- when that whole remainder equals the key. The pieces of each split,
-- concatenated, spell the key letter for letter.
local irregular = {
	["eurú"] = {"eu", "r", "ú"},
	["eurov"] = {"eu", "r", "o", "v"},
}

-- Graphemes that appear only in the irregular splits above and have no entry
-- in the main `grapheme` table.
local irregular_grapheme = { "eu", }
-- Those irregular graphemes that count as vowels; they have no entry in the
-- main `vowel` table.
local irregular_vowel = { "eu", }

-- Returns a copy of an irregular grapheme split with the capitalisation of the
-- text that was actually typed.
--   pieces       lowercase grapheme list taken from `irregular`
--   orig_string  the matched text as typed
--   check_string the lowercased form of orig_string
-- A fresh list is returned so the shared `irregular` table is never modified;
-- writing the capitalised pieces back into it would leak one word's capitals
-- into every later lookup of the same spelling.
local function capitalise_irregular(pieces, orig_string, check_string)
	-- Record which character positions are capitalised in the typed text.
	-- Lengths are counted in characters (mw.ustring.len), not bytes (#), so the
	-- positions line up with mw.ustring.sub for non-ASCII letters.
	local capitals = {}
	for j = 1, mw.ustring.len(orig_string) do
		if mw.ustring.sub(orig_string, j, j) == mw.ustring.upper(mw.ustring.sub(check_string, j, j)) then
			capitals[j] = true
		end
	end

	-- Rebuild every piece letter by letter; `index` is the running character
	-- position within the whole matched text.
	local result = {}
	local index = 1
	for j, piece in ipairs(pieces) do
		local letters = {}
		for k = 1, mw.ustring.len(piece) do
			local letter = mw.ustring.sub(piece, k, k)
			if capitals[index] then
				letter = mw.ustring.upper(letter)
			end
			letters[k] = letter
			index = index + 1
		end
		result[j] = table.concat(letters)
	end

	return result
end

-- Splits `word` into a list of graphemes, preserving the original
-- capitalisation.
--
-- The word is consumed from its end: on each pass the longest suffix of the
-- remaining text that is an irregular spelling (see `irregular`) or a known
-- grapheme (see `grapheme`) is cut off and placed at the front of the result.
-- A single character that is not a known grapheme is kept as a grapheme of its
-- own, so any text can be processed.
--
-- Returns an array of strings in word order. Progress is written to mw.log.
--
-- NOTE(review): this function is deliberately left global so that its visible
-- name does not change; consider making it local if nothing outside this
-- module relies on it.
function graphemise(word)
	mw.log("————— GRAPHEMISING —————")

	local graphemes = {}

	while mw.ustring.len(word) > 0 do

		local limit = mw.ustring.len(word)

		-- i = 1 tries the whole remaining text, i = limit its last character.
		for i = 1, limit do
			local orig_string = mw.ustring.sub(word, i)
			local check_string = mw.ustring.lower(orig_string)
			local pieces = irregular[check_string]

			if pieces then
				mw.log("Irregular spelling recognised: " .. orig_string)

				-- amend irregular data to match capitals (on a copy) --
				local cased = capitalise_irregular(pieces, orig_string, check_string)

				-- Prepend in reverse so the pieces end up in reading order.
				for j = #cased, 1, -1 do
					table.insert(graphemes, 1, cased[j])
					mw.log("<" .. cased[j] .. "> logged.")
				end
				word = mw.ustring.sub(word, 1, i - 1)
				break
			elseif grapheme[check_string] or i == limit then
				-- Known grapheme, or a lone final character that is accepted
				-- as-is; the latter guarantees that every pass consumes text.
				table.insert(graphemes, 1, orig_string)
				mw.log("<" .. orig_string .. "> logged.")
				word = mw.ustring.sub(word, 1, i - 1)
				break
			end

		end

	end

	mw.log("String exhausted.")

	mw.log("<" .. table.concat(graphemes, "><") .. ">")
	return graphemes
end

-- Marks syllable boundaries in a grapheme list by inserting "σ" elements.
--
-- Pass 1 puts a "σ" directly before every vowel grapheme. Pass 2 walks the
-- list from the end to the start and, for each "σ", applies the first matching
-- rule of the chain below to move the marker leftwards over the consonants
-- (and affix marks) that belong to the onset of the syllable. Because the
-- index then steps left, a marker that has just been moved is examined again
-- in its new position, so several rules can apply to one marker in sequence.
-- A marker that no rule moves is deleted when it would stand at the start of
-- the word or directly after an unrecognised symbol.
--
-- `graphemes` is modified in place and also returned. Comparisons are done on
-- lowercased graphemes; the stored graphemes keep their capitalisation.
--
-- The vowels listed in `irregular_vowel` are honoured through a per-call
-- lookup. The shared `grapheme` and `vowel` tables are intentionally NOT
-- modified: adding entries to them here would change how graphemise() splits
-- every word processed afterwards in the same Lua state.
--
-- NOTE(review): this function is deliberately left global so that its visible
-- name does not change; consider making it local if nothing outside this
-- module relies on it.
function syllabify(graphemes)
	mw.log("————— SYLLABIFYING —————")

	local extra_vowel = {}
	for _, g in ipairs(irregular_vowel) do
		extra_vowel[g] = true
	end

	-- True for regular and irregular vowel graphemes; `g` may be nil.
	local function is_vowel(g)
		return vowel[g] or extra_vowel[g]
	end

	-- Lowercased grapheme at `pos`, or nil when `pos` is outside the list.
	local function lower_at(pos)
		local g = graphemes[pos]
		return g and mw.ustring.lower(g)
	end

	local function swap(Pos1, Pos2)
		mw.log(graphemes[Pos1] .. " ↔ " .. graphemes[Pos2] .. " at position " .. Pos1)
		graphemes[Pos1], graphemes[Pos2] = graphemes[Pos2], graphemes[Pos1]
	end

	-- Pass 1: add σ before each vowel. The list grows while it is scanned, so
	-- the index skips over the marker and the vowel after an insertion.
	local pos = 1
	while graphemes[pos] ~= nil do
		if is_vowel(mw.ustring.lower(graphemes[pos])) then
			table.insert(graphemes, pos, "σ")
			mw.log("σ inserted in position " .. pos)
			pos = pos + 2
		else
			pos = pos + 1
		end
	end

	-- Pass 2: right-to-left adjustment of every marker. The order of the rules
	-- matters: only the first matching one is applied per step.
	for i = #graphemes, 1, -1 do
		local g_current = lower_at(i)
		local g_after = lower_at(i + 1)
		local g_prev = lower_at(i - 1)
		local g_prev2 = lower_at(i - 2)
		local g_prev3 = lower_at(i - 3)

		-- J ↔ σ
		if glide[g_prev] and g_current == "σ" and is_vowel(g_after) then
			swap(i, i-1)

		-- (J · ) ↔ σ
		elseif glide[g_prev2] and affix[g_prev] and g_current == "σ" and is_vowel(g_after) then
			swap(i, i-1)
			swap(i-1, i-2)

		-- C ↔ σ (J)
		elseif consonant[g_prev] and g_current == "σ" and (is_vowel(g_after) or glide[g_after]) then
			swap(i, i-1)

		elseif consonant[g_prev2] and affix[g_prev] and g_current == "σ" and (is_vowel(g_after) or glide[g_after]) then
			swap(i, i-1)
			swap(i-1, i-2)

		-- C Cᵥ ↔ σ v
		elseif consonant[g_prev2] and Cv[g_prev] and g_current == "σ" and g_after == "v" then
			swap(i, i-1)

		elseif consonant[g_prev3] and Cv[g_prev2] and affix[g_prev] and g_current == "σ" and g_after == "v" then
			swap(i, i-1)
			swap(i-1, i-2)

		-- Bv ↔ σ v
		elseif Cv_fixed[g_prev] and g_current == "σ" and g_after == "v" then
			swap(i, i-1)

		elseif Cv_fixed[g_prev2] and affix[g_prev] and g_current == "σ" and g_after == "v" then
			swap(i, i-1)
			swap(i-1, i-2)

		-- C Cₙ ↔ σ N
		elseif consonant[g_prev2] and CN[g_prev] and g_current == "σ" and nasal[g_after] then
			swap(i, i-1)

		elseif consonant[g_prev3] and CN[g_prev2] and affix[g_prev] and g_current == "σ" and nasal[g_after] then
			swap(i, i-1)
			swap(i-1, i-2)

		-- C Cᵣ ↔ σ r
		elseif consonant[g_prev2] and Cr[g_prev] and g_current == "σ" and g_after == "r" then
			swap(i, i-1)

		elseif consonant[g_prev3] and Cr[g_prev2] and affix[g_prev] and g_current == "σ" and g_after == "r" then
			swap(i, i-1)
			swap(i-1, i-2)

		-- C Cₗ ↔ σ l
		elseif consonant[g_prev2] and Cl[g_prev] and g_current == "σ" and g_after == "l" then
			swap(i, i-1)

		elseif consonant[g_prev3] and Cl[g_prev2] and affix[g_prev] and g_current == "σ" and g_after == "l" then
			swap(i, i-1)
			swap(i-1, i-2)

		-- No rule moved the marker: decide whether it should exist at all.
		elseif g_current == "σ" then
			if i == 1 then
				-- A marker at the very start separates nothing.
				table.remove(graphemes, 1)
			else
				-- Look leftwards from the marker.
				for j = 1, i-1 do
					local earlier = graphemes[i - j]
					if is_vowel(mw.ustring.lower(earlier)) or earlier == "σ" then
						-- An earlier syllable exists: keep the marker.
						break
					elseif i - j == 1 then
						-- Reached the start through consonants/affix marks only.
						table.remove(graphemes, i)
						break
					elseif not affix[earlier] and not consonant[mw.ustring.lower(earlier)] then
						-- Unrecognised symbol before any vowel: drop the marker.
						table.remove(graphemes, i)
						break
					end
				end
			end

		end
	end

	return graphemes

end


-- Entry point for #invoke: returns the word with its syllable boundaries
-- marked by a divider.
--   args[1]  the word to syllabify (required; raises "Word needed" if absent)
--   args[2]  the divider to place at each syllable boundary (default "|")
-- Returns a single string.
function export.generate(frame)
	local args = getArgs(frame)

	if args[1] == nil then
		error("Word needed")
	end

	local syllables = syllabify(graphemise(args[1]))

	local divider = args[2]
	if divider == nil then
		divider = "|"
	end

	-- The divider is used as a gsub replacement, where "%" is an escape
	-- character ("%1" is a capture, a lone "%" is an error). Double every "%"
	-- so that a caller-supplied divider is always inserted literally.
	if type(divider) == "string" then
		divider = (string.gsub(divider, "%%", "%%%%"))
	end

	-- gsub also returns the match count; keep only the resulting string.
	local joined = table.concat(syllables)
	local result = mw.ustring.gsub(joined, "(σ)", divider)

	return result

end

-- Same analysis as export.generate, but hands back the raw list instead of a
-- joined string: the graphemes of args[1] in order, with a "σ" element at
-- each syllable boundary. Raises "Word needed" when no word is supplied.
function export.generate_array(frame)
	local word = getArgs(frame)[1]

	if word == nil then
		error("Word needed")
	end

	local graphemes = graphemise(word)
	local syllables = syllabify(graphemes)

	return syllables

end

return export

--[[
Debug console test string:
=p.generate(mw.getCurrentFrame():newChild{title="whatever",args={"rjaovs"}})
]]