-- Documentation for this module may be created at မော်ဂျူး:Hani-sortkey/doc

-- Table of public functions; it is what this module returns.
local export = {}

-- Short alias for the Unicode-aware codepoint function.
local cp = mw.ustring.codepoint
-- Namespace text of the current page, read once at load time; log() compares it with "Module".
local namespace = mw.title.getCurrentTitle().nsText
-- Serialized sortkey data; the code below treats it as one string (find/sub/match).
local m_data = require("Module:Hani-sortkey/data/serialized")
-- Position of the first "\255" in that string. Fixed-width records are read
-- from after this position by export.getData.
local main_offset = m_data:find("\255")
-- Everything before the "\255" marker; export.makeSortKey searches it for
-- per-character replacements before building the sortkey.
local preconvert_data = m_data:sub(1, main_offset - 1)

-- Presumably escapes Lua pattern magic characters so a character can be
-- embedded literally in a pattern — TODO confirm against Module:utilities.
local pe = require("Module:utilities").pattern_escape
-- Short alias for the Unicode-aware substring function (indices count characters, not bytes).
local substring = mw.ustring.sub

-- Passes its arguments on to mw.log, but only when the current page is in the
-- "Module" namespace; on any other page the call is a no-op.
local function log(...)
	if namespace ~= "Module" then
		return
	end
	mw.log(...)
end

--[[
	Maps every supported ideographic description character (IDC) to the
	number of components (single characters or nested ideographic description
	sequences) that have to follow it.
]]
local IDchars = {}

-- IDCs that are followed by two components.
for _, idc in ipairs({ "⿰", "⿱", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻" }) do
	IDchars[idc] = 2
end

-- IDCs that are followed by three components.
for _, idc in ipairs({ "⿲", "⿳" }) do
	IDchars[idc] = 3
end

--[[
-- in future perhaps: https://www.unicode.org/L2/L2018/18012-irgn2273-four-new-idcs.pdf
IDchars[mw.ustring.char(0x2FFC)] = 2
IDchars[mw.ustring.char(0x2FFD)] = 2
IDchars[mw.ustring.char(0x2FFE)] = 1
IDchars[mw.ustring.char(0x2FFF)] = 1
--]]

--[[
	Finds the end of the ideographic description sequence (IDS) that starts
	with the ideographic description character (IDC) at character index i.

	Parameters:
		text   - the string being scanned (indices count characters).
		IDchar - the IDC found at index i; its entry in IDchars says how many
		         components must follow.
		i      - character index of that IDC in text.

	Returns the character index of the last character of the IDS, or nil when
	an argument is missing, IDchar is not a known IDC, or the text ends before
	all expected components (including those of nested sequences) were found.

	Recurses whenever a component is itself introduced by an IDC.
]]
local function findEndOfIDS(text, IDchar, i)
	if not (text and IDchar and i) then
		return nil
	end

	-- Number of components expected after the current IDC.
	local components = IDchars[IDchar]
	if not components then
		-- Not an IDC: there is no sequence to measure.
		return nil
	end

	local j = i
	for _ = 1, components do
		j = j + 1

		local char = substring(text, j, j)

		if char == "" then
			-- Ran off the end of the text before every component was seen.
			return nil
		elseif IDchars[char] then
			-- The component is a nested IDS; jump to its last character.
			j = findEndOfIDS(text, char, j)
			if not j then
				-- An incomplete nested sequence makes the whole sequence
				-- incomplete. Stopping here also avoids doing arithmetic on
				-- a nil index in the next iteration.
				return nil
			end
		end
	end

	-- Every expected component was found; j is the index of the last one.
	return j
end

-- NOTE(review): nothing in this file as shown reads or writes this table — confirm whether it is still needed.
local module_cache = {}

-- The sortkey modules handle four sets of codepoints. The first set runs from [[Module:Hani-sortkey/data/001]] to [[Module:Hani-sortkey/data/056]]; then there is a gap of 23,055 codepoints. The second set is entirely contained in [[Module:Hani-sortkey/data/056]]; then there is a gap of 67,031 codepoints. The third set runs from [[Module:Hani-sortkey/data/057]] to [[Module:Hani-sortkey/data/177]]; then there is a gap of 5,152 codepoints. The fourth set runs from [[Module:Hani-sortkey/data/178]] to [[Module:Hani-sortkey/data/196]].
-- This data is then serialized by [[Module:Hani-sortkey/serializer]] into [[Module:Hani-sortkey/data/serialized]], as it uses far less memory.
-- NOTE(review): the description above speaks of four sets, while the table
-- below lists five ranges, and the gap sizes do not match its boundaries —
-- confirm and refresh the description.

-- Inclusive codepoint ranges that have a record in the serialized data, in
-- ascending order. Records are stored back to back in this order, so a
-- codepoint's record number is the count of covered codepoints before it.
local hani_sortkey_ranges = {
	{0x3007, 0x3007},
	{0x3400, 0x9FFF},
	{0xFA0E, 0xFA29},
	{0x20000, 0x2EBEF},
	{0x30000, 0x323AF}
}

--[==[
	Looks up the serialized sortkey record for one character.

	Parameters:
		char         - a string (its first codepoint is used) or a codepoint
		               number.
		returnModule - accepted for compatibility with callers; not used.

	Returns the 5-byte record for the codepoint when it lies inside one of the
	covered ranges; otherwise returns the character itself as a string.

	Raises an error when char is neither a string nor a number, or is a string
	from which no codepoint can be read.
]==]
function export.getData(char, returnModule)
	if type(char) == "string" then
		char = cp(char)
	end
	-- The type name has to be the string "number"; comparing against a bare
	-- `number` (an undefined global, i.e. nil) rejected every numeric input.
	if type(char) ~= "number" then
		error("getData must operate on a single character or codepoint.")
	end

	-- Number of covered codepoints in the ranges that lie entirely below char.
	local section_offset = 0
	for _, range in ipairs(hani_sortkey_ranges) do
		local first, last = range[1], range[2]
		if char > last then
			section_offset = section_offset + last - first + 1
		elseif char >= first then
			-- Each record is 5 bytes wide and the records start after main_offset.
			local start = 5 * (section_offset + char - first) + main_offset + 1
			return m_data:sub(start, start + 4)
		else
			-- char is below this range; the ranges are ascending, so no
			-- later range can contain it either.
			break
		end
	end

	-- Not covered by the data: hand the character back unchanged.
	return mw.ustring.char(char)
end

-- Holds the data loaded from "Module:Hani-sortkey/data/unsupported"; stays nil
-- until export.makeSortKey first finds a complete ideographic description sequence.
local unsupported_data

--[==[
	Builds a sortkey for text.

	Parameters:
		text - the string to convert.
		lang - not used by this function.
		sc   - optional script code. When given and not one of Hani, Hans,
		       Hant, Jpan or Kore, the text is simply returned in upper case.

	Returns the sortkey as a string: for every character, the result of
	export.getData; for every complete ideographic description sequence (IDS)
	listed in Module:Hani-sortkey/data/unsupported, the sortkey listed there.
]==]
function export.makeSortKey(text, lang, sc)
	local scripts = {
		Hani = true,
		Hans = true,
		Hant = true,
		Jpan = true,
		Kore = true
	}
	if sc and not scripts[sc] then
		-- Call the library function directly rather than relying on a
		-- non-standard `uupper` method having been added to strings elsewhere.
		return mw.ustring.upper(text)
	end

	local sort = {}

	-- Pre-conversion: visit the text one UTF-8 character at a time and replace
	-- the character when preconvert_data has an entry of the form
	-- "\2" character "\1" replacement "\2". When nothing matches, the callback
	-- returns nil and gsub keeps the character.
	text = text:gsub("[%z\1-\127\194-\244][\128-\191]*", function(character)
		return preconvert_data:match("\2" .. pe(character) .. "\1([^\2]+)\2")
	end)

	-- text is not modified below, so its length is measured once instead of
	-- on every loop iteration.
	local length = mw.ustring.len(text)
	local i = 1
	while i <= length do
		local character = substring(text, i, i)
		--[=[
			If we encounter an ideographic description character (IDC),
			find out if it begins a valid ideographic description sequence (IDS).

			If the IDS is valid and a sortkey for it is listed in
			[[Module:Hani-sortkey/data/unsupported]], then insert
			the sortkey, and move to the next character after the
			IDS.

			Otherwise, insert the IDC into the sortkey and move to the next
			character after the IDC.

			If the IDS is valid and no sortkey for it is found, track it.
		]=]
		if IDchars[character] then
			local j = findEndOfIDS(text, character, i)
			local IDS, data
			if j then
				IDS = substring(text, i, j)
				-- Loaded on first use only.
				unsupported_data = unsupported_data or mw.loadData("Module:Hani-sortkey/data/unsupported")
				data = unsupported_data[IDS]
			end
			if not data then
				if IDS then
					require("Module:debug").track("Hani-sortkey/IDS-without-sortkey")
					mw.log("ideographic description sequence without sortkey: '"
						.. IDS .. "'")
				else
					require("Module:debug").track("Hani-sortkey/invalid-IDS")
					mw.log("invalid ideographic description sequence at the beginning of '"
						.. substring(text, i) .. "'")
				end
			end
			if IDS and data then
				table.insert(sort, data)
				-- Skip the rest of the sequence; the increment below then
				-- moves past its last character.
				i = j
			else
				table.insert(sort, character)
			end
		else
			table.insert(sort, export.getData(character) or character)
		end
		i = i + 1
	end

	return table.concat(sort)
end

return export