- Subpages
- Module:XML/doc
- Module:XML/i18n
- Module:XML/i18n/doc
- Module:XML/testcases
- Module:XML/testcases/books
- Module:XML/testcases/books/dom
- Module:XML/testcases/books/print
- Module:XML/testcases/books/tree
- Module:XML/testcases/doc
- Module:XML/testcases/infobox
- Module:XML/testcases/infobox/dom
- Module:XML/testcases/infobox/print
- Module:XML/testcases/infobox/tree
- Module:XML/testcases/people
- Module:XML/testcases/people/dom
- Module:XML/testcases/people/print
- Module:XML/testcases/people/tree
--- XML parser for valid XML streams in Lua.
-- This module is a fork of the [[github:manoelcampos/xml2lua|xml2lua]]
-- library by [[github:manoelcampos|@manoelcampos]]. It is available
-- under the MIT license, as with the original library.
--
-- The parser provides a partially object-oriented API with its
-- functionality split into tokeniser and handler components.
--
-- The handler instance from @{xml.handlers} is passed to the tokeniser
-- via @{xml.parser} and receives callbacks for each XML element
-- processed (if a suitable handler function is defined). The API is
-- conceptually similar to the SAX API but implemented differently.
--
-- XML data is passed to the parser instance through the
-- @{XMLParser:parse} method. Note that the parser only accepts a
-- single string currently.
--
-- The default XML handler is @{xml.handlers.DOM}, due to its ability
-- to nondestructively parse any XML (representing comments, text nodes
-- and mixed content appropriately). The module provides a serialiser
-- supporting XML DOM root tables at @{xml.serialise}, which has a
-- compatibility layer for XML tree root tables.
--
-- If your application involves bidirectional parsing of data, such as
-- the contents of templates using Wikia's [[w:Help:Infobox|infobox
-- component]], the @{xml.handlers.DOM} handler is recommended. When
-- creating XML configuration files for use in Lua modules, it is
-- recommended to use the @{xml.handlers.Tree} handler which allows for
-- easier node traversal and data extraction.
--
-- ## Features ##
-- * Tokenises well-formed XML (relatively robustly)
-- * Flexible handler-based event API (see @{xml.handlers}
-- documentation).
-- * Parses all XML infoset elements:
-- ** Tags
-- ** Text
-- ** Comments
-- ** CDATA
-- ** XML declarations
-- ** Processing instructions
-- ** DOCTYPE declarations
-- * Provides limited well-formedness checking
-- (checks for basic syntax & balanced tags only)
-- * Flexible whitespace handling (optional)
-- * Entity handling (optional)
--
-- ## Limitations ##
-- * Shallow well-formedness checking only (fails
-- to detect most semantic errors)
-- * Non-validating
-- * No charset handling
-- * No namespace support
--
-- @script xml
-- @alias p
-- @license MIT
-- @release beta
-- @require Module:I18n
-- @require Module:Yesno
-- @version 1.3.5
-- @author Paul Chakravarti (passtheaardvark.com)
-- @author [[github:manoelcampos|Manoel Campos da Silva Filho]]
-- @author [[User:8nml|8nml]]
-- Table exported by this module; public functions are attached to it below.
local p = {}
-- Module dependencies.
-- Localised messages for the 'XML' message set (used for every error string).
local i18n = require('Dev:I18n').loadMessages('XML')
-- Helper used below to normalise boolean-like option values.
local yesno = require('Dev:Yesno')
-- Module variables.
-- Values stored in the `_type` field of nodes built by the DOM handler and
-- inspected by the serialiser.
local XML_DOM_TYPE_COMMENT = 'COMMENT'
local XML_DOM_TYPE_DECL = 'DECL'
local XML_DOM_TYPE_DTD = 'DTD'
local XML_DOM_TYPE_ELEMENT = 'ELEMENT'
local XML_DOM_TYPE_TEXT = 'TEXT'
local XML_DOM_TYPE_PI = 'PI'
-- Parser utilities.
--- Converts a decimal character reference to a character.
-- @function decimalToHtmlChar
-- @param {number|string} code Decimal character code, as captured
--     from a `&#code;` reference.
-- @return {string} The single byte produced by `string.char` when
--     the value lies in the range 0-255. Any other value
--     is handed back as the reference `&#code;`.
-- @local
local function decimalToHtmlChar(code)
    local value = tonumber(code)
    if not (value >= 0 and value < 256) then
        -- Outside the single-byte range: keep the numeric reference.
        return '&#' .. code .. ';'
    end
    return string.char(value)
end
--- Converts a hexadecimal character reference to a character.
-- @function hexadecimalToHtmlChar
-- @param {string} code Hexadecimal character code, as captured
--     from a `&#xcode;` reference.
-- @return {string} The single byte produced by `string.char` when
--     the value lies in the range 0-255. Any other value
--     is handed back as the reference `&#xcode;`.
-- @local
local function hexadecimalToHtmlChar(code)
    local value = tonumber(code, 16)
    if not (value >= 0 and value < 256) then
        -- Outside the single-byte range: keep the numeric reference.
        return '&#x' .. code .. ';'
    end
    return string.char(value)
end
--- Tests whether a field is reachable from a table or its metatables.
-- The table is indexed first; when that yields `nil` the same test is
-- repeated on its metatable, then on that metatable's metatable, and
-- so on until a value is found or the chain ends.
-- @function fexists
-- @param {table} tbl Table to look the field up in (may be
--     `nil`).
-- @param {string} element Name of the function or field.
-- @return {boolean} `true` if a non-nil value was found,
--     `false` otherwise.
-- @local
local function fexists(tbl, element)
    local candidate = tbl
    while candidate ~= nil do
        if candidate[element] ~= nil then
            return true
        end
        candidate = getmetatable(candidate)
    end
    return false
end
--- Reports a parsing error through the configured error handler.
-- Does nothing when `self.options.errorHandler` is not set.
-- @function err
-- @param {table} self XML parser instance.
-- @param[opt] {string} err Localised error message string.
-- @param[opt] {number} pos String character position.
-- @local
local function err(self, err, pos)
    local handler = self.options.errorHandler
    if handler then
        handler(err, pos)
    end
end
--- Trims surrounding whitespace from a string when enabled.
-- @function stripWS
-- @param {table} self XML parser instance.
-- @param {string} s XML text with whitespace.
-- @return {string} `s` passed through `mw.text.trim` if
--     `options.stripWS` is truthy, otherwise `s`
--     unchanged.
-- @local
local function stripWS(self, s)
    if not self.options.stripWS then
        return s
    end
    -- Parentheses keep the result to a single return value.
    return (mw.text.trim(s))
end
--- Expands XML entities in a string.
-- Every key of `self._ENTITIES` is used as a Lua pattern and every value
-- as the matching `string.gsub` replacement (string or function).
--
-- The `&amp;` entry is always applied last. `pairs` visits keys in an
-- unspecified order, and expanding `&amp;` early would turn an escaped
-- reference such as `&amp;lt;` into `&lt;` and then into `<`.
-- @function parseEntities
-- @param {table} self XML parser instance.
-- @param {string} str String whose entities should be expanded.
-- @return {string} String with entities expanded, or `str`
--     unchanged when `options.expandEntities` is
--     falsy.
-- @local
local function parseEntities(self, str)
    if not self.options.expandEntities then
        return str
    end
    local entities = self._ENTITIES
    for pattern, replacement in pairs(entities) do
        if pattern ~= '&amp;' then
            str = string.gsub(str, pattern, replacement)
        end
    end
    -- Ampersands go last so freshly produced '&' never starts a new entity.
    local ampersand = entities['&amp;']
    if ampersand ~= nil then
        str = string.gsub(str, '&amp;', ampersand)
    end
    return str
end
--- Parses a string representing an opening XML tag.
-- The tag name is everything before the first whitespace character;
-- attributes are collected from both double-quoted (`_ATTR1`) and
-- single-quoted (`_ATTR2`) `name=value` pairs, with entities expanded in
-- the values.
-- @function parseTag
-- @param {table} self XML parser instance.
-- @param {string} s Opening tag text.
-- @return {table} A table describing the opening tag and
--     its attribute nodes.
--     * `name` The name of the tag. (string)
--     * `attrs` Map of attribute names to values, or
--       `nil` when the tag has no attributes. (table)
-- @local
local function parseTag(self, s)
    local name = string.gsub(s, self._TAG, '%1')
    local attrs = {}
    -- A separate flag records whether anything was collected, so that an
    -- attribute literally named `_` is stored like any other attribute.
    local found = false
    local function collect(k, v)
        attrs[k] = parseEntities(self, v)
        found = true
    end
    string.gsub(s, self._ATTR1, collect)
    string.gsub(s, self._ATTR2, collect)
    return {
        name = name,
        attrs = found and attrs or nil
    }
end
--- Parses the XML declaration (`<?xml ... ?>`).
-- Reports an error when no processing-instruction match is found from
-- `f.pos`, when the match does not start at the first character of the
-- document, or when attributes are present without a `version`. The
-- handler's `decl` callback is invoked if it exists.
-- @function parseXmlDeclaration
-- @param {table} self XML parser instance.
-- @param {string} str Full XML text being parsed.
-- @param {table} f Auxiliary parser variables; `match`,
--     `endMatch` and `text` are overwritten.
-- @return {table} A table representation of the XML
--     declaration.
-- @todo Check if attributes are valid.
-- @todo Check for version (mandatory).
-- @local
local function parseXmlDeclaration(self, str, f)
    local first, last, body = string.find(str, self._PI, f.pos)
    f.match, f.endMatch, f.text = first, last, body
    if not first then
        err(self, i18n:msg('error-parsing-decl'), f.pos)
    end
    if first ~= 1 then
        -- The declaration is only allowed as the very first thing.
        err(self, i18n:msg('error-parsing-declStart'), f.pos)
    end
    local tag = parseTag(self, body)
    local attrs = tag.attrs
    if attrs and attrs.version == nil then
        err(self, i18n:msg('error-parsing-declAttr'), f.pos)
    end
    if fexists(self.handler, 'decl') then
        self.handler:decl(tag, f.match, f.endMatch)
    end
    return tag
end
--- Parses an XML processing instruction (`<?target ... ?>`).
-- When the handler defines a `pi` callback, the instruction is parsed
-- like a tag and everything following the target name is stored
-- verbatim in `tag.attrs._text` before the callback is invoked.
-- @function parseXmlProcessingInstruction
-- @param {table} self XML parser instance.
-- @param {string} str Full XML text being parsed.
-- @param {table} f Auxiliary parser variables; `match`,
--     `endMatch` and `text` are overwritten.
-- @return {table} A table representation of the processing
--     instruction, or an empty table when the handler
--     has no `pi` callback.
-- @local
local function parseXmlProcessingInstruction(self, str, f)
    f.match, f.endMatch, f.text = string.find(str, self._PI, f.pos)
    if not f.match then
        err(self, i18n:msg('error-parsing-pi'), f.pos)
    end
    if not fexists(self.handler, 'pi') then
        return {}
    end
    -- Parse PI attributes & text.
    local tag = parseTag(self, f.text)
    local remainder = string.sub(f.text, string.len(tag.name) + 1)
    if remainder ~= '' then
        tag.attrs = tag.attrs or {}
        tag.attrs._text = remainder
    end
    self.handler:pi(tag, f.match, f.endMatch)
    return tag
end
--- Parses an XML comment (`<!-- ... -->`).
-- When the handler defines a `comment` callback, the comment body is
-- whitespace-stripped and entity-expanded (both subject to the parser
-- options) and passed to it.
-- @function parseComment
-- @param {table} self XML parser instance.
-- @param {string} str Full XML text being parsed.
-- @param {table} f Auxiliary parser variables; `match`,
--     `endMatch` and `text` are overwritten.
-- @local
local function parseComment(self, str, f)
    f.match, f.endMatch, f.text = string.find(str, self._COMMENT, f.pos)
    if not f.match then
        err(self, i18n:msg('error-parsing-comment'), f.pos)
    end
    if fexists(self.handler, 'comment') then
        f.text = parseEntities(self, stripWS(self, f.text))
        -- The second argument is an unused placeholder. It is `nil`, as in
        -- the `text` and `cdata` callbacks, rather than the global `next`
        -- function that used to be passed here.
        self.handler:comment(f.text, nil, f.match, f.endMatch)
    end
end
--- Utility to parse a string representing XML DTD declarations.
-- The patterns `_DTD1` to `_DTD5` are tried in that order and the first
-- one that matches wins.
--
-- NOTE(review): with the stock patterns `_DTD3` (`<!DOCTYPE%s.->`) matches
-- anything `_DTD4`/`_DTD5` would, so those two are never reached. The
-- order is kept as-is so that match extents do not change.
-- @function _parseDtd
-- @param {table} self XML parser instance.
-- @param {string} str Full XML text being parsed.
-- @param {number} pos Character position index.
-- @return {number} Start index of match (`nil` if no pattern
--     matched).
-- @return {number} End index of match.
-- @return {table} A table representation of the XML
--     DTD declaration with the optional fields
--     `_root`, `_type` (`SYSTEM`/`PUBLIC`), `_name`
--     (public identifier), `_uri` and `_internal`.
-- @local
local function _parseDtd(self, str, pos)
    local dtdPatterns = { self._DTD1, self._DTD2, self._DTD3, self._DTD4, self._DTD5 }
    -- Numeric loop: the order of the attempts is significant.
    for index = 1, 5 do
        local pattern = dtdPatterns[index]
        if pattern then
            local m, e, root, kind, name, uri, internal = string.find(str, pattern, pos)
            if m then
                if kind == 'SYSTEM' then
                    -- SYSTEM declarations have no public identifier, so
                    -- their captures are (root, type, uri[, internal]).
                    name, uri, internal = nil, name, uri
                end
                return m, e, {
                    _root = root,
                    _type = kind,
                    _name = name,
                    _uri = uri,
                    _internal = internal
                }
            end
        end
    end
    return nil
end
--- Parses a DOCTYPE declaration.
-- When the handler defines a `dtd` callback it receives a tag table with
-- `name = 'DOCTYPE'` and `value` set to the declaration text that
-- follows the 10-character `<!DOCTYPE ` prefix, up to but excluding the
-- closing `>`.
-- @function parseDtd
-- @param {table} self XML parser instance.
-- @param {string} str Full XML text being parsed.
-- @param {table} f Auxiliary parser variables; `match` and
--     `endMatch` are overwritten.
-- @local
local function parseDtd(self, str, f)
    -- Only the match bounds are needed. The third result of `_parseDtd` is
    -- deliberately dropped (it used to be stored in an accidental global).
    local first, last = _parseDtd(self, str, f.pos)
    f.match, f.endMatch = first, last
    if not f.match then
        err(self, i18n:msg('error-parsing-dtd'), f.pos)
    end
    if fexists(self.handler, 'dtd') then
        local tag = { name = 'DOCTYPE', value = string.sub(str, f.match + 10, f.endMatch - 1) }
        self.handler:dtd(tag, f.match, f.endMatch)
    end
end
--- Parses a CDATA section (`<![CDATA[ ... ]]>`).
-- The section body is handed to the handler's `cdata` callback, if one
-- exists, without whitespace stripping or entity expansion.
-- @function parseCdata
-- @param {table} self XML parser instance.
-- @param {string} str Full XML text being parsed.
-- @param {table} f Auxiliary parser variables; `match`,
--     `endMatch` and `text` are overwritten.
-- @local
local function parseCdata(self, str, f)
    local first, last, section = string.find(str, self._CDATA, f.pos)
    f.match, f.endMatch, f.text = first, last, section
    if first == nil then
        err(self, i18n:msg('error-parsing-cdata'), f.pos)
    end
    if not fexists(self.handler, 'cdata') then
        return
    end
    self.handler:cdata(f.text, nil, f.match, f.endMatch)
end
--- Parses a normal XML tag (start tag, end tag or self-closing tag).
-- Start tags are pushed onto the parser stack and end tags pop it, so
-- that unbalanced documents can be detected. Both happen regardless of
-- which callbacks the handler defines; the `starttag` / `endtag`
-- callbacks themselves are only invoked when present.
-- @function parseNormalTag
-- @param {table} self XML parser instance.
-- @param {string} str Full XML text being parsed.
-- @param {table} f Auxiliary parser variables. `tagstr`,
--     `endMatch` and `endt2` may be extended when the
--     tag text ends inside an unterminated attribute.
-- @return {table} A table representation of the XML
--     tag.
-- @todo Support `>` tag in attributes.
-- @local
local function parseNormalTag(self, str, f)
    -- Check for errors.
    while true do
        -- If there isn't an attribute without closing quotes (single
        -- or double quotes), then break to follow the normal
        -- processing of the tag.
        -- Otherwise, try to find where the quotes close.
        f.errStart, f.errEnd = string.find(f.tagstr, self._ATTRERR1)
        if f.errEnd == nil then
            f.errStart, f.errEnd = string.find(f.tagstr, self._ATTRERR2)
            if f.errEnd == nil then
                break
            end
        end
        f.extStart, f.extEnd, f.endt2 = string.find(str, self._TAGEXT, f.endMatch + 1)
        if not f.extEnd then
            -- No further '>' in the document: report a parse error instead
            -- of failing on arithmetic with a nil index, and stop scanning.
            err(self, i18n:msg('error-parsing-xml'), f.pos)
            break
        end
        f.tagstr = f.tagstr .. string.sub(str, f.endMatch, f.extEnd - 1)
        f.endMatch = f.extEnd
    end
    -- Extract tag name and attrs.
    local tag = parseTag(self, f.tagstr)
    if f.endt1 == '/' then
        if tag.attrs then
            -- Shouldn't have any attributes in endtag
            err(self, i18n:msg('error-parsing-endtag', tag.name), f.pos)
        end
        -- Always pop: the matching start tag was pushed unconditionally,
        -- so skipping this for handlers without `endtag` would leave the
        -- stack non-empty and every document would look incomplete.
        if table.remove(self._stack) ~= tag.name then
            err(self, i18n:msg('error-parsing-unmatched', tag.name), f.pos)
        end
        if fexists(self.handler, 'endtag') then
            self.handler:endtag(tag, f.match, f.endMatch)
        end
    else
        table.insert(self._stack, tag.name)
        if fexists(self.handler, 'starttag') then
            self.handler:starttag(tag, f.match, f.endMatch)
        end
        -- Self-closing tag
        if f.endt2 == '/' then
            table.remove(self._stack)
            if fexists(self.handler, 'endtag') then
                self.handler:endtag(tag, f.match, f.endMatch)
            end
        end
    end
    return tag
end
--- Type-agnostic XML tag parser.
-- Inspects the first characters of `f.tagstr` to decide what kind of
-- markup was found and delegates to the matching parsing routine:
-- XML declaration, processing instruction, comment, DOCTYPE, CDATA or,
-- failing all of those, a normal tag.
-- @function parseTagType
-- @param {table} self XML parser instance.
-- @param {string} str Full XML text being parsed.
-- @param {table} f Auxiliary parser variables.
-- @local
local function parseTagType(self, str, f)
    local tagstr = f.tagstr
    local prefix8 = string.sub(tagstr, 1, 8)
    local parse
    -- Test for tag type
    if string.find(string.sub(tagstr, 1, 5), '?xml%s') then
        parse = parseXmlDeclaration
    elseif string.sub(tagstr, 1, 1) == '?' then
        parse = parseXmlProcessingInstruction
    elseif string.sub(tagstr, 1, 3) == '!--' then
        parse = parseComment
    elseif prefix8 == '!DOCTYPE' then
        parse = parseDtd
    elseif prefix8 == '![CDATA[' then
        parse = parseCdata
    else
        parse = parseNormalTag
    end
    parse(self, str, f)
end
--- Tag parsing iterator check (first pass).
-- Matches the next run of text plus the tag that follows it, starting at
-- `f.pos`, and stores the results in `f`. When nothing matches, the rest
-- of the input must be whitespace and the tag stack empty; otherwise an
-- error is reported.
-- @function getNextTag
-- @param {table} self XML parser instance.
-- @param {string} str Full XML text being parsed.
-- @param {table} f Auxiliary parser variables; `match`,
--     `endMatch`, `text`, `endt1`, `tagstr` and
--     `endt2` are overwritten.
-- @return {boolean} Boolean for whether there is a next
--     tag.
-- @todo Fix exceptions below (multiple passes).
-- @local
local function getNextTag(self, str, f)
    f.match, f.endMatch, f.text, f.endt1, f.tagstr, f.endt2 = string.find(str, self._XML, f.pos)
    if not f.match then
        local onlyWhitespace = string.find(str, self._WS, f.pos)
        if not onlyWhitespace then
            -- Unparsable text
            err(self, i18n:msg('error-parsing-xml'), f.pos)
        elseif #self._stack == 0 then
            -- No more text and every tag was closed: parsing is done.
            return false
        else
            -- No more text, but some tags are still open.
            err(self, i18n:msg('error-parsing-incomplete'), f.pos)
        end
    end
    -- Normalise the fields so callers never see nil after a failed match.
    f.text = f.text or ''
    f.tagstr = f.tagstr or ''
    f.match = f.match or 0
    return f.endMatch ~= nil
end
--- Default error handler for invalid XML.
-- Raises a Lua error whose message is built from the localised
-- `error-message-format` string, the supplied message and the position.
-- @function defaultErrorHandler
-- @param[opt] {string} msg Error message specifying XML item type.
--     Defaults to the localised `error-parsing`
--     message.
-- @param[opt] {number} pos String character position. Defaults to 0.
-- @local
local function defaultErrorHandler(msg, pos)
    local message = msg or i18n:msg('error-parsing')
    local position = tostring(pos or 0)
    error(i18n:msg('error-message-format', message, position))
end
-- Serialiser utilities.
--- Generates an XML attribute string from an `_attr` table.
-- Each pair is written as ` key="value"`. Double quotes and `<` inside a
-- value are escaped (`&quot;`, `&lt;`) because emitting them raw would
-- produce malformed markup. Ampersands are left untouched, since the
-- value may or may not already contain entity references depending on
-- the parser's `expandEntities` option.
-- @function serialiseAttr
-- @param {table} tbl Attribute table field `_attr` from
--     an XML attribute table representation
--     (may be `nil`).
-- @return {string} XML string representation of the tag
--     attributes; empty when there are none. The
--     attribute order follows `pairs` and is
--     therefore unspecified.
-- @local
local function serialiseAttr(tbl)
    local parts = {}
    for k, v in pairs(tbl or {}) do
        -- tostring() lets non-string values (numbers, booleans) through.
        local value = tostring(v):gsub('"', '&quot;'):gsub('<', '&lt;')
        parts[#parts + 1] = ' ' .. k .. '="' .. value .. '"'
    end
    return table.concat(parts)
end
-- Handler utilities.
--- DOM handler constructor.
-- Builds the state table of a DOM handler: the normalised node options,
-- an empty `ROOT` container as the current node and an empty element
-- stack.
-- @function initDOMHandler
-- @param {table|nil} options DOM handler options. Each of
--     `commentNode`, `piNode`, `dtdNode` and
--     `declNode` defaults to `true` when absent
--     and is otherwise normalised with `yesno`.
-- @return {table} Fresh DOM handler state.
-- @constructor
-- @local
local function initDOMHandler(options)
    options = options or {}
    -- Resolves one boolean-like option, defaulting to true when unset.
    local function flag(name)
        local value = options[name]
        if value == nil then
            return true
        end
        return yesno(value, false)
    end
    return {
        options = {
            commentNode = flag('commentNode'),
            piNode = flag('piNode'),
            dtdNode = flag('dtdNode'),
            declNode = flag('declNode')
        },
        current = {
            _children = { n = 0 },
            _type = 'ROOT'
        },
        _stack = {}
    }
end
--- Tree handler constructor.
-- Builds the state table of a tree handler: an empty root table, the
-- `noreduce` option and a stack whose only entry is that root.
-- @function initTreeHandler
-- @param {table|nil} options Tree handler options. `noreduce`
--     is used as given when it is a table and
--     replaced by an empty table otherwise.
-- @return {table} Fresh tree handler state.
-- @constructor
-- @local
local function initTreeHandler(options)
    options = options or {}
    local noreduce = options.noreduce
    if type(noreduce) ~= 'table' then
        noreduce = {}
    end
    local root = {}
    return {
        root = root,
        options = { noreduce = noreduce },
        _stack = { root, n = 1 }
    }
end
--- Print handler constructor.
-- Builds the state table of a print handler, which consists solely of
-- the normalised node options.
-- @function initPrintHandler
-- @param {table|nil} options Print handler options. Each of
--     `commentNode`, `piNode`, `dtdNode` and
--     `declNode` defaults to `true` when absent
--     and is otherwise normalised with `yesno`.
-- @return {table} Fresh print handler state.
-- @constructor
-- @local
local function initPrintHandler(options)
    options = options or {}
    local resolved = {}
    for _, name in ipairs({ 'commentNode', 'piNode', 'dtdNode', 'declNode' }) do
        local value = options[name]
        if value == nil then
            resolved[name] = true
        else
            resolved[name] = yesno(value, false)
        end
    end
    return { options = resolved }
end
--- Gets the first key of a table.
-- "First" means the first key produced by `pairs`, whose order is not
-- specified for non-sequence keys.
-- @function getFirstKey
-- @param {table} tbl Table to get the first key from.
-- @return {string|number} The table's first key, nil if
--     the table is empty or `tbl` parameter if it
--     isn't a table.
-- @local
local function getFirstKey(tbl)
    if type(tbl) ~= 'table' then
        return tbl
    end
    for key in pairs(tbl) do
        return key
    end
    return nil
end
--- Class providing the actual XML parser.
-- @type XMLParser
local XmlParser = {}
XmlParser.__index = XmlParser
-- Private attributes with XML patterns (Lua patterns, not regular
-- expressions).
-- Leading text, optional `/`, tag text, optional `/` before `>`.
XmlParser._XML = '^([^<]*)<(%/?)([^>]-)(%/?)>'
-- Attribute with a double-quoted / single-quoted value.
XmlParser._ATTR1 = '([%w-:_]+)%s*=%s*"(.-)"'
XmlParser._ATTR2 = "([%w-:_]+)%s*=%s*'(.-)'"
XmlParser._CDATA = '<%!%[CDATA%[(.-)%]%]>'
XmlParser._PI = '<%?(.-)%?>'
XmlParser._COMMENT = '<!%-%-(.-)%-%->'
-- Tag name: everything before the first whitespace character.
XmlParser._TAG = '^(.-)%s.*'
XmlParser._LEADINGWS = '^%s+'
XmlParser._TRAILINGWS = '%s+$'
XmlParser._WS = '^%s*$'
XmlParser._DTD1 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*(%b[])%s*>'
XmlParser._DTD2 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*(%b[])%s*>'
XmlParser._DTD3 = '<!DOCTYPE%s.->'
XmlParser._DTD4 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*>'
XmlParser._DTD5 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*>'
-- Attribute pattern with non-closing double quotes.
-- NOTE(review): Lua patterns have no lazy `+?` quantifier. In `=+?` the
-- `?` is a literal question mark, so this only matches text containing
-- `=?` before the quote. Left unchanged because fixing it would activate
-- the tag-extension loop in the tag parser; confirm the intent first.
XmlParser._ATTRERR1 = '=+?%s*"[^"]*$'
-- Attribute pattern with non-closing single quotes (same caveat as
-- above regarding `=+?`).
XmlParser._ATTRERR2 = '=+?%s*\'[^\']*$'
-- Closing tag pattern (e.g. `</person>` or `/>`).
XmlParser._TAGEXT = '(%/?)>'
-- Entity expansion table: each key is a Lua pattern and each value the
-- `string.gsub` replacement used for it. The five predefined XML entities
-- are listed by name; numeric character references are converted by the
-- helper functions.
XmlParser._ENTITIES = {
    ['&lt;'] = '<',
    ['&gt;'] = '>',
    ['&amp;'] = '&',
    ['&quot;'] = '"',
    ['&apos;'] = "'",
    ['&#(%d+);'] = decimalToHtmlChar,
    ['&#x(%x+);'] = hexadecimalToHtmlChar
}
--- Instantiates a XmlParser object.
-- The instance starts with an empty tag stack and inherits every
-- pattern and method from `XmlParser` through its metatable.
-- @param {table} _handler Handler object to be used to
--     convert the XML string to another format. See
--     the available handlers at @{xml.handlers}.
-- @param _options Options for this XmlParser instance,
--     defined in @{xml.parser}.
-- @return {table} The new parser instance.
function XmlParser.new(_handler, _options)
    local obj = setmetatable({}, XmlParser)
    obj.handler = _handler
    obj.options = _options
    obj._stack = {}
    obj.__index = XmlParser
    return obj
end
--- Main function which starts the XML parsing process.
-- Repeatedly fetches the next "text followed by tag" chunk, reports the
-- text (if any) to the handler's `text` callback and then dispatches the
-- tag to the matching parsing routine, until the input is exhausted.
-- @param {string} str the XML string to parse
-- @param[opt] {boolean} parseAttributes indicates if tag
--     attributes should be parsed or not. The value
--     is stored on the handler as
--     `parseAttributes`. Default: `true`.
-- @error Raises the localised `error-parser-method` message
--     when not called as a method of an XmlParser
--     instance.
function XmlParser:parse(str, parseAttributes)
    if type(self) ~= 'table' or getmetatable(self) ~= XmlParser then
        error(i18n:msg('error-parser-method'))
    end
    if parseAttributes == nil then
        parseAttributes = true
    end
    self.handler.parseAttributes = parseAttributes
    -- Stores auxiliary parser variables such as string.find results.
    local f = {
        -- string.find return
        match = 0,
        endMatch = 0,
        -- text, end1, tagstr, end2,
        -- string.find parameters and auxiliary variables
        pos = 1
        -- startText, endText,
        -- errStart, errEnd, extStart, extEnd,
    }
    while f.match do
        if not getNextTag(self, str, f) then
            break
        end
        -- Handle leading text
        local length = string.len(f.text)
        f.startText = f.match
        f.endText = f.match + length - 1
        -- From here on `f.match` points at the tag that follows the text.
        f.match = f.match + length
        f.text = parseEntities(self, stripWS(self, f.text))
        if f.text ~= '' and fexists(self.handler, 'text') then
            -- Report the span of the text itself: passing `f.match` here
            -- would give a start index located after the end index.
            self.handler:text(f.text, nil, f.startText, f.endText)
        end
        parseTagType(self, str, f)
        f.pos = f.endMatch + 1
    end
end
--- Parses an XML string into an abstract syntax tree or event trace.
-- This function includes logic to attach a handler to the XML parser,
-- making it much more convenient than @{xml.parser}.
-- @function p.parse
-- @param {string} str XML string to be parsed.
-- @param {string|table} handler Handler to use. Default:
--     `"DOM"`. Accepts the following values:
--     * @{xml.handlers.DOM|"DOM"} - DOM handler (typed).
--     * @{xml.handlers.Tree|"Tree"} - tree handler.
--     * @{xml.handlers.Print|"Print"} - parser logging.
--     * Custom handler in the form of a Lua table.
-- @param[opt] {table} parser_opts Parser configuration options.
--     Defaults are listed in @{xml.parser} options.
-- @param[opt] {table} handler_opts Handler configuration options,
--     forwarded to the handler's `new` constructor
--     when the handler is given by name.
--     Defaults are listed in @{xml.handler} options.
-- @error 'XML handler "$handler" not found'
-- @return {table} Lua representation of XML root structure
--     (the handler's `root` field).
function p.parse(str, handler, parser_opts, handler_opts)
    handler = handler or 'DOM'
    parser_opts = type(parser_opts) == 'table' and parser_opts or {}
    handler_opts = type(handler_opts) == 'table' and handler_opts or {}
    local handler_obj
    if type(handler) == 'table' then
        -- Custom handler object supplied by the caller.
        handler_obj = handler
    else
        local class = p.handlers[handler]
        if not class then
            error(i18n:msg('error-handler-fetch', handler))
        end
        if type(class.new) == 'function' then
            -- Pass the handler options on; they used to be dropped here.
            handler_obj = class:new(handler_opts)
        else
            -- NOTE(review): the handler documentation states that the
            -- Print handler has no `new`; such handlers are used directly.
            handler_obj = class
        end
    end
    local parser = p.parser(handler_obj, parser_opts)
    parser:parse(str)
    return handler_obj.root
end
--- Converts a Lua XML DOM tree to a XML string representation.
-- Tables carrying both `_name` and `_type` are treated as DOM nodes;
-- every other table is treated as a tree-handler table. The input is
-- never modified.
-- @function p.serialise
-- @param {table} tbl DOM or tree root for XML conversion.
--     This parameter is the root table generated by a
--     @{xml.handlers.DOM} or @{xml.handlers.Tree}
--     parser instance.
-- @param[opt] {number} level Only used internally, when the
--     function is called recursively to print
--     indentation.
-- @param[opt] {string} name Tag name for tree tables. At the top
--     level it produces a wrapping element; in
--     recursive calls it names array items.
-- @error 'cannot serialise this value. Are you using a
--     handler other than "xml.handlers.DOM" and
--     "xml.handlers.Tree"?'
-- @return {string} XML string representation for table.
function p.serialise(tbl, level, name)
    if type(tbl) ~= 'table' then
        error(i18n:msg('error-serialise'))
    end
    -- DOM table serialiser. Very stable and supports text nodes.
    if tbl._name and tbl._type then
        level = level or 1
        local children = tbl._children or {}
        local has_children = #children ~= 0
        local indent = string.rep(' ', level * 4)
        local ret = { '<' .. tbl._name .. serialiseAttr(tbl._attr) .. (has_children and '>' or ' />') }
        -- ipairs keeps document order and ignores the bookkeeping key `n`.
        for _, v in ipairs(children) do
            local node_type = v._type
            if node_type == XML_DOM_TYPE_ELEMENT then
                local inner = v._children or {}
                if #inner == 1 and inner[1]._type == XML_DOM_TYPE_TEXT then
                    -- An element holding a single text node stays on one line.
                    ret[#ret + 1] = indent .. '<' .. v._name .. serialiseAttr(v._attr) .. '>' .. inner[1]._text .. '</' .. v._name .. '>'
                else
                    ret[#ret + 1] = indent .. p.serialise(v, level + 1)
                end
            elseif node_type == XML_DOM_TYPE_COMMENT then
                ret[#ret + 1] = indent .. '<!-- ' .. v._text .. ' -->'
            elseif node_type == XML_DOM_TYPE_TEXT or node_type == 'CDATA' then
                -- The bundled DOM handler stores CDATA sections as TEXT
                -- nodes; 'CDATA' covers hand-built nodes (the former
                -- comparison used an undefined variable).
                ret[#ret + 1] = indent .. v._text
            elseif node_type == XML_DOM_TYPE_PI then
                local attr = v._attr
                if type(attr) == 'table' and attr._text ~= nil then
                    -- `_text` is the raw instruction text after the target
                    -- name; emitting it reproduces the original PI.
                    ret[#ret + 1] = indent .. '<?' .. v._name .. attr._text .. '?>'
                else
                    ret[#ret + 1] = indent .. '<?' .. v._name .. serialiseAttr(attr) .. '?>'
                end
            end
        end
        if has_children then
            ret[#ret + 1] = string.rep(' ', (level - 1) * 4) .. '</' .. tbl._name .. '>'
        end
        return table.concat(ret, '\n')
    end
    -- Tree table serialiser. More versatile but rather unstable.
    level = level or -2
    local first_level = level
    local indent = string.rep(' ', level * 4)
    local ret = {}
    if level == -2 and name then
        ret[1] = '<' .. name .. serialiseAttr(tbl._attr) .. '>'
    end
    for k, v in pairs(tbl) do
        if k == '_attr' then
            -- Attributes are written by whoever emits the opening tag, so
            -- the key is skipped here instead of being deleted from the
            -- caller's table.
        elseif type(v) == 'table' then
            if type(k) == 'number' then
                -- Numeric keys: `tbl` is an array of same-named elements.
                ret[#ret + 1] = indent .. '<' .. name .. serialiseAttr(v._attr) .. '>\n' .. p.serialise(v, level + 1, name) .. '\n' .. indent .. '</' .. name .. '>'
            else
                -- Otherwise the children are single nodes of different types.
                level = level + 1
                if type(getFirstKey(v)) == 'number' then
                    ret[#ret + 1] = indent .. p.serialise(v, level, k)
                else
                    ret[#ret + 1] = indent .. '<' .. k .. serialiseAttr(v._attr) .. '>\n' .. p.serialise(v, level + 1, k) .. '\n' .. indent .. '</' .. k .. '>'
                end
            end
        else
            ret[#ret + 1] = indent .. '<' .. k .. '>' .. tostring(v) .. '</' .. k .. '>'
        end
    end
    if name and first_level == -2 then
        ret[#ret + 1] = '</' .. name .. '>\n'
    end
    return table.concat(ret, '\n')
end
--- Loads an XML file from a specified path.
-- If the file is in the Module namespace, the loader assumes the page
-- is a Lua module returning a string. Otherwise, the loader will fetch
-- the page's raw text, removing any leading non-XML comment/shebang.
-- @function p.load
-- @param {string} filepath XML file target path (including
--     namespace).
-- @error 'file "$filepath" does not contain XML'
--     * The title `filepath` is invalid or the page
--       does not exist.
--     * The module `filepath` does not exist or does
--       not export a string.
-- @return {string} The trimmed contents of the XML file.
function p.load(filepath)
    local title = mw.title.new(filepath)
    -- An invalid title yields nil; treat it like a missing page.
    local status = title ~= nil
    local content
    if title then
        if title.namespace == 828 or filepath:find('^Dev:') then
            status, content = pcall(require, filepath)
        else
            content = title:getContent()
            -- getContent() returns nil for a non-existent page.
            if content then
                content = content
                    :gsub('^%s*#![^\n]*\n', '') -- shebang
                    :gsub('^%s*//[^\n]*\n', '') -- inline non-HTML comment
                    :gsub('^%s*/%*[^/]*/\n', '') -- multiline non-HTML comment
            end
        end
    end
    if status and type(content) == 'string' then
        return mw.text.trim(content)
    end
    error(i18n:msg('error-file-load', filepath or ''))
end
--- Instantiates a @{XmlParser} object to parse a XML string.
-- The supplied options table is normalised in place and stored on the
-- parser instance.
-- @function p.parser
-- @param {table} handler Handler object to be used to
--     convert the XML string to another format,
--     usually from @{xml.handlers}.
-- @param[opt] {table} options Options for parsing XML.
-- @param[opt] {boolean} options.stripWS
--     Strip non-significant whitespace (leading or
--     trailing) and do not generate events for empty
--     text elements. Default: `true`.
-- @param[opt] {boolean} options.expandEntities
--     Expand entities (standard entities and single
--     character numeric entities only currently -
--     could be extended at runtime if a suitable DTD
--     parser added elements to the table (see
--     `XMLParser._ENTITIES`). May also be possible to
--     expand multibyte entities for UTF-8 only.
--     Default: `true`.
-- @param[opt] {function} options.errorHandler
--     Custom error handler function, called with the
--     message and the character position. Defaults to
--     a handler that raises a Lua error.
-- @error Raises the localised `error-parser-call` message when
--     no handler is given or when the module table
--     itself is passed (i.e. the function was called
--     with `:` instead of `.`).
-- @return An XML parser instance used to parse the XML.
function p.parser(handler, options)
    -- `handler == p` detects a method-style call. The nil check keeps the
    -- behaviour of the former comparison against an undefined variable.
    if handler == nil or handler == p then
        error(i18n:msg('error-parser-call'))
    end
    options = options or {}
    options.stripWS = type(options.stripWS) == 'nil'
        and true
        or yesno(options.stripWS, false)
    options.expandEntities = type(options.expandEntities) == 'nil'
        and true
        or yesno(options.expandEntities, false)
    options.errorHandler = type(options.errorHandler) == 'function'
        and options.errorHandler
        or defaultErrorHandler
    return XmlParser.new(handler, options)
end
--- Handler object, used to generate parser output.
-- @type Handler
--- Instantiates a new handler object.
-- Each instance can handle a single XML string.
-- By using such a constructor, you can parse multiple XML files in
-- the same application.
-- @function Handler:new
-- @param[opt] {table} options Handler configuration options.
-- @return {Handler} Handler object instance.
-- @note This method is not available in
-- @{xml.handlers.Print}.
--- Parses a start tag.
-- @function Handler:starttag
-- @param {table} tag A table describing the opening tag
-- and its attribute nodes.
-- @param {string} tag.name The name of the tag.
-- @param {table} tag.attrs The attribute nodes of the tag.
-- @param[opt] {number} s Start index of match.
-- @param[opt] {number} e End index of match.
--- Parses an end tag.
-- @function Handler:endtag
-- @param {table} tag A table describing the closing tag
-- and its attribute nodes.
-- @param {string} tag.name The name of the tag.
-- @param {table} tag.attrs The attribute nodes of the tag.
-- @param[opt] {number} s Start index of match.
-- @param[opt] {number} e End index of match.
--- Parses the text content of a tag.
-- @function Handler:text
-- @param {string} text Text content to process.
-- @param[opt] {number} s Start index of match.
-- @param[opt] {number} e End index of match.
--- Parses a comment tag.
-- @function Handler:comment
-- @param {string} text Comment text to process.
-- @param[opt] {number} s Start index of match.
-- @param[opt] {number} e End index of match.
--- Parses a XML processing instruction (PI) tag
-- @function Handler:pi
-- @param {table} tag A table describing the opening tag
-- and its attribute nodes.
-- @param {string} tag.name The name of the tag.
-- @param {table} tag.attrs The attribute nodes of the tag.
-- @param[opt] {number} s Start index of match.
-- @param[opt] {number} e End index of match.
--- Parse the XML declaration line (indicating the XML version).
-- @function Handler:decl
-- @param {table} tag A table describing the opening tag
-- and its attribute nodes.
-- @param {string} tag.name The name of the tag.
-- @param {table} tag.attrs The attribute nodes of the tag.
-- @param[opt] {number} s Start index of match.
-- @param[opt] {number} e End index of match.
--- Parses a DTD tag.
-- @function Handler:dtd
-- @param {table} tag A table describing the opening tag
-- and its attribute nodes.
-- @param {string} tag.name The name of the tag (`DOCTYPE`).
-- @param {string} tag.value The text of the declaration.
-- @param[opt] {number} s Start index of match.
-- @param[opt] {number} e End index of match.
--- Parses a CDATA section.
-- @function Handler:cdata
-- @param {string} text Text content to process.
-- @param[opt] {number} s Start index of match.
-- @param[opt] {number} e End index of match.
--- XML handlers for conversion logic in the @{xml.parser|XML parser}.
-- Handlers are looked up in this table by name when a handler name is
-- passed to `p.parse`.
-- @table p.handlers
p.handlers = {}
--- @{Handler} to generate a DOM-like node tree structure.
-- The tree structure has a single ROOT node parent, and is capable of
-- representing any valid XML document.
-- Each node is a table comprising the fields below:
-- * `_name` - element name (string)
-- * `_type` - any of `'ROOT'`, `'ELEMENT'`, `'TEXT'`,
-- `'COMMENT'`, `'PI'`, `'DECL'`, `'DTD'` (string)
-- ** `PI` - XML Processing Instruction tag.
-- ** `DECL` - XML declaration tag
-- * `_attr` - node attributes - see callback API (table)
-- * `_parent` - parent node (table)
-- * `_children` - child nodes (table)
-- NOTE(review): none of the DOM callbacks defined with this table assign
-- `_parent`; confirm whether the field is set anywhere else.
-- @table p.handlers.DOM
-- The class table is itself initialised with default handler state.
p.handlers.DOM = initDOMHandler()
-- Lets instances created by the constructor inherit the callbacks.
p.handlers.DOM.__index = p.handlers.DOM
--- Handles a start tag by creating an ELEMENT node.
-- The first element seen becomes `self.root`. The node is appended to
-- the children of the current container, pushed onto the element stack
-- and made the new current container.
-- @param {table} tag Tag description with `name` and `attrs`.
function p.handlers.DOM:starttag(tag)
    local node = {
        _type = XML_DOM_TYPE_ELEMENT,
        _name = tag.name,
        _attr = tag.attrs,
        _children = { n = 0 }
    }
    if self.root == nil then
        self.root = node
    end
    -- Resolve the parent before pushing the node. With no current
    -- container, the old post-push fallback resolved to the node itself
    -- and made it its own child.
    local parent = self.current
    table.insert(self._stack, node)
    if parent then
        table.insert(parent._children, node)
    end
    self.current = node
end
--- Handles an end tag by closing the innermost open element.
-- Once the outermost element has been closed, a fresh ROOT-typed
-- container becomes the current node, so that comments, processing
-- instructions or text following the root element do not fail for lack
-- of a container.
-- @param {table} tag Tag description with `name`.
-- @param[opt] {number} s Start index of match, used in the error
--     message.
-- @error Raises the localised `error-parsing-unmatched` message when
--     the tag does not match the innermost open
--     element (or when no element is open).
function p.handlers.DOM:endtag(tag, s)
    -- Container tag node for current tag.
    local prev = self._stack[#self._stack]
    if prev == nil or tag.name ~= prev._name then
        error(i18n:msg('error-parsing-unmatched', tostring(s or 0) .. ':' .. tag.name))
    end
    table.remove(self._stack)
    self.current = self._stack[#self._stack]
        or { _type = 'ROOT', _children = { n = 0 } }
end
--- Handles character data by appending a TEXT node to the current
-- container.
-- @param {string} text Text content.
function p.handlers.DOM:text(text)
    local node = {
        _type = XML_DOM_TYPE_TEXT,
        _text = text
    }
    table.insert(self.current._children, node)
end
--- Handles a comment by appending a COMMENT node to the current
-- container, unless the `commentNode` option is disabled.
-- @param {string} text Comment text.
function p.handlers.DOM:comment(text)
    if self.options.commentNode then
        local node = {
            _type = XML_DOM_TYPE_COMMENT,
            _text = text
        }
        table.insert(self.current._children, node)
    end
end
--- Handles a processing instruction by appending a PI node to the
-- current container, unless the `piNode` option is disabled.
-- @param {table} tag Tag description with `name` and `attrs`.
function p.handlers.DOM:pi(tag)
    if self.options.piNode then
        local node = {
            _type = XML_DOM_TYPE_PI,
            _name = tag.name,
            _attr = tag.attrs
        }
        table.insert(self.current._children, node)
    end
end
--- Handles an XML declaration by appending a DECL node.
-- Nothing is recorded unless the `declNode` option is truthy.
-- @function p.handlers.DOM:decl
-- @param {table} tag Tag data; the `name` and `attrs` fields are read.
function p.handlers.DOM:decl(tag)
    if self.options.declNode then
        local declaration = {
            _type = XML_DOM_TYPE_DECL,
            _name = tag.name,
            _attr = tag.attrs,
        }
        table.insert(self.current._children, declaration)
    end
end
--- Handles a DTD declaration by appending a DTD node.
-- Nothing is recorded unless the `dtdNode` option is truthy.
-- @function p.handlers.DOM:dtd
-- @param {table} tag Tag data; the `name` and `attrs` fields are read.
function p.handlers.DOM:dtd(tag)
    if self.options.dtdNode then
        local doctype = {
            _type = XML_DOM_TYPE_DTD,
            _name = tag.name,
            _attr = tag.attrs,
        }
        table.insert(self.current._children, doctype)
    end
end
--- Handles a CDATA section by appending it as a TEXT node.
-- The section is stored with its `<![CDATA[` / `]]>` delimiters
-- wrapped around the content.
-- @function p.handlers.DOM:cdata
-- @param {string} section CDATA content without its delimiters.
function p.handlers.DOM:cdata(section)
    local wrapped = '<![CDATA[' .. section .. ']]>'
    table.insert(self.current._children, {
        _type = XML_DOM_TYPE_TEXT,
        _text = wrapped,
    })
end
--- Creates a fresh DOM handler instance.
-- The instance state comes from `initDOMHandler`; methods are
-- inherited from the table this constructor is called on.
-- @function p.handlers.DOM:new
-- @param {table} options Handler options for parsing.
-- @param[opt] {boolean} options.commentNode
--        Whether to include comment nodes. Default: `true`.
-- @param[opt] {boolean} options.piNode
--        Whether to include processing instruction nodes.
--        Default: `true`.
-- @param[opt] {boolean} options.dtdNode
--        Whether to include DTD declaration nodes. Default:
--        `true`.
-- @param[opt] {boolean} options.declNode
--        Whether to include XML declaration nodes. Default:
--        `true`.
-- @return {Handler} DOM handler instance.
-- @constructor
function p.handlers.DOM:new(options)
    local handler = initDOMHandler(options)
    handler.__index = self
    return setmetatable(handler, self)
end
--- @{Handler} that builds a natural, table-based tree.
--
-- Many XML formats are supported by this handler. The XML structure
-- is mapped onto a recursive map from node names to child elements
-- (either a string holding text, or a table of values).
--
-- A lone child element is inserted under a named key. When there are
-- several elements they are inserted as array elements (sometimes it
-- is preferable to always insert elements as an array element; this
-- can be requested per element through the options). Attributes are
-- inserted as a child element under the key `'_attr'`.
--
-- The format is generally quite useful, with these limitations:
--  * Tag/text & CDATA elements are processed - all others are
--    ignored.
--  * `Mixed-Content` XML behaves unpredictably.
--  * If a leaf element has both a text element and attributes, the
--    text must be accessed through an array element (to provide a
--    container for the attribute).
-- @table p.handlers.Tree
do
    -- The prototype is its own `__index`, so instances created with
    -- this table as metatable inherit its methods.
    local Tree = initTreeHandler()
    Tree.__index = Tree
    p.handlers.Tree = Tree
end
--- Recursively collapses single-child containers in the parsed tree.
-- Children are reduced first (depth-first). A table with exactly one
-- array element, no `_attr` field and no `noreduce` entry for its key
-- is then replaced in its parent by that single element. Any other
-- table only has its numeric `n` sibling-count marker removed.
-- @function p.handlers.Tree:reduce
-- @param {table} node Table to reduce in place.
-- @param {string|number|nil} key Key under which `node` is stored in
--        `parent`; `nil` for the root call.
-- @param {table|nil} parent Table containing `node`; `nil` for the
--        root call.
function p.handlers.Tree:reduce(node, key, parent)
    -- Only existing keys are reassigned during traversal, which
    -- `pairs` permits.
    for childKey, child in pairs(node) do
        if type(child) == 'table' then
            self:reduce(child, childKey, node)
        end
    end

    local noreduce = self.options.noreduce
    local pinned = noreduce and noreduce[key]

    -- The root call has no parent, so it can never be collapsed.
    if parent ~= nil and #node == 1 and not pinned and node._attr == nil then
        parent[key] = node[1]
    elseif type(node.n) == 'number' then
        -- Strip only the numeric count marker. A child element or
        -- attribute that happens to be named `n` holds a table or a
        -- string and must survive the reduction.
        node.n = nil
    end
end
--- Handles an opening tag by registering a new element table.
-- The element is appended to the list stored under the tag name in
-- the table on top of the stack (the list is created on first use),
-- then pushed onto the stack itself.
-- @function p.handlers.Tree:starttag
-- @param {table} tag Tag data; the `name` and `attrs` fields are read.
function p.handlers.Tree:starttag(tag)
    local element = {}
    -- Attributes are kept only when `parseAttributes` is exactly `true`.
    if self.parseAttributes == true then
        element._attr = tag.attrs
    end

    local stack = self._stack
    -- Table in the stack representing the tag being processed.
    local container = stack[#stack]
    local siblings = container[tag.name]
    if siblings then
        table.insert(siblings, element)
    else
        container[tag.name] = { element, n = 1 }
    end

    table.insert(stack, element)
end
--- Handles a closing tag by popping the processed element off the stack.
-- When the container of the closed element is the root table, the
-- whole tree is reduced via @{p.handlers.Tree:reduce} before the pop.
-- @function p.handlers.Tree:endtag
-- @param {table} tag Tag data; only the `name` field is read.
-- @param {number} s Position value included in the error message
--        (presumably the start index of the match - confirm against
--        the parser).
-- @error Raises the `error-parsing-unmatched` message when there is no
--        container for the closed element, or when the container has
--        no child registered under the tag name.
function p.handlers.Tree:endtag(tag, s)
    local stack = self._stack
    -- Container for processed tag node in stack.
    local container = stack[#stack - 1]
    -- A closing tag with no enclosing container is reported as
    -- unmatched rather than failing on a nil index.
    if container == nil or not container[tag.name] then
        error(i18n:msg('error-parsing-unmatched', s .. ':' .. tag.name))
    end
    if container == self.root then
        -- Once parsing is complete, recursively reduce tree.
        self:reduce(container, nil, nil)
    end
    table.remove(stack)
end
--- Handles character data by appending it to the element on top of
-- the stack.
-- @function p.handlers.Tree:text
-- @param {string} text Text content to append.
function p.handlers.Tree:text(text)
    local stack = self._stack
    table.insert(stack[#stack], text)
end
--- Handles a CDATA section by appending it to the element on top of
-- the stack, wrapped in its `<![CDATA[` / `]]>` delimiters.
-- @function p.handlers.Tree:cdata
-- @param {string} section CDATA content without its delimiters.
function p.handlers.Tree:cdata(section)
    local stack = self._stack
    local wrapped = '<![CDATA[' .. section .. ']]>'
    table.insert(stack[#stack], wrapped)
end
--- Creates a fresh tree handler instance.
-- The instance state comes from `initTreeHandler`; methods are
-- inherited from the table this constructor is called on.
-- @function p.handlers.Tree:new
-- @param {table} options Handler options for parsing.
-- @param[opt] {table} options.noreduce Boolean map of tag
--        names whose child elements are never reduced, even
--        when there is only one child.
-- @return {Handler} Tree handler instance.
-- @constructor
function p.handlers.Tree:new(options)
    local handler = initTreeHandler(options)
    handler.__index = self
    return setmetatable(handler, self)
end
--- @{Handler} that traces parse events as they happen.
-- Messages are written to the Scribunto console while parsing,
-- usually for debugging purposes.
-- @table p.handlers.Print
do
    -- The prototype is its own `__index`, so instances created with
    -- this table as metatable inherit its methods.
    local Print = initPrintHandler()
    Print.__index = Print
    p.handlers.Print = Print
end
--- Records a trace message and writes it to the console.
-- The message, followed by a newline, is appended to the transcript
-- kept in `self.root`; the message itself is passed to `mw.log`.
-- @function p.handlers.Print:log
-- @param {string} message Message to record.
function p.handlers.Print:log(message)
    local transcript = self.root
    if transcript ~= nil then
        transcript = transcript .. message
    else
        -- First message: start the transcript.
        transcript = message
    end
    self.root = transcript

    mw.log(message)
    self.root = self.root .. '\n'
end
--- Logs an opening tag together with its attributes.
-- The tag name is logged on the first line, followed by one line per
-- attribute. Attributes are visited with `pairs`, so their order is
-- unspecified.
-- @function p.handlers.Print:starttag
-- @param {table} tag Tag data; the `name` and `attrs` fields are read.
-- @param[opt] {number} s Unused by this handler.
-- @param[opt] {number} e Unused by this handler.
function p.handlers.Print:starttag(tag, s, e)
    local message = 'Start : ' .. tag.name
    if tag.attrs then
        for k, v in pairs(tag.attrs) do
            message = message .. '\n' .. string.format(' + %s="%s"', k, v)
        end
    end
    self:log(message)
end
--- Logs a closing tag.
-- @function p.handlers.Print:endtag
-- @param {table} tag Tag data; only the `name` field is read.
-- @param[opt] {number} s Unused by this handler.
-- @param[opt] {number} e Unused by this handler.
function p.handlers.Print:endtag(tag, s, e)
    local message = 'End : ' .. tag.name
    self:log(message)
end
--- Logs a run of character data.
-- @function p.handlers.Print:text
-- @param {string} text Text content to log.
-- @param[opt] {number} s Unused by this handler.
-- @param[opt] {number} e Unused by this handler.
function p.handlers.Print:text(text, s, e)
    local message = 'Text : ' .. text
    self:log(message)
end
--- Logs a CDATA section.
-- @function p.handlers.Print:cdata
-- @param {string} text CDATA content to log.
-- @param[opt] {number} s Unused by this handler.
-- @param[opt] {number} e Unused by this handler.
function p.handlers.Print:cdata(text, s, e)
    local message = 'CDATA : ' .. text
    self:log(message)
end
--- Logs a comment.
-- @function p.handlers.Print:comment
-- @param {string} text Comment content to log.
-- @param[opt] {number} s Unused by this handler.
-- @param[opt] {number} e Unused by this handler.
function p.handlers.Print:comment(text, s, e)
    local message = 'Comment : ' .. text
    self:log(message)
end
--- Logs a DTD declaration together with its attributes.
-- Attributes are visited with `pairs`, so their order is unspecified.
-- @function p.handlers.Print:dtd
-- @param {table} tag Tag data; the `name` and `attrs` fields are read.
-- @param[opt] {number} s Unused by this handler.
-- @param[opt] {number} e Unused by this handler.
function p.handlers.Print:dtd(tag, s, e)
    -- Collect the header line and one line per attribute, then join.
    local lines = { 'DTD : ' .. tag.name }
    local attrs = tag.attrs
    if attrs then
        for attrName, attrValue in pairs(attrs) do
            lines[#lines + 1] = string.format(' + %s="%s"', attrName, attrValue)
        end
    end
    self:log(table.concat(lines, '\n'))
end
--- Logs a processing instruction together with its attributes.
-- Attributes are visited with `pairs`, so their order is unspecified.
-- @function p.handlers.Print:pi
-- @param {table} tag Tag data; the `name` and `attrs` fields are read.
-- @param[opt] {number} s Unused by this handler.
-- @param[opt] {number} e Unused by this handler.
function p.handlers.Print:pi(tag, s, e)
    -- Collect the header line and one line per attribute, then join.
    local lines = { 'PI : ' .. tag.name }
    local attrs = tag.attrs
    if attrs then
        for attrName, attrValue in pairs(attrs) do
            lines[#lines + 1] = string.format(' + %s="%s"', attrName, attrValue)
        end
    end
    self:log(table.concat(lines, '\n'))
end
--- Logs an XML declaration together with its attributes.
-- Attributes are visited with `pairs`, so their order is unspecified.
-- @function p.handlers.Print:decl
-- @param {table} tag Tag data; the `name` and `attrs` fields are read.
-- @param[opt] {number} s Unused by this handler.
-- @param[opt] {number} e Unused by this handler.
function p.handlers.Print:decl(tag, s, e)
    -- Collect the header line and one line per attribute, then join.
    local lines = { 'XML Decl : ' .. tag.name }
    local attrs = tag.attrs
    if attrs then
        for attrName, attrValue in pairs(attrs) do
            lines[#lines + 1] = string.format(' + %s="%s"', attrName, attrValue)
        end
    end
    self:log(table.concat(lines, '\n'))
end
--- Creates a fresh Print handler instance.
-- The instance state comes from `initPrintHandler`; methods are
-- inherited from the table this constructor is called on.
-- @function p.handlers.Print:new
-- @param {table} options Handler options for parsing.
-- @param[opt] {boolean} options.commentNode
--        Whether to include comment nodes. Default: `true`.
-- @param[opt] {boolean} options.piNode
--        Whether to include processing instruction nodes.
--        Default: `true`.
-- @param[opt] {boolean} options.dtdNode
--        Whether to include DTD declaration nodes. Default:
--        `true`.
-- @param[opt] {boolean} options.declNode
--        Whether to include XML declaration nodes. Default:
--        `true`.
-- @return {Handler} Print handler instance.
-- @constructor
function p.handlers.Print:new(options)
    local handler = initPrintHandler(options)
    handler.__index = self
    return setmetatable(handler, self)
end
-- Export the module table.
return p