--- spelling-stage-3.lua
--- Copyright 2012, 2013 Stephan Hennig
--
-- This work may be distributed and/or modified under the conditions of
-- the LaTeX Project Public License, either version 1.3 of this license
-- or (at your option) any later version.  The latest version of this
-- license is in http://www.latex-project.org/lppl.txt
-- and version 1.3 or later is part of all distributions of LaTeX
-- version 2005/12/01 or later.
--
-- See file README for more information.
--


--- Store the text of a LuaTeX document in a text document data
--- structure.
-- This module provides means to extract text from a LuaTeX document and
-- to store it in a text document data structure.
--
-- In the text document, words are stored as UTF-8 encoded strings.  A
-- mapping mechanism is provided by which, during word string
-- recognition, individual code-points, e.g., of glyph nodes, can be
-- translated to arbitrary UTF-8 strings, e.g., ligatures to single
-- letters.
--
-- @author Stephan Hennig
-- @copyright 2012, 2013 Stephan Hennig
-- @release version 0.41
--
-- @trick Prevent LuaDoc from looking past here for module description.
--[[ Trick LuaDoc into entering 'module' mode without using that command.
module(...)
--]]


-- Module table.
local M = {}


-- Import external modules.
local recurse = require('spelling-recurse')


-- Function short-cuts.
local recurse_node_list = recurse.recurse_node_list

local tabinsert = table.insert
local tabremove = table.remove


-- Short-cuts for constants.
local WHATSIT = node.id('whatsit')
local LOCAL_PAR = node.subtype('local_par')
local USER_DEFINED = node.subtype('user_defined')


-- Declare local variables to store references to resources that are
-- provided by external code.
--
-- Text document data structure.
local __text_document
--
-- ID of user-defined whatsit nodes marking the start of a word.
local __uid_start_tag
--
-- ID of user-defined whatsit nodes marking the end of a word.
local __uid_end_tag


--- Module options.
-- This table contains all module options.  User functions to set
-- options are provided.
--
-- @class table
-- @name __opts
-- @field table_par  When processing a table, when should paragraphs be
-- inserted into the text document?  The table handling code acts upon
-- the values 1 (hlists of subtype 4) and 2 (hlists of subtype 5); any
-- other value, including the default 0 set during module
-- initialisation, inserts no additional paragraphs.
--
-- The table starts out empty: field `table_par` is assigned through
-- `set_table_paragraphs`.  A positional entry `{ table_par, }` would
-- not declare a field, but evaluate a global variable of that name.
local __opts = {}


--- Set table behaviour.
-- Determine when paragraphs are inserted within tables.
--
-- @param value  New value.
local function set_table_paragraphs(value)
  __opts.table_par = value
end
M.set_table_paragraphs = set_table_paragraphs


--- Data structure that stores the word strings found in a node list.
-- The variable is `nil` as long as no word has been stored since the
-- last paragraph was finished.
--
-- @class table
-- @name __curr_paragraph
local __curr_paragraph


--- Act upon detection of end of current word string.
-- Append the string stored in field `value` of the given tag node to
-- the current paragraph.  A new paragraph is created first, if there is
-- no current paragraph.
--
-- @param n  String tag node.
local function __finish_current_word(n)
  -- Provide new empty paragraph, if necessary.
  __curr_paragraph = __curr_paragraph or {}
  -- Append current string to current paragraph.
  tabinsert(__curr_paragraph, n.value)
end


--- Act upon detection of end of current paragraph.
-- If there is a current paragraph, i.e., at least one word has been
-- stored since the last paragraph was finished, append it to the text
-- document and forget about it.
local function __finish_current_paragraph()
  -- Nothing to finish?
  if not __curr_paragraph then
    return
  end
  -- Append current paragraph to document structure.
  tabinsert(__text_document, __curr_paragraph)
  __curr_paragraph = nil
end


--- Paragraph management stack.
-- Stack of boolean flags, that are used for logging the occurrence of a
-- new paragraph within nested vlists.
local __is_vlist_paragraph


--- Paragraph management.
-- This function puts a new boolean flag onto a stack that is used to
-- log the occurrence of a new paragraph, while recursing into the
-- coming vlist.  After finishing recursing into the vlist, the flag
-- needs to be removed from the stack.  Depending on the flag, the then
-- current paragraph can be finished.
local function __vlist_pre_recurse()
  -- No paragraph start has been seen in the coming vlist, yet.
  tabinsert(__is_vlist_paragraph, false)
end


--- Paragraph management.
-- Remove flag from stack after recursing into a vlist.  If necessary,
-- finish the current paragraph.
local function __vlist_post_recurse()
  -- Pop the flag of the vlist that has just been left.  The flag is
  -- true, if a paragraph start was logged while it was on top of the
  -- stack.
  local saw_paragraph = tabremove(__is_vlist_paragraph)
  if saw_paragraph then
    __finish_current_paragraph()
  end
end


-- Subtypes of hlist nodes that are acted upon when handling tables.
-- NOTE(review): names follow the subtype descriptions used in the
-- documentation of `__handle_table`; verify the numbers against the
-- LuaTeX version in use.
local HLIST_ALIGNMENT = 4  -- alignment column or row
local HLIST_CELL = 5       -- alignment cell

-- Values of option `table_par` that are acted upon.
local TABLE_PAR_ROW = 1    -- new paragraph around alignment rows
local TABLE_PAR_CELL = 2   -- new paragraph around alignment cells


--- Handle table lines and cells.
-- Start a new paragraph before and after an hlist of subtype `alignment
-- column or row` or `alignment cell`, depending on option `table_par`.
-- The function is meant to be called both before and after recursing
-- into an hlist; in either case it finishes the current paragraph, if
-- subtype and option match.
--
-- @param n  hlist node.
local function __handle_table(n)
  local subtype = n.subtype
  local table_par = __opts.table_par
  local is_row = (subtype == HLIST_ALIGNMENT) and (table_par == TABLE_PAR_ROW)
  local is_cell = (subtype == HLIST_CELL) and (table_par == TABLE_PAR_CELL)
  if is_row or is_cell then
    __finish_current_paragraph()
  end
end


--- Find paragraphs and strings.
-- While scanning a node list, this call-back function finds nodes
-- representing the start of a paragraph (local_par whatsit nodes) and
-- string tags (user_defined whatsit nodes).
--
-- @param head  Head node of current branch.  Not used by this function.
-- @param n  The current node.
local function __visit_node(head, n)
  -- Only whatsit nodes are of interest.
  if n.id ~= WHATSIT then
    return
  end
  local subtype = n.subtype
  -- Test for word string tag.
  if (subtype == USER_DEFINED) and (n.user_id == __uid_end_tag) then
    __finish_current_word(n)
  -- Test for paragraph start.
  elseif subtype == LOCAL_PAR then
    __finish_current_paragraph()
    -- Log the paragraph start in the flag of the innermost vlist.  If
    -- the stack is empty, there is no flag to update; writing to index
    -- 0 would only leave a stray key in the stack table.
    local depth = #__is_vlist_paragraph
    if depth > 0 then
      __is_vlist_paragraph[depth] = true
    end
  end
end


--- Table of call-back functions for node list recursion: store the
--- word strings found in a node list.
-- The call-back functions in this table identify chains of nodes
-- representing word strings in a node list and stores the strings in
-- the text document.  A new paragraph is started at local_par whatsit
-- nodes and after finishing a vlist containing a local_par whatsit
-- node.  Nodes of type `hlist` are recursed into as if they were
-- non-existent.  As an example, the LaTeX input `a\mbox{a b}b` is
-- recognized as two strings `aa` and `bb`.
--
-- @class table
-- @name __cb_store_words
-- @field vlist_pre_recurse Paragraph management.
-- @field vlist_post_recurse Paragraph management.
-- @field hlist_pre_recurse Table management.
-- @field hlist_post_recurse Table management.
-- @field visit_node Find nodes representing paragraphs and words.
local __cb_store_words = {
  visit_node = __visit_node,
  hlist_pre_recurse = __handle_table,
  hlist_post_recurse = __handle_table,
  vlist_pre_recurse = __vlist_pre_recurse,
  vlist_post_recurse = __vlist_post_recurse,
}


--- Process node list according to this stage.
-- Recurse into the given node list with the word storing call-backs
-- and finish whatever paragraph is still open afterwards.
--
-- @param head  Node list.
local function __process_node_list(head)
  recurse_node_list(head, __cb_store_words)
  -- A paragraph may still be pending after the last node has been
  -- visited.
  __finish_current_paragraph()
end


-- Call-back status: text is stored only while this flag is true.
local __is_active_storage


--- Call-back function that processes the node list.
-- This function is not made available in the module table, but in
-- the global package table!
--
-- @param box  Box register index; the node list to process is looked
-- up in `tex.box`.
local function cb_AtBeginShipout(box)
  -- Do nothing while text storage is switched off.
  if not __is_active_storage then
    return
  end
  __process_node_list(tex.box[box])
end


--- Start storing text.
-- After calling this function, text is stored in the text document.
function M.enable_text_storage()
  __is_active_storage = true
end


--- Stop storing text.
-- After calling this function, no more text is stored in the text
-- document.
function M.disable_text_storage()
  __is_active_storage = false
end


--- Module initialisation.
-- Fetch shared resources from the global package table, publish the
-- shipout call-back there and reset all module state to its defaults.
local function __init()
  -- Get local references to package resources.
  local pkg = PKG_spelling
  local res = pkg.res
  __text_document = res.text_document
  local ids = res.whatsit_ids
  __uid_start_tag = ids.start_tag
  __uid_end_tag = ids.end_tag
  -- Make \AtBeginShipout function available in package table.
  pkg.cb_AtBeginShipout = cb_AtBeginShipout
  -- Start with an empty paragraph management stack.
  __is_vlist_paragraph = {}
  -- Text storage is switched off initially.
  __is_active_storage = false
  -- Default table paragraph behaviour.
  set_table_paragraphs(0)
end


-- Initialize module.
__init()


-- Return module table.
return M