#! /usr/bin/env python

# Copyright 2020 Steinar Knutsen
#
# Licensed under the EUPL, Version 1.2 or - as soon they will be approved by the
# European Commission - subsequent versions of the EUPL (the "Licence"); You may
# not use this work except in compliance with the Licence. You may obtain a copy
# of the Licence at:
#
# https://joinup.ec.europa.eu/collection/eupl/eupl-text-eupl-12
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the Licence is distributed on an "AS IS" basis, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# Licence for the specific language governing permissions and limitations under
# the Licence.

from collections import deque
from enum import Enum, auto
from functools import reduce
from operator import and_
import getopt
import html
import io
import sys

# Do note: Only curly brackets surrounded by whitespace need to be quoted, as
# the lexer is based on space, not character classes. This makes automatic
# quoting counterintuitively easy.

HELP_TEXT = """\
hcml2xhtml [-h] [-n] [-f FOOTER]

Read "Highly Compatible Markup Language" from STDIN and write XHTML to STDOUT.

-h          Output this text and exit.
-n          Automatically output numbering for section headers.
-f FOOTER   Add a paragraph containing FOOTER at the end of the generated
            document.
"""


class TokenType(Enum):
    """Kinds of token produced by :class:`Lexer`."""

    WORD = auto()
    WHITESPACE = auto()
    END_OF_STREAM = auto()


class Token:
    """A lexical token: a ``TokenType`` in ``kind`` and its text in ``content``."""

    def __init__(self, kind, content):
        self.kind = kind
        self.content = content


# \ escapes space and backslash
# escaped space is simply a word character
class Lexer:
    """Split a text stream into WORD and WHITESPACE tokens.

    Words are maximal runs of non-whitespace characters. A backslash makes
    the following character part of the word, whatever it is. A whitespace
    token is a run of one *identical* whitespace character, so ``" \\n\\n"``
    yields two tokens: ``" "`` and ``"\\n\\n"``.
    """

    def __init__(self, inputstream):
        """Wrap *inputstream*, an object offering ``read(1)`` and ``close()``."""
        self.tokenbuffer = deque()
        self.inputstream = inputstream
        self.inputbuffer = deque()

    def pushback(self, token):
        """Put a token back to the front of the stream of tokens"""
        # Tokens are taken from the right end, so the right end is the front.
        self.tokenbuffer.append(token)

    def next(self):
        """Return the next token and move the read pointer ahead"""
        token = self.next_w_space()
        while token.kind == TokenType.WHITESPACE:
            token = self.next_w_space()
        return token

    def next_w_space(self):
        """Return the next token, including whitespace"""
        # TODO just a next checking type
        self.__ensure_token()
        return self.tokenbuffer.pop()

    def close(self):
        """Close the input stream"""
        self.inputstream.close()

    def __next_char(self):
        """Return the next character, or '' at end of input."""
        if self.inputbuffer:
            return self.inputbuffer.pop()
        return self.inputstream.read(1)

    def __escaped_char(self):
        """Return the character following a backslash.

        Raises:
            ValueError: if the backslash is the last character of the input.
        """
        char = self.__next_char()
        if not char:
            # An explicit raise survives ``python -O``; an assert would not.
            raise ValueError("Dangling escape character at end of input")
        return char

    def __read_whitespace(self, firstchar):
        """Read a run of characters identical to *firstchar* as one token."""
        chars = [firstchar]
        while (char := self.__next_char()) == firstchar:
            chars.append(char)
        if char:
            # One character of lookahead was consumed; hand it back.
            self.inputbuffer.append(char)
        return Token(TokenType.WHITESPACE, "".join(chars))

    def __read_word(self, char):
        """Read a word starting with *char*, resolving backslash escapes."""
        chars = []
        while char and not char.isspace():
            if char == '\\':
                char = self.__escaped_char()
            chars.append(char)
            char = self.__next_char()
        if char:
            # The terminating whitespace belongs to the next token.
            self.inputbuffer.append(char)
        return Token(TokenType.WORD, "".join(chars))

    def __ensure_token(self):
        """Make sure the token buffer holds at least one token."""
        if self.tokenbuffer:
            return
        char = self.__next_char()
        if not char:
            token = Token(TokenType.END_OF_STREAM, '')
        elif char.isspace():
            token = self.__read_whitespace(char)
        else:
            token = self.__read_word(char)
        self.tokenbuffer.appendleft(token)


class ElementType(Enum):
    """Kinds of parsed element."""

    STRING = auto()
    LITERAL = auto()
    ATTRIBUTE = auto()
    PARAGRAPH = auto()
    ANCHOR = auto()
    HEADER = auto()
    MONOSPACE = auto()
    LIST = auto()
    DEFINED_TERM = auto()
    DEFINITION = auto()
    LIST_ITEM = auto()


# Element kinds that carry plain text rather than generated markup.
TEXT_ONLY = (ElementType.STRING, ElementType.LITERAL, ElementType.ATTRIBUTE)


class Element:
    """A parsed element.

    ``content`` is a :class:`Token` for LITERAL elements and a string for the
    other kinds handled in this module.
    """

    def __init__(self, kind, content):
        self.kind = kind
        self.content = content


def element_to_string(element):
    """Return *element* as text suitable for XHTML element content.

    LITERAL content is HTML-escaped (quotes left alone); STRING, ATTRIBUTE
    and ANCHOR content is returned unchanged.

    Raises:
        AssertionError: for any other element kind.
    """
    if element.kind == ElementType.LITERAL:
        return html.escape(element.content.content, False)
    if element.kind in (ElementType.STRING, ElementType.ATTRIBUTE,
                        ElementType.ANCHOR):
        return element.content
    raise AssertionError("Unexpected ElementType")


def element_to_attribute(element):
    """Return *element* as text suitable for a double-quoted attribute value.

    STRING content only has its double quotes replaced by ``&quot;``; LITERAL
    content is fully HTML-escaped including quotes; ATTRIBUTE content is
    returned unchanged.

    Raises:
        AssertionError: for any other element kind.
    """
    if element.kind == ElementType.STRING:
        return element.content.replace('"', '&quot;')
    if element.kind == ElementType.LITERAL:
        return html.escape(element.content.content, True)
    if element.kind == ElementType.ATTRIBUTE:
        return element.content
    raise AssertionError("Unexpected ElementType")


def is_whitespace_literal(element):
    """Return True if *element* is a LITERAL wrapping a WHITESPACE token."""
    return (element.kind == ElementType.LITERAL
            and element.content.kind == TokenType.WHITESPACE)


def normalize(elements, normalizer):
    """Convert *elements* to a list of strings.

    A whitespace literal becomes a single newline if its run starts with a
    newline, otherwise a single space. Every other element is passed through
    *normalizer*.
    """
    normalized = []
    for element in elements:
        if is_whitespace_literal(element):
            starts_with_newline = element.content.content[0] == '\n'
            normalized.append('\n' if starts_with_newline else ' ')
        else:
            normalized.append(normalizer(element))
    return normalized


def attributenormalise(elements):
    """Normalize *elements* for use inside an attribute value."""
    return normalize(elements, element_to_attribute)


def stringnormalize(elements):
    """Normalize *elements* for use as element content."""
    return normalize(elements, element_to_string)


def check_input(input_ok):
    """Raise ``ValueError("Unexpected input")`` unless *input_ok* is true."""
    if not input_ok:
        raise ValueError("Unexpected input")


def collapse_whitespace(lastspaceindex, elements):
    """Merge adjacent whitespace literals in place.

    Only positions up to and including *lastspaceindex* are examined, so
    whitespace runs further into *elements* are left untouched.
    """
    i = 0
    while i <= lastspaceindex and (i + 1) < len(elements):
        if not is_whitespace_literal(elements[i]):
            i += 1
            continue
        if is_whitespace_literal(elements[i + 1]):
            # Stay on i: the element now at i + 1 may be whitespace too.
            del elements[i + 1]
        else:
            i += 1


# NOTE(review): the original preamble text is not recoverable from this copy
# of the file; this is a conventional XHTML 1.0 Strict header. Confirm the
# intended doctype before release.
HEAD_TEXT = """\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
"""
# NOTE(review): the tag text in this file's string literals was lost in this
# copy. The markup below is reconstructed from the surrounding code (variable
# names such as ``dl``/``dt``/``pre``, the unused ``link``/``src`` locals) and
# should be confirmed against a pristine copy. The heading levels
# (title=h1, header=h2, subheader=h3) in particular are an assumption.
SEPARATE_HEAD_BODY = "</head>\n<body>\n"
END_HTML = "</body>\n</html>\n"


def _all_text(elements):
    """Return True if every element is of a TEXT_ONLY kind (True if empty)."""
    return all(e.kind in TEXT_ONLY for e in elements)


def _all_inline(elements):
    """Return True if every element is TEXT_ONLY or an anchor (True if empty)."""
    return all(e.kind in TEXT_ONLY or e.kind == ElementType.ANCHOR
               for e in elements)


class Converter(Lexer):
    """Parse HCML from a stream and render it as an XHTML document.

    Usage: construct, call :meth:`close` (which parses the whole input and
    closes the stream), then call :meth:`document_as_string`.
    """

    def __init__(self, inputfile, enumerate_headers=False, footertext=''):
        """Set up a converter reading from *inputfile*.

        Args:
            inputfile: stream passed on to :class:`Lexer`.
            enumerate_headers: prefix section headers with "major.minor ".
            footertext: ready-made markup written after the document body;
                ignored when empty.
        """
        super().__init__(inputfile)
        self.enumerate_headers = enumerate_headers
        self.major = 0
        self.minor = 0
        self.commands = {
            '|': self.paragraph,
            '||': self.concat,
            '_': self.concat_w_sep,
            '<': self.bra,
            '>': self.ket,
            'a': self.anchor,
            'H': self.header,
            'h': self.subheader,
            'm': self.monospace,
            'D': self.definitionlist,
            't': self.defined_term,
            'd': self.definition,
            'T': self.title,
            'O': self.orderedlist,
            'L': self.unorderedlist,
            '-': self.listitem,
            'i': self.image,
        }
        self.documentbuffer = []
        self.headerbuffer = []
        self.footerbuffer = []
        self.got_title = False
        if footertext:
            self.footerbuffer.append(footertext)

    def close(self):
        """Parse the entire input, then close the input stream."""
        self.__parse_stream()
        super().close()

    def document_as_string(self):
        """Return the complete XHTML document.

        Raises:
            ValueError: if no title command was seen in the input.
        """
        check_input(self.got_title)
        parts = [HEAD_TEXT]
        parts.extend(self.headerbuffer)
        parts.append(SEPARATE_HEAD_BODY)
        parts.extend(element.content for element in self.documentbuffer)
        parts.extend(self.footerbuffer)
        parts.append(END_HTML)
        return "".join(parts)

    def __parse_stream(self):
        """Read top-level commands until end of input.

        Raises:
            SyntaxError: if anything but a command appears at top level.
            ValueError: if a top-level command is not a block-level element.
        """
        top_level_kinds = (ElementType.PARAGRAPH, ElementType.HEADER,
                           ElementType.MONOSPACE, ElementType.LIST)
        token = self.next()
        while token.kind != TokenType.END_OF_STREAM:
            if token.content != "{":
                raise SyntaxError("Orphan data")
            element = self.resolve_command()
            check_input(element.kind in top_level_kinds)
            self.documentbuffer.append(element)
            token = self.next()

    def resolve_command(self):
        """Parse one command body (after its opening ``{``) and execute it.

        Nested commands are resolved recursively. The first whitespace token
        seen and all trailing whitespace are dropped from the argument list.

        Returns:
            The Element produced by the command handler.

        Raises:
            KeyError: if the command name is unknown.
            SyntaxError: if the input ends before the closing ``}``.
        """
        commandname = self.next()
        command = self.commands[commandname.content]
        elements = []
        first = True
        token = self.next_w_space()
        while not (token.kind == TokenType.WORD and token.content == '}'):
            if token.kind == TokenType.END_OF_STREAM:
                # The lexer keeps returning END_OF_STREAM forever, so without
                # this check an unterminated command would never return.
                raise SyntaxError("Unterminated command")
            if token.content == "{":
                elements.append(self.resolve_command())
            elif first and token.kind == TokenType.WHITESPACE:
                first = False
            else:
                elements.append(Element(ElementType.LITERAL, token))
            token = self.next_w_space()
        while elements and is_whitespace_literal(elements[-1]):
            del elements[-1]
        return command(elements)

    def _build_list(self, tag, elements, allowed_kinds):
        """Wrap the non-whitespace *elements* in ``<tag>``, one per line."""
        lines = [f'<{tag}>']
        for element in elements:
            if is_whitespace_literal(element):
                continue
            check_input(element.kind in allowed_kinds)
            lines.append(element.content)
        lines.append(f'</{tag}>\n')
        return Element(ElementType.LIST, "\n".join(lines))

    def paragraph(self, elements):
        """``|``: render text and anchors as a paragraph."""
        check_input(_all_inline(elements))
        body = "".join(stringnormalize(elements))
        return Element(ElementType.PARAGRAPH, f"<p>\n{body}\n</p>\n")

    def concat_w_sep(self, elements):
        """``_``: join text elements, keeping normalized whitespace."""
        check_input(_all_text(elements))
        return Element(ElementType.STRING, "".join(stringnormalize(elements)))

    def concat(self, elements):
        """``||``: join text elements, dropping the whitespace between them."""
        buffer = []
        for element in elements:
            # Don't accept tag types, otherwise it can hide tags.
            # This leads to a problem creating links from substrings, but I can
            # live with that.
            check_input(element.kind in TEXT_ONLY)
            if element.kind != ElementType.LITERAL:
                buffer.append(element.content)
            elif element.content.kind == TokenType.WORD:
                buffer.append(html.escape(element.content.content, False))
            # Whitespace literals are deliberately skipped.
        return Element(ElementType.STRING, "".join(buffer))

    def bra(self, elements):
        """``<``: produce a literal ``{``; takes no arguments."""
        check_input(len(elements) == 0)
        return Element(ElementType.STRING, '{')

    def ket(self, elements):
        """``>``: produce a literal ``}``; takes no arguments."""
        check_input(len(elements) == 0)
        return Element(ElementType.STRING, '}')

    def anchor(self, elements):
        """``a``: render a hyperlink from ``target`` followed by link text.

        Raises:
            ValueError: unless the arguments are text-only and consist of a
                target, whitespace and at least one element of link text.
        """
        check_input(_all_text(elements))
        collapse_whitespace(1, elements)
        # Check the length before indexing so short input is reported as
        # unexpected input rather than as an IndexError.
        check_input(len(elements) >= 3)
        check_input(is_whitespace_literal(elements[1]))
        link = element_to_attribute(elements[0])
        linktext = "".join(stringnormalize(elements[2:]))
        return Element(ElementType.ANCHOR,
                       f'<a href="{link}">{linktext}</a>')

    def header(self, elements):
        """``H``: render a section header and start a new major section."""
        check_input(_all_text(elements))
        text = "".join(stringnormalize(elements))
        self.major += 1
        self.minor = 0
        if self.enumerate_headers:
            text = f"{self.major}.{self.minor} " + text
        return Element(ElementType.HEADER, f'<h2>{text}</h2>\n')

    def subheader(self, elements):
        """``h``: render a subsection header within the current section."""
        check_input(_all_text(elements))
        text = "".join(stringnormalize(elements))
        self.minor += 1
        if self.enumerate_headers:
            text = f"{self.major}.{self.minor} " + text
        return Element(ElementType.HEADER, f'<h3>{text}</h3>\n')

    def monospace(self, elements):
        """``m``: render preformatted text; whitespace is kept verbatim."""
        check_input(_all_inline(elements))
        body = "".join(element_to_string(element) for element in elements)
        return Element(ElementType.MONOSPACE, f"<pre>{body}</pre>\n")

    def definitionlist(self, elements):
        """``D``: render a definition list of terms and definitions."""
        return self._build_list(
            'dl', elements,
            (ElementType.DEFINED_TERM, ElementType.DEFINITION))

    def defined_term(self, elements):
        """``t``: render a term inside a definition list."""
        check_input(_all_text(elements))
        body = "".join(stringnormalize(elements))
        return Element(ElementType.DEFINED_TERM, f"<dt>{body}</dt>")

    def definition(self, elements):
        """``d``: render a definition inside a definition list."""
        check_input(_all_inline(elements))
        body = "".join(stringnormalize(elements))
        return Element(ElementType.DEFINITION, f"<dd>{body}</dd>")

    def title(self, elements):
        """``T``: record the document title and render it as a heading."""
        check_input(_all_text(elements))
        text = "".join(stringnormalize(elements))
        self.got_title = True
        self.headerbuffer.append(f"<title>{text}</title>\n")
        return Element(ElementType.HEADER, f'<h1>{text}</h1>\n')

    def image(self, elements):
        """``i``: render an image from ``source``, ``alt`` and a caption.

        Raises:
            ValueError: unless the arguments are a text source, whitespace,
                a text alternative, whitespace and at least one caption
                element made of text or anchors.
        """
        collapse_whitespace(3, elements)
        # Check the length before indexing so short input is reported as
        # unexpected input rather than as an IndexError.
        check_input(len(elements) >= 5)
        check_input(elements[0].kind in TEXT_ONLY)
        check_input(is_whitespace_literal(elements[1]))
        check_input(elements[2].kind in TEXT_ONLY)
        check_input(is_whitespace_literal(elements[3]))
        check_input(_all_inline(elements[4:]))
        src = element_to_attribute(elements[0])
        alt = element_to_attribute(elements[2])
        fulltext = "".join(stringnormalize(elements[4:]))
        return Element(
            ElementType.PARAGRAPH,
            f'<p><img src="{src}" alt="{alt}" />\n<br />\n{fulltext}</p>\n')

    def orderedlist(self, elements):
        """``O``: render an ordered list of list items."""
        return self._build_list('ol', elements, (ElementType.LIST_ITEM,))

    def unorderedlist(self, elements):
        """``L``: render an unordered list of list items."""
        return self._build_list('ul', elements, (ElementType.LIST_ITEM,))

    def listitem(self, elements):
        """``-``: render one item of an ordered or unordered list."""
        check_input(_all_inline(elements))
        body = "".join(stringnormalize(elements))
        return Element(ElementType.LIST_ITEM, f"<li>{body}</li>")


def main():
    """Convert HCML on STDIN to XHTML on STDOUT according to ``sys.argv``."""
    try:
        options, arguments = getopt.gnu_getopt(sys.argv[1:], "hnf:")
    except getopt.GetoptError as error:
        sys.stderr.write(f"{error}\n{HELP_TEXT}")
        sys.exit(2)
    if arguments:
        # Positional arguments are not supported; input always comes from STDIN.
        sys.stderr.write(HELP_TEXT)
        sys.exit(2)
    enumerate_headers = False
    footertext = ''
    for option, value in options:
        if option == "-h":
            print(HELP_TEXT, end="")
            sys.exit(0)
        elif option == "-n":
            enumerate_headers = True
        elif option == "-f":
            footertext = "<p>{footer}</p>\n".format(footer=html.escape(value))
    converter = Converter(sys.stdin, enumerate_headers, footertext)
    converter.close()
    sys.stdout.write(converter.document_as_string())


if __name__ == "__main__":
    main()