diff --git a/.gitignore b/.gitignore index 8aa3d05..5663c89 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ public/ *.pdf *.jpeg *.svg +!test/1px.png diff --git a/README.md b/README.md new file mode 100644 index 0000000..9bdc0ff --- /dev/null +++ b/README.md @@ -0,0 +1,376 @@ +--- +language: en +highlight-style: native +--- + +# Formátítko 2.0 +A python program based on [pandoc](https://pandoc.org/) and its python library +[panflute](http://scorreia.com/software/panflute) for converting from markdown +to TeX and HTML with added fancy features like image processing, python-based +macros and much more. + +## Requirements +This project requires `panflute 2.3.0` that itself requires `pandoc 3.0`. If the +version of `pandoc` doesn't match, very weird things can happen. ImageMagick and +Inkscape are used for image processing. Nodejs is used for KaTeX. + +## Usage +``` +usage: formatitko.py [-h] [-l IMG_LOOKUP_DIRS [IMG_LOOKUP_DIRS ...]] [-p IMG_PUBLIC_DIR] [-i IMG_WEB_PATH] [-w OUTPUT_HTML] [-t OUTPUT_TEX] input_filename + +positional arguments: + input_filename The markdown file to process. + +options: + -h, --help show this help message and exit + -l IMG_LOOKUP_DIRS [IMG_LOOKUP_DIRS ...], --img-lookup-dirs IMG_LOOKUP_DIRS [IMG_LOOKUP_DIRS ...] + Image lookup directories. When processing images, the program will try to find the image in them first. Always looks for images in the same folder as the markdown + file. (default: []) + -p IMG_PUBLIC_DIR, --img-public-dir IMG_PUBLIC_DIR + Directory to put processed images into. The program will not overwrite existing images. (default: public) + -i IMG_WEB_PATH, --img-web-path IMG_WEB_PATH + Path where the processed images are available on the website. (default: /) + -w OUTPUT_HTML, --output-html OUTPUT_HTML + The HTML file (for Web) to write into. (default: output.html) + -t OUTPUT_TEX, --output-tex OUTPUT_TEX + The TEX file to write into. 
(default: output.tex) +``` + +## Format +Formátítko uses all the default pandoc markdown extensions except for +definition lists and citations. It also adds its own custom features. + +## Features + +### Hiding and showing elements based on flags + +Flags can be set in the Front Matter or with python code. Then, elements with +the `if` attribute will only be shown if the flag is set to True and elements +with the `ifn` attribute will only be shown if the flag is not set to True. + +**Example:** + +```markdown {.group} +--- +flags: + foo: true +--- +[This will be shown]{if=foo} + +[This will not be shown]{if=bar} + +[This will be shown]{ifn=bar} +``` + +### Including other files + +There are two ways of including files. + +#### Importing +The first is importing, which only takes the state (defined commands, metadata, +etc.) from the file and any content is omitted. This is useful for creating +libraries of commands. The syntax is as follows: + +[#test/empty.md]{} + +The curly braces are required for pandoc to parse the import properly and should +be left empty. + +#### Partials +Partials are the very opposite of imports, they have their own context, which +inherits everything from the context they're included in, but gets reset after +the file ends. + +:::{partial=test/empty.md} +::: + +If the `untrusted` attribute is set to True, the partial and all its children +will not be able to define commands or run inline blocks (but it will be able to +run commands defined in the parent). ^[Please don't trust this for any security +though, we're playing with *eval* fire, this will never be secure.] + +You can also import raw HTML and TeX if you set the `type` attribute of the +partial to `tex` or `html`. + +### Groups + +Groups are pieces of markdown with their own sandboxed context, in other words, +inline partials. They function exactly the same as partials, namely can have +their own front matter. 
+ +```markdown {.group} +--- +language: cs +--- +OOOoo český mód +``` + +If you need to nest groups or have code blocks inside groups, you can increase +the amount of backticks around the outer block: + +````markdown {.group} +```go +fmt.Pritln("owo") +``` +```` + +Groups and partials are also enclosed in `\begingroup` and `\endgroup` in the +output TeX. + +### Raw HTML and TeX ^[This is a pandoc feature] +If raw HTML or TeX is included in the markdown file, it will automagically pop +out into the respective output file. + +red text + +\vskip1em + +This has the advantage and disadvantage of being very *"automagic"*, which means +that for instance markdown inside HTML will still get interpreted as markdown. +It is however very very unreliable, so in most cases, you should use explicit +raw blocks with the unnamed attribute set to either `html` or `tex`. ^[Still a +pandoc feature.] + +``` {=html} +red text +``` + +### Running python code + +Formátítko allows you to run Python code directly from your MD file. Any +`python` code block with the class `run` will be executed: + +#### Context + +You can access the current context using the `ctx` variable. The context +provides read/write access to the FrontMatter metadata. The context has the +following methods: + +`ctx.get_metadata(key: str, simple: bool=True, immediate: bool=False)` + +- `key`: The key of the metadatum you want to get. Separate child keys with + dots: `ctx.get_metadata("flags.foo")` +- `simple`: Whether to use python's simple builtin types or panflute's + MetaValues. MetaValues can contain formatted text, simple values lose all + formatting. +- `immediate`: Only get metadatum from the current context, not from its + parents. + +`ctx.set_metadata(key: str, value)` + +- `key`: The key of the metadatum you want to set. 
Separate child keys with + dots: `ctx.set_metadata("flags.foo", True)` +- `value`: Any value you want to assign to the metadatum + +`ctx.unset_metadata(key: str)` + +Delete the metadatum in the current context and allow it to inherit the value +from the parent context. + +- `key`: The key of the metadatum you want to unset. Separate child keys with + dots: `ctx.unset_metadata("flags.foo")` + +Helper functions for flags exist which work the same as for metadata: + +`ctx.is_flag_set(flag: str) -> bool` + +`ctx.set_flag(flag: str, val: bool)` + +`ctx.unset_flag(flag: str)` + +#### Writing output + +There are two modes of writing output, plaintext and element-based. + +Plaintext mode uses the `print(text: str)` and `println(text: str)` functions, +that append text to a buffer which is then interpreted as markdown input. + +Element-based mode uses the `appendChild(element: pf.Element)` and +`appendChildren(*elements: List[pf.Element])` functions which allow you to +append `panflute` elements to a list which is then again interpreted as input. +The `panflute` library is available as `pf`. + +When one of these functions is called, the mode is set and functions from the +other mode cannot be called within the same block of code. + +**Examples:** + +````markdown {.group} +--- +title: Foo +--- +```python {.run} +println("*wooo*") +println() +println("The title of this file is: " + ctx.get_metadata("title")) +``` +```` + +```python {.run} +appendChild(pf.Para(pf.Strong(pf.Str("foo")))) +``` + +### Defining and running commands + +Code blocks can also be saved and executed later. Defining is done using the +`define` attribute: + +**Example:** + +```python {define=commandname} +print("foo") +``` + +If you try to define the same command twice, you will get an error. To redefine +a command, use the `redefine` attribute instead of `define`. + +### Running defined commands + +There are multiple ways of running commands. 
There is the shorthand way: + +[!commandname]{} + +Or using the `c` attribute on a span or a div: + +[Some content]{c=commandname} + +:::{c=commandname} +Some content +::: + +To access the content or attributes of the div or span the command has been +called on, the `element` variable is available, which contains the `panflute` +representation of the element. + +**Example:** + +```python {define=index} +appendChild(element.content[int(element.attributes["i"])]) +``` + +[Pick the third element from this span]{c=index i=2} + +### Direct metadata print +Metadata can be printed directly using a shorthand. The advantage of this is it +keeps the formatting from the metadatum's definition + +```markdown {.group} +--- +a: + b: some text with **strong** +--- +[$a.b]{} +``` + +### Syntax highlighting +Formátítko uses [pygments](https://pygments.org/) to highlight syntax in code +blocks. To turn it off for a single block, don't specify a language or set the +`highlight` attribute to `False`. You can also set the metadatum `highlight` to +`false` in the FrontMatter to disable it in a given Group. To change the [highlighting +style](https://pygments.org/styles/), you have to set the `highlight-style` +metadatum in the **top-level document** this is to prevent the need for many +inline style definitions. + +**Examples:** +```python +print("cool") +``` + +```zsh {highlight=False} +./formatitko.py README.md +``` + +### Language awareness +Formátítko is language aware, this means that the `language` metadatum is +somewhat special. When set using the front matter, it is also popped out to TeX +as a `\languagexx` macro. Currently supported values are `cs` and `en` for +internal uses but can be set to anything. + +### NBSP +Formátítko automatically inserts no-break spaces according to its sorta smart +rules. (See the `whitespace.py` file for more info) These rules **depend on the +chosen language**. 
(`cs` has some additional rules) + +To insert a literal no-break space, you can either insert the unicode no-break +space or use the html escape. + +Enforcing a breakable space is not as painless, you should insert a​ zero-width +space beside the normal​ space. + +### Smart quotes +Quotes get automatically converted to the slanted ones according to the current +language. + +**Examples:** + +```markdown {.group} +--- +language: cs +--- +"Uvozovky se v českém testu píší 'jinak' než v angličtině." +``` + +"In Czech texts, quotes are written 'differently' than in English" + +### Math +Math blocks get automatically converted to HTML using $Ka\TeX$ and fall out +unchanged into TeX output. + +To make KaTeX as consistent with TeX as possible, the `\begingroup` and +`\endgroup` that are produced by [Groups](#groups) are also emulated in the +KaTeX environment, so macro definitions should be isolated as you expect. + +### Images + +#### Figures +Pandoc's [implicit +figures](https://pandoc.org/MANUAL.html#extension-implicit_figures) are enabled, +so images which are alone in a paragraph are automatically converted to figures: + +![A single pixel image, wow!](test/1px.png "This is the alt text shown to screen readers (it defaults to the caption)"){width=10em} + +To prevent this, add a backslash at the end of the line with the image: + +![A single pixel image, wow!](test/1px.png "This is the alt text shown to screen readers"){width=10em}\ + +#### Image gathering +Images are automatically searched for in the directory where each markdown file is +(including partials) and also in directories listed in the `--lookup-dirs` +command line parameter. After processing, they're all put into the folder +specified with `--public-dir`. + +#### Image processing +Images are automatically processed so that they can be successfully used in both +output formats. 
This includes generating multiple sizes and providing a +[srcset](https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images). + +To customize this, the `file-width`, `file-height`, `file-dpi`, `file-quality` +and `no-srcset` attributes are available. All but the last one should be +integers. + +Keep in mind that the processing tries to be as lazy as possible, so it never +overwrites any files and if it finds the right format or resolution (only +judging by the filenames) in the lookup directories it will just copy that. This +means that any automatic attempts at conversion can be overridden by converting +the file yourself, naming it accordingly and placing it either in the public or +one of the lookup directories. + +## Working with the produced output + +### HTML +The HTML should be almost usable as-is. The styles for syntax highlighting are +added automatically. The styles for KaTeX however are not and should be added in +your `<head>`^[This is taken directly from [KaTeX's docs](https://katex.org/docs/browser.html)]: + +```html + +``` + +Also the output HTML is not intended as a standalone file but should be included +as part of a larger template. (That includes a doctype, other css, etc.) + +### TeX +The TeX output is not usable as is. Many of the elements are just converted to +macros, which you have to define yourself. There is an example implementation in +`formatitko.tex`, which uses LuaTeX and the ucwmac package, but you should +customize it to your needs (and to the context in which the output is used). 
diff --git a/command.py b/command.py index 358d4a2..1128a17 100644 --- a/command.py +++ b/command.py @@ -9,6 +9,8 @@ from mj_show import show class Command: pass +# This distinction is needed because while transforming the tree, inline +# elements cannot be replaced with block ones class InlineCommand(Span, Command): def replaceSelf(self, content: List[Element]) -> Span: try: @@ -25,7 +27,8 @@ class BlockCommand(Div, Command): return Div(*content) pass - +# This function is called in transform.py, defining a command which can be +# called later using the function below def handle_command_define(e: Element, c: Context): if "define" in e.attributes: if not c.get_command(e.attributes["define"]): @@ -38,6 +41,23 @@ def handle_command_define(e: Element, c: Context): return nullify(e) return e +# This function executes commands and inline runnable code blocks (see +# transform.py for their syntax). Context can be accessed using `ctx` and there +# are four functions available to create output from these commands and the +# element the command has been called on (including its .content) can be +# accessed using `element`. Arguments can be passed down to the command using +# the element's attributes. +# +# print and println append text to a buffer which is then interpreted as +# markdown with the current context. +# +# appendChild and appendChildren append panflute elements to a list which is +# then transformed. A command which does nothing looks like this: +# ```python {define=nop} +# appendChildren(element.content) +# ``` +# +# These two types, appending and printing, cannot be mixed. 
def executeCommand(source, element: Element, ctx: Context) -> List[Element]: mode = 'empty' @@ -64,6 +84,7 @@ def executeCommand(source, element: Element, ctx: Context) -> List[Element]: for e in l: appendChild(e) + import panflute as pf exec(source) if mode == 'text': diff --git a/context.py b/context.py index 89b8b9b..2f48823 100644 --- a/context.py +++ b/context.py @@ -1,12 +1,26 @@ -from panflute import Doc +from panflute import Doc, Div +from typing import Dict import os + +# This class is used to keep state while transforming the document using +# transform.py. For the context to be available to the html and TeX generators, +# individual keys must be manually assigned to the individual elements. This is +# done in transform.py. +# +# The context is also aware of its parent contexts and relevant data (such as +# metadata and commands) can be read from the closest parent context. Writing +# only happens to the current one. +# +# This class is basically an extension to panflute's doc, this is why metadata +# is read directly from it. class Context: - def __init__(self, doc: Doc, path: str, parent: 'Context'=None): + def __init__(self, doc: Doc, path: str, parent: 'Context'=None, trusted: bool=True): self.parent = parent self._commands = {} self.doc = doc + self.trusted = trusted self.path = path self.dir = os.path.dirname(path) if os.path.dirname(path) != "" else "." 
self.filename = os.path.basename(path) @@ -41,10 +55,10 @@ class Context: def set_flag(self, flag: str, val: bool): self.set_metadata("flags."+flag, val) - def unset_flag(self, flag): + def unset_flag(self, flag: str): self.unset_metadata("flags."+flag) - def get_metadata(self, key, simple=True, immediate=False): + def get_metadata(self, key: str, simple: bool=True, immediate: bool=False): value = self.doc.get_metadata(key, None, simple) if value is not None: return value @@ -53,7 +67,7 @@ class Context: else: return None - def set_metadata(self, key, value): + def set_metadata(self, key: str, value): if key == "language": print("WARN: Setting language this way doesn't propagate to TeX. Either use the Front Matter or specify it additionally using the \\languagexx macro.") meta = self.doc.metadata @@ -62,10 +76,19 @@ class Context: meta = meta[k] meta[key[-1]] = value - def unset_metadata(self, key): + def unset_metadata(self, key: str): meta = self.doc.metadata key = key.split(".") for k in key[:-1]: meta = meta[k] del meta.content[key[-1]] # A hack because MetaMap doesn't have a __delitem__ + +# This is a custom element which creates \begingroup \endgroup groups in TeX +# and also causes KaTeX math blocks to be isolated in a similar way. +# +# Whenever a new context is created, its content should be enclosed in a group and vice-versa. 
+class Group(Div): + def __init__(self, *args, metadata={}, **kwargs): + self.metadata = metadata + super().__init__(*args, **kwargs) diff --git a/formatitko.py b/formatitko.py index 94dfc04..5d50bd8 100755 --- a/formatitko.py +++ b/formatitko.py @@ -9,8 +9,7 @@ import os # Import local files from transform import transform from util import * -from context import Context -from group import Group +from context import Context, Group from katex import KatexClient from html import html from tex import tex @@ -18,26 +17,46 @@ from images import ImageProcessor from mj_show import show -parser = argparse.ArgumentParser() -parser.add_argument("-l", "--img-lookup-dirs", help="Image lookup directories. When processing images, the program will try to find the image in them first. By default contains the directory of each MarkDown file.", nargs="+", default=[]) -parser.add_argument("-p", "--img-public-dir", help="Directory to put processed images into. The program will not overwrite existing images.", nargs=1, default="public") -parser.add_argument("-i", "--img-web-path", help="Path where the processed images are available on the website.", nargs=1, default="/") -parser.add_argument("-w", "--output-html", help="The HTML file (for Web) to write into.", nargs=1, default="output.html") -parser.add_argument("-t", "--output-tex", help="The TEX file to write into.", nargs=1, default="output.tex") -parser.add_argument("input_filename", help="The MarkDown file to process.") +# Initialize command line arguments +parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("-l", "--img-lookup-dirs", help="Image lookup directories. When processing images, the program will try to find the image in them first. Always looks for images in the same folder as the markdown file.", nargs="+", default=[]) +parser.add_argument("-p", "--img-public-dir", help="Directory to put processed images into. 
The program will not overwrite existing images.", default="public") +parser.add_argument("-i", "--img-web-path", help="Path where the processed images are available on the website.", default="/") +parser.add_argument("-w", "--output-html", help="The HTML file (for Web) to write into.", default="output.html") +parser.add_argument("-t", "--output-tex", help="The TEX file to write into.", default="output.tex") +parser.add_argument("input_filename", help="The markdown file to process.") +parser.add_argument("--debug", action='store_true') args = parser.parse_args() +# TODO: Accept path to unix socket for katexClient, then don't init our own, +# just connect to an existing one. For formátíking many files in a row. +# Use panflute to parse the input MD file doc = import_md(open(args.input_filename, "r").read()) +if args.debug: + print(show(doc)) + +# The language metadatum is important, so it's read before transformation and +# then attached to a group inside the Doc language = doc.get_metadata("language", None, True) context = Context(doc, args.input_filename) +# Transform the document. This includes all the fancy formatting this software does. 
doc = doc.walk(transform, context) +# Now wrap the document contents in a group, which is able to pop its language +# setting out to TeX doc.content = [Group(*doc.content, metadata={"language":language})] +# Initialize KaTeX client (this runs the node app and connects to a unix socket) katexClient = KatexClient() +# Initialize the image processor (this just keeps some basic state) imageProcessor = ImageProcessor(args.img_public_dir, args.img_web_path, *args.img_lookup_dirs) +# Generate HTML and TeX out of the transformed document open(args.output_html, "w").write(html(doc, katexClient, imageProcessor)) open(args.output_tex, "w").write(tex(doc, imageProcessor)) + +if args.debug: + print(show(doc)) + diff --git a/formatitko.tex b/formatitko.tex index a5de47d..e48135a 100644 --- a/formatitko.tex +++ b/formatitko.tex @@ -21,24 +21,19 @@ \fncount=1 \def\fnmark{\superscript{\the\fncount}} \def\fn#1{\footnote\fnmark{#1}\advance\fncount by 1} - \def\hA#1{{\parskip1em\settextsize{14}\bf #1}} \def\hB#1{{\parskip1em\settextsize{12}\bf #1}} \def\hC#1{{\parskip1em\settextsize{10}\bf #1}} +\def\hD#1{{\parskip1em\settextsize{10}\bi #1}} \def\hr{{\vskip5pt\hrule\vskip5pt}} \long\def\blockquote#1{\vskip\lineskip\vskip\parskip\hbox{\vrule\hskip5pt\vbox{#1}}} -\def\code#1{{\tt #1}} +\let\code\verbatim \let\codeblock\verbatim \def\subscript#1{\leavevmode\lower1pt\hbox{\fiverm#1}} \def\strikeout#1{FIXME: Strikeout not implemented} \def\underline#1{FIXME: Underline not implemented} \def\figure#1#2{\vskip5pt\centerline{#1}\centerline{#2}\vskip5pt} -\def\caption#1{{\it #1}} +\def\figcaption#1{{\it #1}} \let\image\putimage \def\languagecs{} % KSP should define this to \cze probably \def\languageen{} % KSP should define this to \eng probably -\def\table#1{#1} -\def\tablebody#1{#1} -\def\tablerow#1{#1} -\def\tablehead#1{#1} -\def\tablecell#1{#1} diff --git a/group.py b/group.py deleted file mode 100644 index 4e1a203..0000000 --- a/group.py +++ /dev/null @@ -1,8 +0,0 @@ -from panflute import 
Block -from typing import Dict - -class Group(Block): - def __init__(self, *args, identifier='', classes=[], attributes={}, metadata={}): - self._set_ica(identifier, classes, attributes) - self._set_content(args, Block) - self.metadata = metadata diff --git a/html.py b/html.py index 11dde85..17267ed 100644 --- a/html.py +++ b/html.py @@ -9,17 +9,21 @@ from whitespace import NBSP from transform import FQuoted from katex import KatexClient from util import inlinify -from group import Group +from context import Group from images import ImageProcessor def html(e: Element, k: KatexClient, i: ImageProcessor, indent_level: int=0, indent_str: str="\t") -> str: + # `only` attribute which makes transformed elements appear only in tex + # output or html output if hasattr(e, "attributes") and "only" in e.attributes and e.attributes["only"] != "html": return "" if isinstance(e, ListContainer): return ''.join([html(child, k, i, indent_level, indent_str) for child in e]) + # Bits from which the final element output is built at the end of this + # function. Most elements override this by returning their own output. tag = e.tag.lower() attributes = "" content_foot = "" @@ -28,6 +32,8 @@ def html(e: Element, k: KatexClient, i: ImageProcessor, indent_level: int=0, ind if isinstance(e, Str): return e.text.replace(" ", " ") + # Most elements fit the general template at the end of the function, just + # need their html tag specified. tags = { BulletList: "ul", Doc: "main", @@ -51,6 +57,7 @@ def html(e: Element, k: KatexClient, i: ImageProcessor, indent_level: int=0, ind if type(e) in tags: tag = tags[type(e)] + # These are also disabled in pandoc so they shouldn't appear in the AST at all. 
not_implemented = { Citation: True, Cite: True, @@ -61,6 +68,7 @@ def html(e: Element, k: KatexClient, i: ImageProcessor, indent_level: int=0, ind if type(e) in not_implemented: return f'' + # Elements which can be represented by a simple string simple_string = { NBSP: " ", Space: " ", @@ -78,44 +86,68 @@ def html(e: Element, k: KatexClient, i: ImageProcessor, indent_level: int=0, ind if hasattr(e, "classes") and len(e.classes) != 0: attributes += f' class="{" ".join(e.classes)}"' - # TODO: Pass attributes down to HTML too + # Attributes are only passed down manually, because we use them internally. + # Maybe this should be a blocklist instead of an allowlist? + # Overriding elements with their own returns if isinstance(e, CodeBlock): - if e.attributes["highlight"] == True or e.attributes["highlight"] == 'True': + if len(e.classes) > 0 and (e.attributes["highlight"] == True or e.attributes["highlight"] == 'True'): + # Syntax highlighting using pygments for cl in e.classes: try: lexer = get_lexer_by_name(cl) except ClassNotFound: continue break + else: + print(f"WARN: Syntax highligher does not have lexer for element with these classes: {e.classes}") formatter = HtmlFormatter(style=e.attributes["style"]) result = highlight(e.text, lexer, formatter) - style = formatter.get_style_defs(".highlight") - return f'{result}' - + return f'{result}' else: return f'
{e.text}
' - if isinstance(e, Figure): - content_foot = html(e.caption, k, i, indent_level+1, indent_str) - - if isinstance(e, Caption): - tag = "figcaption" + if isinstance(e, Doc): + formatter = HtmlFormatter(style=e.get_metadata("highlight-style") if e.get_metadata("highlight-style") is not None else "default") + content_head = f'' if isinstance(e, Image): url = e.url + + # Attributes → image processor args + additional_args = {} + if "file-width" in e.attributes: + additional_args["width"] = int(e.attributes["file-width"]) + if "file-height" in e.attributes: + additional_args["height"] = int(e.attributes["file-height"]) + if "file-quality" in e.attributes: + additional_args["quality"] = int(e.attributes["file-quality"]) + if "file-dpi" in e.attributes: + additional_args["dpi"] = int(e.attributes["file-dpi"]) + + # The directory of the current file, will also look for images there. source_dir = e.attributes["source_dir"] + _, ext = os.path.splitext(url) ext = ext[1:] + + # Conversions between various formats. if ext in ["svg", "png", "jpeg", "gif"]: - url = i.process_image(url, ext, source_dir) + # Even supported elements have to be 'converted' because the + # processing contains finding and moving them to the output + # directory. + url = i.process_image(url, ext, source_dir, **additional_args) elif ext in ["pdf", "epdf"]: - url = i.process_image(url, "png", source_dir, dpi=300) + if not "dpi" in additional_args: + additional_args["dpi"] = 300 + url = i.process_image(url, "png", source_dir, **additional_args) elif ext in ["jpg"]: - url = i.process_image(url, "jpeg", source_dir) + url = i.process_image(url, "jpeg", source_dir, **additional_args) else: - url = i.process_image(url, "png", source_dir) + url = i.process_image(url, "png", source_dir, **additional_args) + # Srcset generation - multiple alternative sizes of images browsers can + # choose from. 
_, ext = os.path.splitext(url) ext = ext[1:] srcset = [] @@ -123,14 +155,14 @@ def html(e: Element, k: KatexClient, i: ImageProcessor, indent_level: int=0, ind # This is inspired by @vojta001's blogPhoto shortcode he made for # patek.cz: # https://gitlab.com/patek-devs/patek.cz/-/blob/master/themes/patek/layouts/shortcodes/blogPhoto.html - width, height = i.get_image_size(url, [source_dir, i.public_dir]) + width, height = i.get_image_size(url, [i.public_dir]) sizes = [(640, 360, 85), (1280, 720, 85), (1920, 1080, 90)] # (widht, height, quality) for size in sizes: if width <= size[0] and height <= size[1]: srcset.append((f'{i.web_path}/{url}', f'{width}w')) break quality = size[2] if ext == "jpeg" else None - srcset.append((f'{i.web_path}/{i.process_image(url, ext, source_dir, width=size[0], height=size[1], quality=quality)}', f'{size[0]}w')) + srcset.append((f'{i.web_path}/{i.process_image(url, ext, i.public_dir, width=size[0], height=size[1], quality=quality)}', f'{size[0]}w')) url = i.web_path + "/" + url @@ -140,58 +172,17 @@ def html(e: Element, k: KatexClient, i: ImageProcessor, indent_level: int=0, ind else: return f'' - if isinstance(e, Header): - tag = "h"+str(e.level) - - if isinstance(e, Link): - tag = "a" - attributes += f' href="{e.url}"' - if e.title: - attributes += f' title="{e.title}"' - + # See https://pandoc.org/MANUAL.html#line-blocks if isinstance(e, LineItem): return indent_level*indent_str + html(e.content, k, i) + "
\n" + # Footnotes are placed into parentheses. (And not footnotes (This is how KSP did it before me)) if isinstance(e, Note): content_head = "(" content_foot = ")" if inlinify(e) is not None: return f' ({html(inlinify(e), k, i, 0, "")})' - if isinstance(e, OrderedList): - tag = "ol" - if e.start and e.start != 1: - attributes += f' start="{e.start}"' - html_styles = { - "Decimal": "1", - "LowerRoman": "i", - "UpperRoman:": "I", - "LowerAlpha": "a", - "UpperAlpha": "A" - } - if e.style and e.style != "DefaultStyle": - attributes += f' type="{html_styles[e.style]}"' - # FIXME: Delimeter styles - - if isinstance(e, Table): - content_head = html(e.head, k, i, indent_level+1, indent_str) - content_foot = html(e.foot, k, i, indent_level+1, indent_str) - # FIXME: Fancy pandoc tables, using colspec - - if isinstance(e, TableCell): - tag = "td" - if e.colspan != 1: - attributes += f' colspan="{e.colspan}"' - if e.rowspan != 1: - attributes += f' rowspan="{e.rowspan}"' - aligns = { - "AlignLeft": "left", - "AlignRight": "right", - "AlignCenter": "center" - } - if e.alignment and e.alignment != "AlignDefault": - attributes += f' style="text-align: {aligns[e.alignment]}"' - if isinstance(e, FQuoted): if e.style == "cs": if e.quote_type == "SingleQuote": @@ -222,9 +213,6 @@ def html(e: Element, k: KatexClient, i: ImageProcessor, indent_level: int=0, ind "DisplayMath": True, "InlineMath": False } - # FIXME: Currently, all bits of math are isolated from each other, this - # means that \defs and and alike work only inside a single math block - # and are forgotten in the next one. 
return indent_level*indent_str + k.render(e.text, {"displayMode": formats[e.format]}) if isinstance(e, RawInline): @@ -239,6 +227,62 @@ def html(e: Element, k: KatexClient, i: ImageProcessor, indent_level: int=0, ind else: return "" + + # Non-overriding elements, they get generated using the template at the end + # of this function + if isinstance(e, Header): + tag = "h"+str(e.level) + + if isinstance(e, Figure): + content_foot = html(e.caption, k, i, indent_level+1, indent_str) + + if isinstance(e, Caption): + tag = "figcaption" + + if isinstance(e, Link): + tag = "a" + attributes += f' href="{e.url}"' + if e.title: + attributes += f' title="{e.title}"' + + if isinstance(e, OrderedList): + tag = "ol" + if e.start and e.start != 1: + attributes += f' start="{e.start}"' + html_styles = { + "Decimal": "1", + "LowerRoman": "i", + "UpperRoman:": "I", + "LowerAlpha": "a", + "UpperAlpha": "A" + } + if e.style and e.style != "DefaultStyle": + attributes += f' type="{html_styles[e.style]}"' + # FIXME: Delimeter styles + + if isinstance(e, Table): + content_head = html(e.head, k, i, indent_level+1, indent_str) + content_foot = html(e.foot, k, i, indent_level+1, indent_str) + # FIXME: Fancy pandoc tables, using colspec + + if isinstance(e, TableCell): + tag = "td" + if e.colspan != 1: + attributes += f' colspan="{e.colspan}"' + if e.rowspan != 1: + attributes += f' rowspan="{e.rowspan}"' + aligns = { + "AlignLeft": "left", + "AlignRight": "right", + "AlignCenter": "center" + } + if e.alignment and e.alignment != "AlignDefault": + attributes += f' style="text-align: {aligns[e.alignment]}"' + + # The default which all non-overriding elements get generated by. This + # includes elements, which were not explicitly mentioned in this function, + # e. g. 
Strong + if isinstance(e, Inline): return f'<{tag}{attributes}>{content_head}{html(e.content, k, i, 0, "") if hasattr(e, "_content") else ""}{e.text if hasattr(e, "text") else ""}{content_foot}' diff --git a/images.py b/images.py index 132912a..aee0b4a 100644 --- a/images.py +++ b/images.py @@ -2,6 +2,7 @@ from typing import List import os import shutil import subprocess +from PIL import Image class ImageProcessor: def __init__(self, public_dir: str, web_path: str, *lookup_dirs: List[str]): @@ -18,7 +19,8 @@ class ImageProcessor: full_path = self.find_image(input_filename, [source_dir]) if full_path is None: raise FileNotFoundError(f'Image {input_filename} not found.') - + + # Generate filename from arguments suffix = "" geometry = None if width is not None or height is not None: @@ -28,24 +30,32 @@ class ImageProcessor: suffix += f'_q{quality}' target_name = base+suffix+"."+format target_path = self.public_dir + "/" + target_name - + + # Only regenerate if the file doesn't already exist. if not os.path.isfile(target_path): - if (((ext == format and width) + + # If the format is the same or it is just a different extension for + # the same format, just copy it. + if (((ext == format) or (ext == "epdf" and format == "pdf") or (ext == "jpg" and format == "jpeg")) and width is None and height is None and quality is None and dpi is None): shutil.copyfile(full_path, target_path) + # Try to find the converted filename in lookup_dirs, if you find + # it, don't convert, just copy. 
elif self.find_image(target_name, [source_dir]): shutil.copyfile(self.find_image(target_name, [source_dir]), target_path) + # Convert SVGs using inkscape elif ext == "svg": width_arg = ['--export-width', str(width)] if width is not None else [] height_arg = ['--export-height', str(height)] if height is not None else [] dpi_arg = ['--export-dpi', str(dpi)] if dpi is not None else [] if subprocess.run(['inkscape', full_path, '-o', target_path, *width_arg, *height_arg, *dpi_arg]).returncode != 0: raise Exception(f"Could not convert '{full_path}' to '{format}'") - + + # Convert everything else using ImageMagick. else: resize_arg = ['-resize', str(geometry)] if geometry is not None else [] density_arg = ['-density', str(dpi)] if dpi is not None else [] @@ -60,7 +70,8 @@ class ImageProcessor: full_path = self.find_image(input_filename, additional_dirs) if full_path is None: raise FileNotFoundError(f'Image {input_filename} not found.') - return (int(x) for x in subprocess.run(['convert', full_path, '-print', '%w %h\\n', '/dev/null'], capture_output=True).stdout.split(b" ")) + # Getting image size using ImageMagick is slow. VERY + return Image.open(full_path).size def find_image(self, input_filename: str, additional_dirs: List[str]=[]) -> str: diff --git a/katex-server/index.mjs b/katex-server/index.mjs index 141e5a0..6beddf2 100644 --- a/katex-server/index.mjs +++ b/katex-server/index.mjs @@ -78,10 +78,16 @@ function socketWrite(socket, data) { async function handleClient(client) { const rl = readline.createInterface({ input: client }) + /* Added by GS: A stack of katex's `macros` objects, each group inherits + * the one from the parent group and can add its own stuff without + * affecting the parent. + */ const macroStack = [{}] for await (const line of rl) { try { + // The custom commands for pushing and popping the macro stack. if (line === "begingroup") { + // Copy the current state of macros and push it onto the stack. 
macroStack.push({...macroStack.slice(-1)[0]}) continue } else if (line === "endgroup") { @@ -92,12 +98,16 @@ async function handleClient(client) { const results = [] for (const input of query.formulas) { const options = input.options ?? query.options ?? defaultOptions + // Add macros from the macros option if (options.macros) { for (const macro of Object.keys(options.macros)) { macroStack.slice(-1)[macro] = options.macros[macro] } } options.macros = macroStack.slice(-1)[0] + // Enforce globalGroup option, katex then saves created macros + // into the options.macros object. + options.globalGroup = true try { const html = katex.renderToString(input.tex, options) results.push({ html }) diff --git a/katex.py b/katex.py index 7879e7e..c0aadea 100644 --- a/katex.py +++ b/katex.py @@ -4,6 +4,7 @@ import tempfile import json import os from typing import Dict +import time class KatexError(Exception): @@ -11,26 +12,37 @@ class KatexError(Exception): class KatexClient: def __init__(self): - self._client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + # Create temporary directory for socket self._temp_dir = tempfile.TemporaryDirectory(prefix='formatitko') self._socket_file = self._temp_dir.name + "/katex-socket" + self._server_process = subprocess.Popen(["node", os.path.dirname(os.path.realpath(__file__)) + "/katex-server/index.mjs", self._socket_file]) + + self._client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + + # Wait for the node program to create the socket file while not os.path.exists(self._socket_file): - pass + time.sleep(0.01) + + # Wait for the node program to start accepting connections while True: try: self._client.connect(self._socket_file) + time.sleep(0.01) except ConnectionRefusedError: continue break def render(self, tex: str, options: Dict={}): - options["globalGroup"] = True + # Send formulas to translate self._client.sendall((json.dumps({"formulas":[{"tex":tex}], "options":options})+"\n").encode("utf-8")) - data = self._client.recv(1024) + 
+ # Receive response + data = self._client.recv(4096) while data[-1] != 0x0a: data += self._client.recv(128) response = json.loads(data) + if "error" in response: raise Exception(response["error"]) if "error" in response["results"][0]: @@ -38,6 +50,7 @@ class KatexClient: else: return response["results"][0]["html"] + # Special commands implemented in the JS file for grouping defs together. def begingroup(self): self._client.sendall("begingroup\n".encode("utf-8")) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..250abf8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +Pygments==2.14.0 +panflute==2.3.0 +fontTools==4.38.0 +Pillow==9.4.0 diff --git a/test.md b/test.md deleted file mode 100644 index 692aaa1..0000000 --- a/test.md +++ /dev/null @@ -1,162 +0,0 @@ ---- -title: 'Wooooo a title' -subtitle: 'A subtitle' -are_we_there_yet: False -language: "en" ---- -[#test-import.md]{} - -# Hello world! - -This is an *example* **yay**! - -This is *very **strongly** emphasised* - -Příliš žluťoučký kůň pěl dábelské ódy. 
*Příliš žluťoučký kůň pěl dábelské ódy.* **Příliš žluťoučký kůň pěl dábelské ódy.** ***Příliš žluťoučký kůň pěl dábelské ódy.*** - - -:::{partial=test-partial.md} -::: - -:::{if=cat} -This should only be shown to cats -::: - - -```python {.run} -ctx.set_flag("cat", True) -``` - -```python {.run} -println(f"The main document's title is '{ctx.get_metadata('title')}'") -ctx.set_metadata("a", {}) -ctx.set_metadata("a.b", {}) -ctx.set_metadata("a.b.c", "Bruh **bruh** bruh") -``` - -```python {style=native} -def bruh(no): - wat -``` - -Inline `code` - -::::{if=cat} -This should only be shown to cats the second time -:::: - -# [$are_we_there_yet]{} - -![This is a figure, go figure...](/tmp/logo.pdf) - -![This is a figure, go figure...](/tmp/logo.jpg){width=10em} - -![This is a figure, go figure...](/tmp/logo.png){width=10em} - -![Fakt epesní reproduktor](/tmp/reproduktor.jpeg){width=10em} - -```python {.run} -ctx.set_metadata("language", "cs") -``` -[!opendatatask]{} -```python {.run} -ctx.set_metadata("language","en") -``` -[This too!]{if=cat} - -[What]{.co} - -[An inline command with contents and **bold** and another [!nop]{} inside!]{c=nop} - -[!nop]{a=b} - -> OOO a blockquote mate init -> ->> Nesting?? ->> Woah - -A non-breakable space bro - -A lot of spaces - -A text with some inline math: $\sum_{i=1}^nn^2$. Plus some display math: - -A link with the link in the link: - -H~2~O is a liquid. 2^10^ is 1024. - -[Underline]{.underline} - -:::{only=html} -$$ -\def\eqalign#1{\begin{align*}#1\end{align*}} -$$ -::: - -$$ -\eqalign{ - 2 x_2 + 6 x_3 &= 14 \cr - x_1 - 3 x_2 + 2 x_3 &= 5 \cr - -x_1 + 4 x_2 + \phantom{1} x_3 &= 2 -} -$$ - -:::{partial=test-partial.md} -::: - ---- - -This should be seen by all.^[This is a footnote] - -| Matematicko-fyzikální fakulta University Karlovy -| Malostranské nám. 2/25 -| 118 00 Praha 1 - -More footnotes.^[I am a foot] - -To Do: - -- buy eggs -- buy milk -- ??? -- profit - - also create sublists preferrably - -1. Woah -2. Wooo -3. 
no - -4) WOO - -``` {=html} -
- -
This is indeed a video
-
-``` - -#. brum -#. BRUHHH -#. woah - -i. bro -ii. wym bro - - -+---------------------+-----------------------+ -| Location | Temperature 1961-1990 | -| | in degree Celsius | -+---------------------+-------+-------+-------+ -| | min | mean | max | -+=====================+=======+=======+======:+ -| Antarctica | -89.2 | N/A | 19.8 | -+---------------------+-------+-------+-------+ -| Earth | -89.2 | 14 | 56.7 | -+---------------------+-------+-------+-------+ - -------- ------ ---------- ------- - 12 12 12 12 - 123 123 123 123 - 1 1 1 1 -------- ------ ---------- ------- - diff --git a/test/1px.png b/test/1px.png new file mode 100644 index 0000000..4edadd3 Binary files /dev/null and b/test/1px.png differ diff --git a/test/Makefile b/test/Makefile index 455c41e..58842e5 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,10 +1,10 @@ -all: test.pdf public/test.html +all: test.pdf public/index.html output.tex output.html: ../formatitko.py test.md -public/test.html: output.html - cat test-top.html output.html > public/test.html +public/index.html: output.html + cat test-top.html output.html > public/index.html test.tex: output.tex cat test-top.tex output.tex > test.tex diff --git a/test/empty.md b/test/empty.md new file mode 100644 index 0000000..8d1c8b6 --- /dev/null +++ b/test/empty.md @@ -0,0 +1 @@ + diff --git a/test/test-files/evil.md b/test/test-files/evil.md new file mode 100644 index 0000000..a27c03d --- /dev/null +++ b/test/test-files/evil.md @@ -0,0 +1,8 @@ +--- +title: "I am a little evil md file hehe" +--- +```python {.run} +import sys +sys.exit(666) +``` +I am very innocent wym bro :( \ No newline at end of file diff --git a/test/test-files/test-partial.md b/test/test-files/test-partial.md index bdf1c1e..6450237 100644 --- a/test/test-files/test-partial.md +++ b/test/test-files/test-partial.md @@ -1,6 +1,5 @@ --- title: A subfile! 
-language: "cs" --- I am a little piece of content @@ -23,21 +22,18 @@ println() println(f"The subdocument's subtitle is \n\n## {ctx.get_metadata('subtitle')}") ``` -``` {.python .run} -ctx.set_metadata("language", "cs") -``` - +```markdown {.group} +--- +language: "cs" +--- Tak toto je "v prádelně" pánové! - -``` {.python .run} -ctx.set_metadata("language", "en") ``` +```markdown {.group} +--- +language: "en" +--- This is "in a laundry room" gentlemen! - - -``` {.python .run} -ctx.unset_metadata("language") ``` I am a duck. @@ -63,5 +59,5 @@ $$ ![Fakt epesní reproduktor](reproduktor.jpeg){width=10em} -![Fakt epesní reproduktor](reproduktor.png){width=10em} +![Fakt epesní reproduktor](reproduktor.png "Hodně rozpixelovaný obrázek reproduktoru"){width=10em file-width=1000} diff --git a/test/test.md b/test/test.md index b82831e..39b587e 100644 --- a/test/test.md +++ b/test/test.md @@ -14,6 +14,8 @@ This is *very **strongly** emphasised* Příliš žluťoučký kůň pěl dábelské ódy. *Příliš žluťoučký kůň pěl dábelské ódy.* **Příliš žluťoučký kůň pěl dábelské ódy.** ***Příliš žluťoučký kůň pěl dábelské ódy.*** +:::{partial=test-files/evil.md untrusted=True} +::: :::{partial=test-files/test-partial.md} ::: @@ -22,7 +24,6 @@ Příliš žluťoučký kůň pěl dábelské ódy. *Příliš žluťoučký ků This should only be shown to cats ::: - ```python {.run} ctx.set_flag("cat", True) ``` @@ -47,13 +48,18 @@ This should only be shown to cats the second time # [$are_we_there_yet]{} -```python {.run} -ctx.set_metadata("language", "cs") -``` +```markdown {.group} +--- +language: cs +--- +V​ pravém jízdním bruhu. +V pravém jízdním bruhu. +V pravém jízdním bruhu. +V pravém jízdním bruhu. 
+ [!opendatatask]{} -```python {.run} -ctx.set_metadata("language","en") ``` + [This too!]{if=cat} [What]{.co} diff --git a/tex.py b/tex.py index fb97d8f..238255e 100644 --- a/tex.py +++ b/tex.py @@ -4,18 +4,22 @@ import os from whitespace import NBSP from transform import FQuoted from util import inlinify -from group import Group +from context import Group from images import ImageProcessor # Heavily inspired by: git://git.ucw.cz/labsconf2022.git def tex(e: Element, i: ImageProcessor, indent_level: int=0, indent_str: str="\t") -> str: - + + # `only` attribute which makes transformed elements appear only in tex + # output or html output if hasattr(e, "attributes") and "only" in e.attributes and e.attributes["only"] != "tex": return "" if isinstance(e, ListContainer): return ''.join([tex(child, i, indent_level, indent_str) for child in e]) + # Bits from which the final element output is built at the end of this + # function. Most elements override this by returning their own output. content_foot = "" content_head = "" @@ -31,7 +35,7 @@ def tex(e: Element, i: ImageProcessor, indent_level: int=0, indent_str: str="\t" if type(e) in tags: tag = tags[type(e)] - + # These are also disabled in pandoc so they shouldn't appear in the AST at all. 
not_implemented = { Citation: True, Cite: True, @@ -41,7 +45,8 @@ def tex(e: Element, i: ImageProcessor, indent_level: int=0, indent_str: str="\t" } if type(e) in not_implemented: return f'% FIXME: {type(e)}s not implemented \n' - + + # Elements which can be represented by a simple string simple_string = { NBSP: "~", Space: " ", @@ -53,12 +58,61 @@ def tex(e: Element, i: ImageProcessor, indent_level: int=0, indent_str: str="\t" if type(e) in simple_string: return simple_string[type(e)] + # Simplest basic elements if isinstance(e, Str): - return e.text.replace(" ", "~").replace(" ", "~") + return e.text.replace(" ", "~") if isinstance(e, Para): return tex(e.content, i, 0, "")+"\n\n" + if isinstance(e, Span) or isinstance(e, Plain): + return tex(e.content, i, 0, "") + + # Overriding elements with their own returns + if isinstance(e, Image): + url = e.url + + # Attributes → image processor args + additional_args = {} + if "file-width" in e.attributes: + additional_args["width"] = int(e.attributes["file-width"]) + if "file-height" in e.attributes: + additional_args["height"] = int(e.attributes["file-height"]) + if "file-quality" in e.attributes: + additional_args["quality"] = int(e.attributes["file-quality"]) + if "file-dpi" in e.attributes: + additional_args["dpi"] = int(e.attributes["file-dpi"]) + + # The directory of the current file, will also look for images there. + source_dir = e.attributes["source_dir"] + + _, ext = os.path.splitext(url) + ext = ext[1:] + + # Conversions between various formats. + if ext in ["pdf", "png", "jpeg"]: + # Even supported elements have to be 'converted' because the + # processing contains finding and moving them to the output + # directory. 
+ url = i.process_image(url, ext, source_dir, relative=False, **additional_args) + elif ext in ["svg"]: + url = i.process_image(url, "pdf", source_dir, relative=False, **additional_args) + elif ext in ["epdf"]: + url = i.process_image(url, "pdf", source_dir, relative=False, **additional_args) + elif ext in ["jpg"]: + url = i.process_image(url, "jpeg", source_dir, relative=False, **additional_args) + else: + url = i.process_image(url, "pdf", source_dir, relative=False, **additional_args) + + width = "" + if "width" in e.attributes: + width = e.attributes["width"] + # 50% → 0.5\hsize + if e.attributes["width"][-1] == "%": + width = str(int(e.attributes["width"][:-1])/100) + "\\hsize" + width = "width " + width + return f'\\image{{{width}}}{{{url}}}' + if isinstance(e, FQuoted): if e.style == "cs": if e.quote_type == "SingleQuote": @@ -78,81 +132,24 @@ def tex(e: Element, i: ImageProcessor, indent_level: int=0, indent_str: str="\t" else: return f'"{tex(e.content, i, 0, "")}"' - if isinstance(e, BulletList): - tag = "list" - open = "" - arguments = "{o}" - close = "\\endlist" - - if isinstance(e, OrderedList): - tag = "list" - open = "" - styles = { - "DefaultStyle": "n", - "Decimal": "n", - "LowerRoman": "i", - "UpperRoman:": "I", - "LowerAlpha": "a", - "UpperAlpha": "A" - } - style = styles[e.style] - delimiters = { - "DefaultDelim": f"{style}.", - "Period": f"{style}.", - "OneParen": f"{style})", - "TwoParens": f"({style})" - } - style = delimiters[e.delimiter] - arguments = f"{{{style}}}" - close = "\\endlist" - # FIXME: Starting number of list - - if isinstance(e, Image): - url = e.url - source_dir = e.attributes["source_dir"] - _, ext = os.path.splitext(url) - ext = ext[1:] - if ext in ["pdf", "png", "jpeg"]: - url = i.process_image(url, ext, source_dir, relative=False) - elif ext in ["svg"]: - url = i.process_image(url, "pdf", source_dir, relative=False) - elif ext in ["epdf"]: - url = i.process_image(url, "pdf", source_dir, relative=False) - elif ext in 
["jpg"]: - url = i.process_image(url, "jpeg", source_dir, relative=False) - else: - url = i.process_image(url, "pdf", source_dir, relative=False) - width = "" - if "width" in e.attributes: - width = e.attributes["width"] - if e.attributes["width"][-1] == "%": - width = str(int(e.attributes["width"][:-1])/100) + "\\hsize" - width = "width " + width - return f'\\image{{{width}}}{{{url}}}' + if isinstance(e, Code): + return f'\\verb`{e.text.replace("`", "backtick")}`' if isinstance(e, Figure): return f'\\figure{{{tex(e.content, i, indent_level+1, indent_str)}}}{{{tex(e.caption, i, indent_level+1, indent_str)}}}\n\n' - + + # Figure caption if isinstance(e, Caption): if inlinify(e) is not None: - return f'\\caption{{{tex(e.content, i, 0, "")}}}' - - if isinstance(e, ListItem): - tag = ":" - - if isinstance(e, Link): - if len(e.content) == 1 and isinstance(e.content[0], Str) and e.content[0].text == e.url: - tag = "url" - else: - tag = "linkurl" - arguments = f'{{{e.url}}}' + return f'\\figcaption{{{tex(e.content, i, 0, "")}}}' if isinstance(e, Math): if e.format == "DisplayMath": return f'$${e.text}$$\n' else: return f'${e.text}$' - + + # Footnote if isinstance(e, Note): tag = "fn" if inlinify(e) is not None: @@ -189,27 +186,71 @@ def tex(e: Element, i: ImageProcessor, indent_level: int=0, indent_str: str="\t" else: return "" - if isinstance(e, Span) or isinstance(e, Plain): - return tex(e.content, i, 0, "") + # See https://pandoc.org/MANUAL.html#line-blocks + if isinstance(e, LineBlock): + return f'{tex(e.content, i, indent_level+1, indent_str)}\n' if isinstance(e, LineItem): return tex(e.content, i, 0, "") + ("\\\\\n" if e.next else "\n") - if isinstance(e, LineBlock): - return f'{tex(e.content, i, indent_level+1, indent_str)}\n' + if type(e) is Div: + return f'{tex(e.content, i, indent_level+1, indent_str)}' + + if isinstance(e, Doc): + return tex(e.content, i, indent_level, indent_str)+"\n\\bye" # Is having the \bye a bad idea here? 
- if isinstance(e, Group): + + # Non-overriding elements, they get generated using the template at the end + # of this function + if isinstance(e, BulletList): + tag = "list" + open = "" + arguments = "{o}" + close = "\\endlist" + + elif isinstance(e, OrderedList): + tag = "list" + open = "" + styles = { + "DefaultStyle": "n", + "Decimal": "n", + "LowerRoman": "i", + "UpperRoman:": "I", + "LowerAlpha": "a", + "UpperAlpha": "A" + } + style = styles[e.style] + delimiters = { + "DefaultDelim": f"{style}.", + "Period": f"{style}.", + "OneParen": f"{style})", + "TwoParens": f"({style})" + } + style = delimiters[e.delimiter] + arguments = f"{{{style}}}" + close = "\\endlist" + # FIXME: Starting number of list + + elif isinstance(e, ListItem): + tag = ":" + + elif isinstance(e, Link): + if len(e.content) == 1 and isinstance(e.content[0], Str) and e.content[0].text == e.url: + tag = "url" + else: + tag = "linkurl" + arguments = f'{{{e.url}}}' + + elif isinstance(e, Group): tag = "begingroup" open = "" if "language" in e.metadata and e.metadata["language"] is not None: open = "\\language"+e.metadata["language"] close = "\\endgroup" - - if isinstance(e, Div): - return f'{tex(e.content, i, indent_level+1, indent_str)}' - - if isinstance(e, Doc): - return tex(e.content, i, indent_level, indent_str)+"\n\\bye" + + # The default which all non-overriding elements get generated by. This + # includes elements, which were not explicitly mentioned in this function, + # e. g. Strong, Emph... 
if isinstance(e, Inline): return f'\\{tag}{arguments}{open}{content_head}{tex(e.content, i, 0, "") if hasattr(e, "_content") else ""}{e.text if hasattr(e, "text") else ""}{content_foot}{close}' diff --git a/transform.py b/transform.py index 56843fa..b13f565 100644 --- a/transform.py +++ b/transform.py @@ -6,16 +6,22 @@ from whitespace import * from command import * from util import * from context import * -from group import Group + +# This is a small extension to the Quoted panflute elements which allows to +# have language-aware quotation marks. class FQuoted(Quoted): def __init__(self, *args, **kwargs): self.style = kwargs["style"] del kwargs["style"] super().__init__(*args, **kwargs) -def transform(e: Element, c: Context) -> Element: # Returns next sibling element to transform - """Transform the AST, making format-agnostic changes.""" + +# This is where the magic happens. This function transforms a single element, +# to transform the entire tree, panflute's walk should be used. +def transform(e: Element, c: Context) -> Element: + + # Determine if this space should be non-breakable. See whitespace.py. if isinstance(e, Whitespace) and bavlna(e, c): e = NBSP() @@ -29,24 +35,48 @@ def transform(e: Element, c: Context) -> Element: # Returns next sibling element if c.is_flag_set(e.attributes["ifn"]): return nullify(e) - # `c` attribute. Execute a command with the name saved in this attribute. + # There are multiple ways to call a command so we turn it into a + # unified element first and then call it at the end. This handles the + # []{c=commandname} and + # :::{c=commandname} + # ::: + # syntax. if (isinstance(e, Div) or isinstance(e, Span)) and "c" in e.attributes: if isinstance(e, Div): e = BlockCommand(*e.content, identifier=e.identifier, classes=e.classes, attributes=e.attributes) else: e = InlineCommand(*e.content, identifier=e.identifier, classes=e.classes, attributes=e.attributes) - # `partial` attribute.
- # This is for including content from files with their own flags and - # commands without affecting the state of the current document. - if (isinstance(e, Div)) and "partial" in e.attributes: - includedDoc = import_md(open(c.dir + "/" + e.attributes["partial"], "r").read()) - nContext = Context(includedDoc, e.attributes["partial"], c) - language = includedDoc.get_metadata("language") - includedDoc = includedDoc.walk(transform, nContext) - e = Group(*includedDoc.content, metadata={"language": language}) - - + # Isolated subdocuments using Group and a different Context. Can be + # separate files (using attribute `partial`) or be inline using the + # following syntax: + # ```markdown {.group} + # * file content * + # ``` + # Both can contain their own metadata in a FrontMatter (YAML header) + if (isinstance(e, Div) and "partial" in e.attributes)\ + or (isinstance(e, CodeBlock) and "markdown" in e.classes and "group" in e.classes): + if isinstance(e, Div): + text = open(c.dir + "/" + e.attributes["partial"], "r").read() + path = c.dir + "/" + e.attributes["partial"] + else: + text = e.text + path = c.path + if "type" in e.attributes and e.attributes["type"] in ["tex", "html"]: + e = RawBlock(text, e.attributes["type"]) + else: + includedDoc = import_md(text) + trusted = True + if "untrusted" in e.attributes and (e.attributes["untrusted"] == True or e.attributes["untrusted"] == 'True'): + trusted = False + if not c.trusted: + trusted = False + nContext = Context(includedDoc, path, c, trusted=trusted) + language = includedDoc.get_metadata("language") + includedDoc = includedDoc.walk(transform, nContext) + e = Group(*includedDoc.content, metadata={"language": language}) + + # Transform panflute's Quoted to custom FQuoted, see above. 
if isinstance(e, Quoted): quote_styles = { "cs": "cs", @@ -55,45 +85,66 @@ def transform(e: Element, c: Context) -> Element: # Returns next sibling element None: None } e = FQuoted(*e.content, quote_type=e.quote_type, style=quote_styles[c.get_metadata("language")]) - + if isinstance(e, Image): + # Pass down the directory of the current source file for finding image + # files. e.attributes["source_dir"] = c.dir + # Pass down "no-srcset" metadatum as attribute down to images. if not "no-srcset" in e.attributes: e.attributes["no-srcset"] = c.get_metadata("no-srcset") if c.get_metadata("no-srcset") is not None else False - # Execute python code inside source code block + # Pass down metadata 'highlight' and 'highlight-style' as attribute to CodeBlocks + if isinstance(e, CodeBlock): + if not "highlight" in e.attributes: + e.attributes["highlight"] = c.get_metadata("highlight") if c.get_metadata("highlight") is not None else True + if not "style" in e.attributes: + e.attributes["style"] = c.get_metadata("highlight-style") if c.get_metadata("highlight-style") is not None else "default" + e.attributes["noclasses"] = False + else: + e.attributes["noclasses"] = True + + # Execute python code inside source code block. Works the same as commands. + # Syntax: + # ```python {.run} + # print("woo") + # ``` if isinstance(e, CodeBlock) and hasattr(e, "classes") and "python" in e.classes and "run" in e.classes: + if not c.trusted: + return nullify(e) e = Div(*executeCommand(e.text, None, c)) e = e.walk(transform, c) - ## Command defines - # possible TODO: def/longdef? + # Command defines for calling using BlockCommand and InlineCommand. If + # redefine is used instead of define, the program doesn't check if the + # command already exists.
+ # Syntax: + # ```python {define=commandname} + # print(wooo) + # ``` if isinstance(e, CodeBlock) and hasattr(e, "classes") and "python" in e.classes and hasattr(e, "attributes")\ and ("define" in e.attributes or "redefine" in e.attributes): + if not c.trusted: + return nullify(e) e = handle_command_define(e, c) - # Pass down metadata 'highlight' and 'highlight_style' as attribute to CodeBlocks - if isinstance(e, CodeBlock): - if not "highlight" in e.attributes: - e.attributes["highlight"] = c.get_metadata("highlight") if c.get_metadata("highlight") is not None else True - if not "style" in e.attributes: - e.attributes["style"] = c.get_metadata("highlight_style") if c.get_metadata("highlight_style") is not None else "default" - ## Shorthands + # Shorter (and sometimes the only) forms of certain features if isinstance(e, Span) and len(e.content) == 1 and isinstance(e.content[0], Str): ## Handle special command shorthand [!commandname]{} if re.match(r"^![\w]+$", e.content[0].text): e = InlineCommand(identifier=e.identifier, classes=e.classes, attributes={**e.attributes, "c": e.content[0].text[1:]}) ## Handle import [#path/file.md]{} - # This is the exact opposite of include. We take the commands - # and flags but drop the content. + # This is the exact opposite of partials. We take the commands, flags + # and metadata but drop the content. elif re.match(r"^#.+$", e.content[0].text): importedDoc = import_md(open(c.dir + "/" + e.content[0].text[1:], "r").read()) importedDoc.walk(transform, c) return nullify(e) - ## Handle metadata print [$something.something]{} + ## Handle metadata print [$key1.key2]{} + # This is a shorthand for just printing the content of some metadata. 
elif re.match(r"^\$[\w.]+$", e.content[0].text): val = c.get_metadata(e.content[0].text[1:], False) if isinstance(val, MetaInlines): @@ -107,8 +158,9 @@ def transform(e: Element, c: Context) -> Element: # Returns next sibling element raise TypeError(f"Cannot print value of metadatum '{e.content[0].text[1:]}' of type '{type(val)}'") ## Execute commands - # panflute's walk transforms the children first, then the root element, so - # the content of the element the command receives is already transformed. + # panflute's walk function transforms the children first, then the root + # element, so the content the command receives is already transformed. + # The output from the command is then transformed manually again. if isinstance(e, Command): if not c.get_command(e.attributes["c"]): raise NameError(f"Command not defined '{e.attributes['c']}'.") diff --git a/util.py b/util.py index 683c519..aa14151 100644 --- a/util.py +++ b/util.py @@ -1,23 +1,25 @@ from panflute import Element, Block, Inline, Null, Str, Doc, convert_text, Para, Plain import re +# It sometimes happens that an element contains a single paragraph or even a +# single plaintext line. It can be sometimes useful to extract this single +# paragraph, which is inline. def inlinify(e: Element) -> Element: if len(e.content) == 1 and (isinstance(e.content[0], Para) or isinstance(e.content[0], Plain)): return e.content[0].content -def replaceEl(e: Element, r: Element) -> Element: - parent = e.parent - parent.content[e.index] = r - r.parent = parent - return r -def deleteEl(e: Element): - del e.parent.content[e.index] - +# In transform, inline elements cannot be replaced with Block ones and also +# cannot be removed from the tree entirely, because that would mess up the +# iteration process through the tree. We replace them with null elements +# instead which never make it to the output. 
def nullify(e: Element): if isinstance(e, Inline): return Str("") elif isinstance(e, Block): return Null() +# A helper function to import markdown using panflute (which calls pandoc). If +# we ever want to disable or enable some of panflute's markdown extensions, +# this is the place to do it. def import_md(s: str, standalone: bool=True) -> Doc: - return convert_text(s, standalone=standalone) + return convert_text(s, standalone=standalone, input_format="markdown-definition_lists-citations") diff --git a/whitespace.py b/whitespace.py index 28a5336..928f94b 100644 --- a/whitespace.py +++ b/whitespace.py @@ -9,11 +9,14 @@ Whitespace = Union[Space,SoftBreak] class NBSP(Space): pass +# This function tries to determine if a space should be non-breaking. It is +# language-aware and tries to be sort-of smart about its decisions. def bavlna(e: Whitespace, c: Context) -> bool: - """Determine if given piece of whitespace should be non-breakable.""" - - + if c.get_metadata("language") == "cs": + # Add no-break space after single letter prepositions and conjunctions. + # Also tries to find them inside elements, for instance + # `V [odevzdávátku]()` should get correctly detected. prev = e.prev if isinstance(e.prev, Str) else (e.prev.content[-1] if hasattr(e.prev, "content") and len(e.prev.content) != 0 else None) next = e.next if isinstance(e.next, Str) else (e.next.content[0] if hasattr(e.next, "content") and len(e.next.content) != 0 else None) if isinstance(prev, Str) and isinstance(next, Str): @@ -21,6 +24,7 @@ def bavlna(e: Whitespace, c: Context) -> bool: return True if isinstance(e.prev, Str) and isinstance(e.next, Str): + # Add no-break space between numbers or numbers and operators. prevC = e.prev.text[-1] nextC = e.next.text[0] numbers = ["0123456789"] @@ -33,6 +37,7 @@ def bavlna(e: Whitespace, c: Context) -> bool: return True if isinstance(e.prev, Math) or isinstance(e.next, Math): + # Add no-break spaces around TeX math. return True