space-workshop/extensions/slide_meta/parser.py

import json
import logging

from myst_nb.parser import *

SPHINX_LOGGER = logging.getLogger(__name__)

# Mostly taken and slightly adapted from https://github.com/executablebooks/MyST-NB/blob/v0.13.2/myst_nb/parser.py
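# The adaptation: before every notebook cell, patched_nb_to_tokens() injects a
# fenced ``{cell_meta}`` token carrying the cell's metadata as JSON (e.g.
# {"slideshow": {"slide_type": "slide"}}), presumably so a companion
# ``cell_meta`` directive elsewhere in this extension can turn it into slide
# information. The rest of the file mirrors the upstream NotebookParser.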
class PatchedNotebookParser(MystParser):
    """Docutils parser for Markedly Structured Text (MyST) and Jupyter Notebooks."""

    supported = ("myst-nb",)
    translate_section_name = None

    config_section = "myst-nb parser"
    config_section_dependencies = ("parsers",)

    def parse(
        self, inputstring: str, document: nodes.document, renderer: str = "sphinx"
    ) -> None:
        self.reporter = document.reporter
        self.env = document.settings.env  # type: BuildEnvironment

        converter = get_nb_converter(
            self.env.doc2path(self.env.docname, True),
            self.env,
            inputstring.splitlines(keepends=True),
        )

        if converter is None:
            # Read the notebook as a text-document
            super().parse(inputstring, document=document)
            return

        try:
            ntbk = converter.func(inputstring)
        except Exception as error:
            SPHINX_LOGGER.error(
                "MyST-NB: Conversion to notebook failed: %s",
                error,
                # exc_info=True,
                location=(self.env.docname, 1),
            )
            return

        # add outputs to notebook from the cache
        if self.env.config["jupyter_execute_notebooks"] != "off":
            ntbk = generate_notebook_outputs(
                self.env, ntbk, show_traceback=self.env.config["execution_show_tb"]
            )

        # Parse the notebook content to a list of syntax tokens and an env
        # containing global data like reference definitions
        md_parser, env, tokens = patched_nb_to_tokens(  # <-- patched here
            ntbk,
            (
                self.env.myst_config  # type: ignore[attr-defined]
                if converter is None
                else converter.config
            ),
            self.env.config["nb_render_plugin"],
        )

        # Write the notebook's output to disk
        path_doc = nb_output_to_disc(ntbk, document)

        # Update our glue key list with new ones defined in this page
        glue_domain = NbGlueDomain.from_env(self.env)
        glue_domain.add_notebook(ntbk, path_doc)

        # Render the Markdown tokens to docutils AST.
        tokens_to_docutils(md_parser, env, tokens, document)
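

# How this parser gets hooked into Sphinx is not part of this file. A minimal
# sketch (an assumption, not necessarily this extension's actual setup) would
# re-register it for the "myst-nb" source format from the extension's setup():
#
#     def setup(app):
#         app.add_source_parser(PatchedNotebookParser, override=True)
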
def patched_nb_to_tokens(
    ntbk: nbf.NotebookNode, config: MdParserConfig, renderer_plugin: str
) -> Tuple[MarkdownIt, Dict[str, Any], List[Token]]:
    """Parse the notebook content to a list of syntax tokens and an env,
    containing global data like reference definitions.
    """
    md = default_parser(config)

    # setup the markdown parser
    # Note we disable front matter parsing,
    # because this is taken from the actual notebook metadata
    md.disable("front_matter", ignoreInvalid=True)
    md.renderer = SphinxNBRenderer(md)

    # make a sandbox where all the parsing global data,
    # like reference definitions will be stored
    env: Dict[str, Any] = {}
    rules = md.core.ruler.get_active_rules()

    # First only run pre-inline chains
    # so we can collect all reference definitions, etc, before assessing references
    def parse_block(src, start_line):
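        """Parse one cell's source with only the block-level rules enabled,
        then offset the token (and duplicate-reference) line maps by the
        cell's start line within the notebook.
        """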
        with md.reset_rules():
            # enable only rules up to block
            md.core.ruler.enableOnly(rules[: rules.index("inline")])
            tokens = md.parse(src, env)
        for token in tokens:
            if token.map:
                token.map = [start_line + token.map[0], start_line + token.map[1]]
        for dup_ref in env.get("duplicate_refs", []):
            if "fixed" not in dup_ref:
                dup_ref["map"] = [
                    start_line + dup_ref["map"][0],
                    start_line + dup_ref["map"][1],
                ]
                dup_ref["fixed"] = True
        return tokens

    block_tokens = []
    source_map = ntbk.metadata.get("source_map", None)

    # get language lexer name
    langinfo = ntbk.metadata.get("language_info", {})
    lexer = langinfo.get("pygments_lexer", langinfo.get("name", None))
    if lexer is None:
        lexer = ntbk.metadata.get("kernelspec", {}).get("language", None)
        # TODO log warning if lexer is still None

    for cell_index, nb_cell in enumerate(ntbk.cells):

        # if the source_map has been stored (for text-based notebooks),
        # we use that to define the starting line for each cell;
        # otherwise, we set a pseudo base that represents the cell index
        start_line = source_map[cell_index] if source_map else (cell_index + 1) * 10000
        start_line += 1  # use base 1 rather than 0

        # Skip empty cells
        if len(nb_cell["source"].strip()) == 0:
            continue

        # skip cells tagged for removal
        # TODO this logic should be deferred to a transform
        tags = nb_cell.metadata.get("tags", [])
        if ("remove_cell" in tags) or ("remove-cell" in tags):
            continue

        ### Patched here
        # Add a Token with a cell_meta directive, i.e.:
        #
        # ```{cell_meta}
        # {"slideshow": {"slide_type": "slide"}}
        # ```
        block_tokens.append(
            Token(
                type="fence",
                tag="code",
                nesting=0,
                attrs={},
                map=[start_line, start_line],
                level=0,
                children=None,
                content=json.dumps(nb_cell.metadata),
                markup="```",
                info="{cell_meta}",
                meta={},
                block=True,
                hidden=False,
            )
        )
        ### / Patched here
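
        # Note that the metadata token above is emitted before the cell-type
        # branch below, so markdown and code cells alike are preceded by their
        # ``{cell_meta}`` fence in the token stream.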
        if nb_cell["cell_type"] == "markdown":

            # we add the cell index to tokens,
            # so they can be included in the error logging,
            block_tokens.extend(parse_block(nb_cell["source"], start_line))

        elif nb_cell["cell_type"] == "code":
            # here we do nothing but store the cell as a custom token
            block_tokens.append(
                Token(
                    "nb_code_cell",
                    "",
                    0,
                    meta={"cell": nb_cell, "lexer": lexer, "renderer": renderer_plugin},
                    map=[start_line, start_line],
                )
            )

    # Now all definitions have been gathered,
    # we run inline and post-inline chains, to expand the text.
    # Note we assume here that these rules never require the actual source text,
    # only acting on the existing tokens
    state = StateCore("", md, env, block_tokens)
    with md.reset_rules():
        md.core.ruler.enableOnly(rules[rules.index("inline") :])
        md.core.process(state)

    # Add the front matter.
    # Note that myst_parser serialises dict/list like keys, when rendering to
    # docutils docinfo. These could be read back with `json.loads`.
    state.tokens = [
        Token(
            "front_matter",
            "",
            0,
            map=[0, 0],
            content=({k: v for k, v in ntbk.metadata.items()}),  # type: ignore[arg-type]
        )
    ] + state.tokens

    # If there are widgets, this will embed the state of all widgets in a script
    if contains_widgets(ntbk):
        state.tokens.append(
            Token(
                "jupyter_widget_state",
                "",
                0,
                map=[0, 0],
                meta={"state": get_widgets(ntbk)},
            )
        )

    return md, env, state.tokens