2025-07-09
This is Blarg, version 3 (probably). It used to be an extremely complicated contraption involving auto-generated Makefiles. But I think even if I write several entries a day for the rest of my life, I’ll never have more than 100,000 files to deal with, and it just doesn’t seem worth the complexity. So now, I think a better way to Blarg is to just have a Jupyter notebook that builds the whole thing in a forward pass. Focus on writing, not perfect efficiency!
Why not Quarto? Well, because I want to learn. I want the output to be as simple, minimal, and understandable as possible, and usually the best way to do that is by removing tools from the stack instead of adding more.
Quarto’s popularity for technical publishing looks like it’s really a growing popularity of Pandoc — or more fairly, Quarto is making Pandoc easy enough to use that more people are using it. Why not just use Pandoc directly?
Here is some stuff we’ll need:
import json
import os
from contextlib import chdir
from mimetypes import add_type, guess_type
from pathlib import Path
from shutil import copyfile
from subprocess import PIPE, run
from typing import Iterator
from urllib.parse import quote
from urllib.request import urlopen
import bibtexparser
import pandas as pd
import toolz.curried as tz
from pandas import DataFrame, Series, Timestamp
from slugify import slugify
from tqdm.auto import tqdm
Root URL of the site. In general, the build process attempts to use relative URIs everywhere. This is currently only used for the feed generation.
= "https://danielgrady.net" SITEURL
Support files for Pandoc: filters and templates.
PANDOCDATA = str(Path.cwd() / "pandoc-data")
PANDOCDATA
The path to the hierarchy of source files.
ROOT = Path.cwd().parent.parent
ROOT
BibLaTeX bibliography.
= ROOT / "ref/references.bib" REF
Directory to write out the built site.
= ROOT / ".build"
OUT =True)
OUT.mkdir(exist_ok OUT
Directory for cache files.
= ROOT / ".cache"
CACHE =True)
CACHE.mkdir(exist_ok CACHE
Path to use for automatically generated BibLaTeX entries.
= CACHE / "references-auto.bib" REFAUTO
Blarg ignores files and directories with leading dots, and it also ignores the following top-level directories:
IGNOREDIRS = [str(ROOT / p) for p in ["ref", "template"]]
The first step is to get a comprehensive index of all the source files. Create a table of every file, and then split the table into separate indexes for documents and other files, called “assets.”
Identify documents using IANA media types. This puts all the logic around file extensions and such into one place.
"text/markdown", ".md")
add_type("text/markdown", ".mdown")
add_type("text/markdown", ".markdown")
add_type("text/x-org", ".org")
add_type("application/ipynb+json", ".ipynb") add_type(
This dictionary maps the IANA mediatypes that Blarg considers to be "documents" to the Pandoc reader format string to use for parsing the document.
DOCUMENT_MEDIATYPES = {
    "text/markdown": "markdown+wikilinks_title_after_pipe",
    "application/ipynb+json": "ipynb+wikilinks_title_after_pipe",
    "text/x-org": "org",
}
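As a quick illustration (not part of the build), these registrations are what let the indexing step below route a file to a Pandoc reader string; the path here is hypothetical:

# Hypothetical path, just to show the extension -> mediatype -> Pandoc reader chain.
mediatype, _ = guess_type("notes/Egg Recipe/With Spam.md")
assert mediatype == "text/markdown"
DOCUMENT_MEDIATYPES[mediatype]  # 'markdown+wikilinks_title_after_pipe'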
“Indexing” an individual file just means looking up file-level metadata from the filesystem, and guessing what media type the file is using the standard library’s tool.
def load_file_metadata(p: Path) -> dict:
    """
    General metadata for a file

    Fields are named as in `stat`:

    - st_birthtime: when the file was created
    - st_atime: last access time
    - st_mtime: file contents modified
    - st_ctime: on macOS, file metadata modified
    """
    stat = p.stat()
    mediatype, compression = guess_type(p)
    result = {
        "path": p,
        "mediatype": mediatype,
        "compression": compression,
        "size": stat.st_size,
        "st_birthtime": stat.st_birthtime,
        "st_atime": stat.st_atime,
        "st_mtime": stat.st_mtime,
        "st_ctime": stat.st_ctime,
    }
    return result
Index all files under the root, ignoring directories and files with leading dots.
def files_under(root: Path) -> Iterator[Path]:
    """
    Yield paths to files in the hierarchy at `root`

    Yield only files, not directories

    Ignore files and directories with a leading dot
    """
    # This relies on a weird but documented and recommended behavior - modify the list of subdirs
    # inside the loop to tell `os.walk` to avoid certain subdirectories.
    for directory, subdirs, files in os.walk(root):
        if directory in IGNOREDIRS:
            # Skip this directory and don't descend into anything nested beneath it
            subdirs[:] = []
            continue
        hidden_subdirs = [p for p in subdirs if p.startswith(".")]
        for p in hidden_subdirs:
            subdirs.remove(p)
        hidden_files = [p for p in files if p.startswith(".")]
        for p in hidden_files:
            files.remove(p)
        dp = Path(directory)
        for file in files:
            yield dp.joinpath(file)
def index_tree(root: Path) -> DataFrame:
    """
    Create an index of files under ``root``

    Get filesystem metadata for each file, as well as inferred mimetypes and compression
    """
    idx = list()
    for p in files_under(root):
        idx.append(load_file_metadata(p))
    idx = DataFrame(idx)
    idx.insert(0, "relpath", idx["path"].apply(lambda p: p.relative_to(root)))
    return idx


idx = index_tree(ROOT)
Ignore certain kinds of files.
= idx["path"].apply(lambda p: p.suffix in (".canvas", ".pxm"))
mask = idx[~mask].set_index("path", drop=False).sort_index().copy() idx
5) idx.head(
Files are either assets or documents.

- Assets will just be copied to the site directory, with some slight modification to their parent path.
- A document has additional, arbitrary metadata from the file's front matter, and Blarg will additionally infer or adjust some metadata.
- Documents can contain hyperlinks (pointing internally or externally), wiki links (pointing internally, resolved by fuzzy search), and citations.
- Citations are identified with cite keys. A cite key is a URI, and might be listed in the bibliography.
= idx["mediatype"].isin(DOCUMENT_MEDIATYPES) is_doc
Yes. This is a good name.
assidx = idx[~is_doc].copy()
docidx = idx[is_doc].copy()
Documents have all the same indexing information as assets, and get other stuff in addition.
def load_document_metadata(p: Path, mediatype: str) -> dict:
    """
    Get metadata for a document

    This loads the information the document records about itself. The filesystem has other things to
    say about the file containing the document, not handled here.

    This function uses Pandoc to extract YAML front matter, and also a mapping that includes all
    cite keys, URL link targets, and eventually other things.

    The trick to making this work is using a Pandoc template that contains nothing except the
    `meta-json` template variable.
    """
    # fmt: off
    args = [
        "pandoc",
        "--from", DOCUMENT_MEDIATYPES[mediatype],
        "--to", "commonmark", "--standalone",
        "--data-dir", PANDOCDATA,
        "--template", "metadata.pandoctemplate",
        "--lua-filter", "analyze-document.lua",
        str(p),
    ]
    # fmt: on
    proc = run(args, check=True, stdout=PIPE)
    frontmatter = json.loads(proc.stdout)
    docmap = frontmatter["docmap"]
    del frontmatter["docmap"]
    for _, stuff in docmap.items():
        stuff["order"] = int(stuff["order"])
        stuff["level"] = int(stuff["level"])
    result = {"fm": frontmatter, "docmap": docmap}
    return result
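For reference, the metadata-only template trick mentioned in the docstring needs almost nothing in it. The exact contents of pandoc-data/templates/metadata.pandoctemplate aren't reproduced here; the minimal form that would make this work is a template containing only Pandoc's meta-json variable:

$meta-json$

With --standalone, Pandoc renders that template, so the only output is the document's metadata (including whatever analyze-document.lua added, such as the docmap) as a single JSON object.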
docidx.iloc[0]

entry = docidx.iloc[0]
load_document_metadata(entry["path"], entry["mediatype"])
tmp = {p: None for p in docidx["path"]}
for _, entry in tqdm(docidx.iterrows(), total=len(docidx)):
    p = entry["path"]
    mt = entry["mediatype"]
    tmp[p] = load_document_metadata(p, mt)
docmeta = Series(tmp)

docidx["frontmatter"] = docmeta.apply(lambda d: d["fm"])
docidx["docmap"] = docmeta.apply(lambda d: d["docmap"])
At this point, docidx includes filesystem metadata, each document's front matter (if any), and a document map.

docidx.head(5)
Next, calculate several pieces of derived metadata:

published
: required; the initial publication date of the document. If it's not declared in the document metadata, use the filesystem creation time.

updated
: optional; the last date when the document was significantly revised; only assigned if it's present in the document's front matter.

shorttitle
: required; always the file name stem. Used for breadcrumb display.

title
: required. Use the value declared in the front matter if present, otherwise it's the same as shorttitle.

(NB The Atom specification works the other way around with respect to timestamps: updated is required, published is optional.)

Note that, in an earlier iteration of this notebook, log entries and notes were more clearly distinguished. Now they are (should be) exactly the same, just different places to put things. The dates for log entries come from the front matter and filesystem, not from the path to the entry.

There may be other fields present in the document front matter that will be rendered in the final output based on the template, for example subtitle.
pd.set_option('future.no_silent_downcasting', True)
# Convert, or assume, all timestamps to Pacific time
fmdates = docidx["frontmatter"].apply(
    lambda d: Timestamp(d["published"], tz="US/Pacific") if "published" in d else None
)

# This is a very annoying feature of Pandas. In `Timestamp.fromtimestamp(x)`, x is always an
# absolute POSIX timestamp. Calling the function like that returns a timezone-*naive* Timestamp, but
# where `x` has been converted to display in the running system's *local* time. Calling
# `Timestamp.fromtimestamp(x, tz=TZ)` returns a timezone-*aware* Timestamp, with x converted to that
# timezone.
fsdates = docidx["st_birthtime"].apply(
    lambda x: Timestamp.fromtimestamp(x, tz="US/Pacific")
)

tmp = fmdates.combine_first(fsdates)
tmp = pd.to_datetime(tmp)

docidx["published"] = tmp
= docidx["path"].apply(lambda p: p.stem)
shorttitles = docidx["frontmatter"].apply(lambda d: d.get("title"))
fmtitles = fmtitles.combine_first(shorttitles)
titles "title"] = titles
docidx["shorttitle"] = shorttitles docidx[
Generate a "site path" for every asset and document. The site path is the absolute path to the resource, as accessed via HTTP. The actual output file will be at site path + "index.html".
For assets, the site path and the output file path are the same.
For documents, the normal case is:
Egg Recipe/With Spam.md -> egg-recipe/with-spam (output file: egg-recipe/with-spam/index.html)

There are two special cases for documents:

- Egg Recipe/index.* -> egg-recipe (output file: egg-recipe/index.html)
- Egg Recipe/Egg Recipe.* -> egg-recipe (output file: egg-recipe/index.html)

This accommodates the "directory-based notes should repeat the directory name" convention, as well as the older convention of using index files.

Relative path components are processed with slugify to get clean URL slugs.
Document metadata may override the generated slug, which will replace the final component of the site path.
For regular (non-document) files, all path components except the filename are slugified. I think this will handle the common case of support files that are stored as siblings of the document.
def relpath2sitepath(p: Path, is_document=True):
    if is_document:
        p = p.with_suffix("")
        parts = p.parts
        if (parts[-1] == "index") or (len(parts) > 1 and parts[-2] == parts[-1]):
            parts = parts[:-1]
        parts = tuple(slugify(pt) for pt in parts)
    else:
        parts = p.parts
        parts = tuple(slugify(pt) for pt in parts[:-1]) + (parts[-1],)
    sitepath = Path().joinpath(*parts)
    return sitepath
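A quick sanity check of the cases described above, using the hypothetical "Egg Recipe" paths (purely illustrative, not part of the build):

assert relpath2sitepath(Path("Egg Recipe/With Spam.md")) == Path("egg-recipe/with-spam")
assert relpath2sitepath(Path("Egg Recipe/index.md")) == Path("egg-recipe")
assert relpath2sitepath(Path("Egg Recipe/Egg Recipe.md")) == Path("egg-recipe")
# Non-document assets keep their filename; only the parent directories are slugified
assert relpath2sitepath(Path("Egg Recipe/photo 1.jpeg"), is_document=False) == Path("egg-recipe/photo 1.jpeg")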
"sitepath"] = docidx["relpath"].apply(relpath2sitepath)
docidx["outpath"] = docidx["sitepath"].apply(lambda p: p.joinpath("index.html"))
docidx[
"sitepath"] = assidx["relpath"].apply(lambda p: relpath2sitepath(p, is_document=False))
assidx["outpath"] = assidx["sitepath"].copy() assidx[
5) docidx.head(
5) assidx.head(
I use iA Writer and Obsidian to author Blarg. Both of those programs have great support for very similar syntax for wiki-style links, and the published version of Blarg should support wiki-link resolution that’s similar enough to those programs’ rules. Pandoc already provides wiki-link parsing, but figuring out the link targets and rewriting them is something Blarg needs to handle.
Obsidian and iA both use wiki-link resolution relative to the location of the containing file, for example [[Spam]]
might point to different files depending on where the wiki-link appears. As a first pass, I’m going to ignore this — any text in the wiki-link target gets a single constant rewrite.
The wiki-link targets might be generated by Obsidian, which allows # within the target to refer to subsections. For now, remove those.
# fmt: off
wikilink_targets = tz.pipe(
    docidx["docmap"],                   # Start with a list of all document maps; each maps header ID -> metadata
    tz.map(lambda dm: dm.values()),     # Extract just the metadata
    tz.concat,                          # Flatten the list of lists
    tz.map(tz.get("wikilinks")),        # Extract the wikilinks used under every heading
    tz.filter(None),                    # Remove empty sets
    tz.concat,                          # Flatten again
    tz.map(lambda s: s.split("#")[0]),  # Remove Obsidian-style heading references
    set,                                # Deduplicate
    list,
    Series,
)
# fmt: on
TODO: Should extend this to allow for wikilinks with absolute paths, for example:
wikilink_targets[lambda df: df.str.startswith("log/")]
TODO: And handle the case of references to static assets:
wikilink_targets[lambda df: df.str.contains("pdf")]
Create a mapping that goes from all the possible targets of wiki-style links to the corresponding sitepath
in the output. The possible wiki-style link targets are the filename stems of all documents.
tmp_mapping = (
    docidx.join(docidx["relpath"].apply(lambda p: p.stem).rename("wikilink_target"))
    .drop_duplicates(subset=["wikilink_target", "sitepath"])
    .set_index("wikilink_target")["sitepath"]
    .sort_index()
)
tmp_mapping
I'm assuming that I've uniquely named all files.
assert tmp_mapping.index.is_unique
wikilink_map = wikilink_targets.map(tmp_mapping)
wikilink_map.index = wikilink_targets.values
wikilink_map = wikilink_map.dropna().sort_index().apply(str)
with open(CACHE / "wikilink-map.json", "w", encoding="UTF-8") as f:
    json.dump(wikilink_map.to_dict(), f)
wikilink_map
Every mentioned cite key or URL needs associated metadata.
First, find all the cite keys with manually prepared entries.
library = bibtexparser.parse_file(REF)
known_citekeys = [e.key for e in library.entries]
for e in library.entries:
    if "ids" in e:
        known_citekeys.append(e.get("ids").value)
known_citekeys = frozenset(known_citekeys)
len(known_citekeys)
Second, create a table that maps every mention of a cite key or URL to the sitepath + fragment where it's mentioned.
def docmap2mentions(d):
    result = []
    for fragment, data in d.items():
        for citekey in data["cites"]:
            result.append((fragment, citekey, "cite"))
        for link in data["links"]:
            result.append((fragment, link, "link"))
    return result
refmap = tz.pipe(
    docidx.iterrows(),
    tz.map(tz.get(1)),
    tz.map(lambda row: [(row["sitepath"],) + t for t in docmap2mentions(row["docmap"])]),
    tz.concat,
    list,
    lambda lst: pd.DataFrame(lst, columns=["sitepath", "fragment", "uri", "type"]),
)

refmap.sample(5, random_state=42)
Find all the mentioned cite keys that don't have a manually written entry.
mentioned_citekeys = frozenset(refmap[lambda df: df["type"].eq("cite")]["uri"])
len(mentioned_citekeys)

missing_keys = mentioned_citekeys - known_citekeys
missing_keys
Get bibliographic info for every missing cite key using Wikipedia's instance of Citoid, or the arXiv API directly. (Citoid does not seem to support arXiv article IDs.)
= "https://en.wikipedia.org/api/rest_v1/data/citation/bibtex/{query}"
CITOID = "https://arxiv.org/bibtex/{query}"
ARXIV
def get_bibentry(query: str):
= ARXIV if query.startswith("arxiv:") else CITOID
url = url.format(query=quote(query, safe=""))
url try:
with urlopen(url) as f:
= f.read()
data = data.decode("UTF-8")
result = result.strip()
result except Exception:
= None
result return result
tmp = {k: get_bibentry(k) for k in tqdm(missing_keys)}
tmp
tmp2 = []
for k, v in tmp.items():
    if not v:
        # Skip keys where the lookup failed
        continue
    library = bibtexparser.parse_string(v)
    for e in library.entries:
        e.key = k
        if k.startswith("arxiv:"):
            # arXiv-only publications are just puffed-up blog posts; don't dignify them.
            e.entry_type = "online"
        tmp2.append(e)

newlib = bibtexparser.Library(tmp2)
bibtexparser.write_file(str(REFAUTO), newlib)
With the bibliography sorted out, build a master calendar for the log: a grid of every date from 2010 through 2025, where dates that have a published entry become links.

data = pd.date_range("2010-01-01", "2025-12-31")
data = pd.DataFrame({"date": data}, index=data).assign(
    year=data.year,
    month=data.month,
    day=data.day,
)
data = data.join(data["date"].dt.isocalendar().rename(columns=lambda s: f"week_{s}"))

# Dates near a year boundary can fall in an ISO week that belongs to the neighboring year;
# clamp those to pseudo-weeks 0 and 54 so every date sorts within its own calendar year.
data["Week"] = data["week_week"].copy()
data.loc[data["year"] > data["week_year"], "Week"] = 0
data.loc[data["year"] < data["week_year"], "Week"] = 54

data
= {1: "M", 2: "T", 3: "W", 4: "R", 5: "F", 6: "S", 7: "U"} WEEKDAYS
= frozenset(docidx["published"].dt.date)
KNOWN_DATES
def format_date(dt):
if dt.date() in KNOWN_DATES:
return f'<a href="/log/{dt.year}/{dt.date()}">{dt.day}</a>'
else:
return str(dt.day)
disp = (
    data.set_index(["year", "week_day", "Week"])["date"]
    .unstack()
    .sort_index(ascending=[False, True])
)

classes = pd.DataFrame(data="", index=disp.index.copy(), columns=disp.columns.copy())

# Anywhere the month to the left is not the same as the current month, add a class
m = disp.map(lambda dt: dt.month).ffill(axis=1).bfill(axis=1)
mask = m != m.shift(1, axis=1)
mask.loc[:, 0] = False  # Ignore the first column
classes[mask] = classes[mask] + "month-change-left "

# Anywhere the month above is not the same as the current month, except for Mondays, add a class
mask = m != m.shift(1)
mask.loc[(slice(None), 1), :] = False
classes[mask] = classes[mask] + "month-change-above "

sty = disp.style
sty.index.names = ["", ""]
sty.columns.name = ""
sty.format(format_date, na_rep="")
sty.format_index(lambda x: WEEKDAYS[x], axis=0, level=1)
sty.set_td_classes(classes)
sty.set_table_attributes('class="masterlog"')
None
def write_calendar(root: Path, calhtml: str):
    outpath = root / "log" / "index.html"
    outpath.parent.mkdir(exist_ok=True, parents=True)
    # fmt: off
    cmd = [
        "pandoc",
        "--from", "html", "--to", "html5", "--standalone", "--wrap", "none",
        "--data-dir", PANDOCDATA,
        "--mathjax",
        "--metadata", "title=Log",
        "--metadata", "date=" + Timestamp.now().date().isoformat(),
        "--output", str(outpath),
        "-",
    ]
    # fmt: on
    proc = run(cmd, input=calhtml.encode("UTF-8"), check=True)
    return proc
write_calendar(OUT, sty.to_html())
“Incremental updates:” Most of the content managed by Blarg is one-to-one — one source file goes to one site path. In the barest and cheapest of nods to efficiency, Blarg checks for existence of the target output file and compares modification times; if the output exists and is newer than the input, then skip. I think this does actually save real-world time because rendering an entry involves a Pandoc subprocess, which is more time consuming than a stat call.
(NB In an earlier version the check used the output file's st_birthtime, but birth time is not updated if a file is overwritten in place, leading to a situation where the check should have skipped a file but did not.)
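A small sketch of that pitfall, using a hypothetical scratch file (this assumes a filesystem that reports st_birthtime, as macOS does):

import time

demo = Path("/tmp/blarg-birthtime-demo.txt")  # hypothetical scratch file
demo.write_text("first version")
before = demo.stat()
time.sleep(1)
demo.write_text("second version")  # overwrite in place
after = demo.stat()
assert after.st_mtime > before.st_mtime  # the modification time moves forward...
assert after.st_birthtime == before.st_birthtime  # ...but the birth time does not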
# Copy the site-wide CSS from the Pandoc templates directory to the site root. It lives in the
# Pandoc templates directory to prevent Pandoc from using its default CSS when generating HTML.
/ "templates" / "styles.css", OUT / "styles.css")
copyfile(Path(PANDOCDATA)
for _, entry in assidx.iterrows():
    outpath: Path = OUT / entry["outpath"]
    if outpath.exists() and entry["st_mtime"] <= outpath.stat().st_mtime:
        continue
    else:
        outpath.parent.mkdir(exist_ok=True, parents=True)
        copyfile(entry["path"], outpath)
def write_document_under(root: Path, doc: dict):
    outpath = root / doc["outpath"]
    outpath.parent.mkdir(exist_ok=True, parents=True)
    # fmt: off
    cmd = [
        "pandoc",
        "--from", DOCUMENT_MEDIATYPES[doc["mediatype"]],
        "--to", "html5", "--standalone", "--wrap", "none",
        "--data-dir", PANDOCDATA, "--mathjax",
        "--citeproc", "--bibliography", str(REFAUTO), "--bibliography", str(REF),
        "--csl", "chicago-fullnote-bibliography-short-title-subsequent.csl",
        "--filter", "blargify.py",
        "--lua-filter", "diagram.lua",
        "--extract-media=.",
        "--metadata", f"title={doc['title']}",
        "--metadata", f"date={str(doc['published'].date())}",
        # "--metadata", f"editlink={doc['editlink']}",
        "--output", str(outpath.name),
        str(doc["path"]),  # Use the document's own source path, not a leftover loop variable
    ]
    # fmt: on
    with chdir(outpath.parent):
        proc = run(cmd, check=True)
    return proc
for _, entry in tqdm(docidx.iterrows(), total=len(docidx)):
    outpath: Path = OUT / entry["outpath"]
    if outpath.exists() and (entry["st_mtime"] < outpath.stat().st_mtime):
        continue
    else:
        write_document_under(OUT, entry)
= f"""\
FEED_HEADER <?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
<id>https://danielgrady.net</id>
<title>Daniel Grady’s web log</title>
<subtitle>∇⋅∇𝒴</subtitle>
<author>
<name>Daniel Grady</name>
<uri>https://danielgrady.net</uri>
</author>
<link href="https://danielgrady.net/atom.xml" rel="self"/>
<link href="https://danielgrady.net" rel="alternate"/>
<logo>https://danielgrady.net/favicon.ico</logo>
<updated>{Timestamp.now(tz='US/Pacific').isoformat(timespec='seconds')}</updated>
"""
TODO Add the actual content of the entries to the feed.
= """
ENTRY_TEMPLATE <entry>
<id>{uri}</id>
<title>{title}</title>
<link rel="alternate" href="{uri}"/>
<published>{published}</published>
<updated>{updated}</updated>
</entry>
"""
feeditems = docidx[lambda df: ~df["sitepath"].eq(Path("."))]
feeditems = feeditems.sort_values("published", ascending=False)

feed = FEED_HEADER

for _, entry in feeditems.iterrows():
    tmp = ENTRY_TEMPLATE.format(
        uri=f"{SITEURL}/{entry['sitepath']}",
        title=entry["title"],
        published=entry["published"].isoformat(timespec="seconds"),
        updated=entry["published"].isoformat(timespec="seconds"),
    )
    feed += tmp

feed += "</feed>"
with open(OUT / "atom.xml", "w", encoding="UTF-8") as f:
f.write(feed)
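Optionally, sanity-check the generated feed. This is not part of the build, and it assumes the third-party feedparser package is installed:

import feedparser

parsed = feedparser.parse(str(OUT / "atom.xml"))
assert not parsed.bozo, parsed.get("bozo_exception")
len(parsed.entries)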