Lindenii Project Forge
Login

scdoc

scdoc mirror for performance testing
Commit info
ID
afeda241f3f9b2c27e461f32d9c2a704ab82ef61
Author
Eli Schwartz <eschwartz93@gmail.com>
Author date
Wed, 08 Mar 2023 12:14:00 -0500
Committer
Drew DeVault <sir@cmpwn.com>
Committer date
Mon, 13 Mar 2023 12:53:39 +0100
Actions
get rid of some unused includes

In main.c, unistd.h is a fairly big stick to use just to get stdint
types, so downgrade that header.
#ifndef _SCDOC_PARSER_H
#define _SCDOC_PARSER_H
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>

struct parser {
	FILE *input, *output;
	int line, col;
	int qhead;
	uint32_t queue[32];
	uint32_t flags;
	const char *str;
	int fmt_line, fmt_col;
};

enum formatting {
	FORMAT_BOLD = 1,
	FORMAT_UNDERLINE = 2,
	FORMAT_LAST = 4,
};

void parser_fatal(struct parser *parser, const char *err);
uint32_t parser_getch(struct parser *parser);
void parser_pushch(struct parser *parser, uint32_t ch);
void parser_pushstr(struct parser *parser, const char *str);
int roff_macro(struct parser *p, char *cmd, ...);
void *xcalloc(size_t n, size_t s);
void *xrealloc(void *p, size_t s);

#endif
#define _XOPEN_SOURCE 600
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include "str.h"
#include "unicode.h"
#include "util.h"

static struct str *parse_section(struct parser *p) {
	struct str *section = str_create();
	uint32_t ch;
	char *subsection;
	while ((ch = parser_getch(p)) != UTF8_INVALID) {
		if (ch < 0x80 && isalnum((unsigned char)ch)) {
			int ret = str_append_ch(section, ch);
			assert(ret != -1);
		} else if (ch == ')') {
			if (section->len == 0) {
				break;
			}
			int sec = strtol(section->str, &subsection, 10);
			if (section->str == subsection) {
				parser_fatal(p, "Expected section digit");
				break;
			}
			if (sec < 0 || sec > 9) {
				parser_fatal(p, "Expected section between 0 and 9");
				break;
			}
			return section;
		} else {
			parser_fatal(p, "Expected alphanumerical character or )");
			break;
		}
	};
	parser_fatal(p, "Expected manual section");
	return NULL;
}

static struct str *parse_extra(struct parser *p) {
	struct str *extra = str_create();
	int ret = str_append_ch(extra, '"');
	assert(ret != -1);
	uint32_t ch;
	while ((ch = parser_getch(p)) != UTF8_INVALID) {
		if (ch == '"') {
			ret = str_append_ch(extra, ch);
			assert(ret != -1);
			return extra;
		} else if (ch == '\n') {
			parser_fatal(p, "Unclosed extra preamble field");
			break;
		} else {
			ret = str_append_ch(extra, ch);
			assert(ret != -1);
		}
	}
	str_free(extra);
	return NULL;
}

static void parse_preamble(struct parser *p) {
	struct str *name = str_create();
	int ex = 0;
	struct str *extras[2] = { NULL };
	struct str *section = NULL;
	uint32_t ch;
	time_t date_time;
	char date[256];
	char *source_date_epoch = getenv("SOURCE_DATE_EPOCH");
	if (source_date_epoch != NULL) {
		unsigned long long epoch;
		char *endptr;
		errno = 0;
		epoch = strtoull(source_date_epoch, &endptr, 10);
		if ((errno == ERANGE && (epoch == ULLONG_MAX || epoch == 0))
				|| (errno != 0 && epoch == 0)) {
			fprintf(stderr, "$SOURCE_DATE_EPOCH: strtoull: %s\n",
					strerror(errno));
			exit(EXIT_FAILURE);
		}
		if (endptr == source_date_epoch) {
			fprintf(stderr, "$SOURCE_DATE_EPOCH: No digits were found: %s\n",
					endptr);
			exit(EXIT_FAILURE);
		}
		if (*endptr != '\0') {
			fprintf(stderr, "$SOURCE_DATE_EPOCH: Trailing garbage: %s\n",
					endptr);
			exit(EXIT_FAILURE);
		}
		if (epoch > ULONG_MAX) {
			fprintf(stderr, "$SOURCE_DATE_EPOCH: value must be smaller than or "
					"equal to %lu but was found to be: %llu \n",
					ULONG_MAX, epoch);
			exit(EXIT_FAILURE);
		}
		date_time = epoch;
	} else {
		date_time = time(NULL);
	}
	struct tm *date_tm = gmtime(&date_time);
	strftime(date, sizeof(date), "%F", date_tm);
	while ((ch = parser_getch(p)) != UTF8_INVALID) {
		if ((ch < 0x80 && isalnum((unsigned char)ch))
				|| ch == '_' || ch == '-' || ch == '.') {
			int ret = str_append_ch(name, ch);
			assert(ret != -1);
		} else if (ch == '(') {
			section = parse_section(p);
		} else if (ch == '"') {
			if (ex == 2) {
				parser_fatal(p, "Too many extra preamble fields");
			}
			extras[ex++] = parse_extra(p);
		} else if (ch == '\n') {
			if (name->len == 0) {
				parser_fatal(p, "Expected preamble");
			}
			if (section == NULL) {
				parser_fatal(p, "Expected manual section");
			}
			char *ex2 = extras[0] != NULL ? extras[0]->str : NULL;
			char *ex3 = extras[1] != NULL ? extras[1]->str : NULL;
			fprintf(p->output, ".TH \"%s\" \"%s\" \"%s\"", name->str, section->str, date);
			/* ex2 and ex3 are already double-quoted */
			if (ex2) {
				fprintf(p->output, " %s", ex2);
			}
			if (ex3) {
				fprintf(p->output, " %s", ex3);
			}
			fprintf(p->output, "\n");
			break;
		} else if (section == NULL) {
			parser_fatal(p, "Name characters must be A-Z, a-z, 0-9, `-`, `_`, or `.`");
		}
	}
	str_free(name);
	for (int i = 0; i < 2; ++i) {
		if (extras[i] != NULL) {
			str_free(extras[i]);
		}
	}
}

static void parse_format(struct parser *p, enum formatting fmt) {
	char formats[FORMAT_LAST] = {
		[FORMAT_BOLD] = 'B',
		[FORMAT_UNDERLINE] = 'I',
	};
	char error[512];
	if (p->flags) {
		if ((p->flags & ~fmt)) {
			snprintf(error, sizeof(error), "Cannot nest inline formatting "
						"(began with %c at %d:%d)",
					p->flags == FORMAT_BOLD ? '*' : '_',
					p->fmt_line, p->fmt_col);
			parser_fatal(p, error);
		}
		fprintf(p->output, "\\fR");
	} else {
		fprintf(p->output, "\\f%c", formats[fmt]);
		p->fmt_line = p->line;
		p->fmt_col = p->col;
	}
	p->flags ^= fmt;
}

static bool parse_linebreak(struct parser *p) {
	uint32_t plus = parser_getch(p);
	if (plus != '+') {
		fprintf(p->output, "+");
		parser_pushch(p, plus);
		return false;
	}
	uint32_t lf = parser_getch(p);
	if (lf != '\n') {
		fprintf(p->output, "+");
		parser_pushch(p, lf);
		parser_pushch(p, plus);
		return false;
	}
	uint32_t ch = parser_getch(p);
	if (ch == '\n') {
		parser_fatal(
				p, "Explicit line breaks cannot be followed by a blank line");
	}
	parser_pushch(p, ch);
	fprintf(p->output, "\n.br\n");
	return true;
}

static void parse_text(struct parser *p) {
	uint32_t ch, next, last = ' ';
	int i = 0;
	while ((ch = parser_getch(p)) != UTF8_INVALID) {
		switch (ch) {
		case '\\':
			ch = parser_getch(p);
			if (ch == UTF8_INVALID) {
				parser_fatal(p, "Unexpected EOF");
			} else if (ch == '\\') {
				fprintf(p->output, "\\\\");
			} else {
				utf8_fputch(p->output, ch);
			}
			break;
		case '*':
			parse_format(p, FORMAT_BOLD);
			break;
		case '_':
			next = parser_getch(p);
			if (!isalnum((unsigned char)last) || (
						(p->flags & FORMAT_UNDERLINE) &&
						!isalnum((unsigned char)next))) {
				parse_format(p, FORMAT_UNDERLINE);
			} else {
				utf8_fputch(p->output, ch);
			}
			if (next == UTF8_INVALID) {
				return;
			}
			parser_pushch(p, next);
			break;
		case '+':
			if (parse_linebreak(p)) {
				last = '\n';
			}
			break;
		case '\n':
			utf8_fputch(p->output, ch);
			return;
		case '.':
			if (!i) {
				// Escape . if it's the first character
				fprintf(p->output, "\\&.\\&");
				break;
			}
			/* fallthrough */
		case '\'':
			if (!i) {
				// Escape ' if it's the first character
				fprintf(p->output, "\\&'\\&");
				break;
			}
			/* fallthrough */
		case '!':
		case '?':
			last = ch;
			utf8_fputch(p->output, ch);
			// Suppress sentence spacing
			fprintf(p->output, "\\&");
			break;
		default:
			last = ch;
			utf8_fputch(p->output, ch);
			break;
		}
		++i;
	}
}

static void parse_heading(struct parser *p) {
	uint32_t ch;
	int level = 1;
	while ((ch = parser_getch(p)) != UTF8_INVALID) {
		if (ch == '#') {
			++level;
		} else if (ch == ' ') {
			break;
		} else {
			parser_fatal(p, "Invalid start of heading (probably needs a space)");
		}
	}
	switch (level) {
	case 1:
		fprintf(p->output, ".SH ");
		break;
	case 2:
		fprintf(p->output, ".SS ");
		break;
	default:
		parser_fatal(p, "Only headings up to two levels deep are permitted");
		break;
	}
	while ((ch = parser_getch(p)) != UTF8_INVALID) {
		utf8_fputch(p->output, ch);
		if (ch == '\n') {
			break;
		}
	}
}

static int parse_indent(struct parser *p, int *indent, bool write) {
	int i = 0;
	uint32_t ch;
	while ((ch = parser_getch(p)) == '\t') {
		++i;
	}
	parser_pushch(p, ch);
	if ((ch == '\n' || ch == UTF8_INVALID) && *indent != 0) {
		// Don't change indent when we encounter empty lines or EOF
		return *indent;
	}
	if (write) {
		if ((i - *indent) > 1) {
			parser_fatal(p, "Indented by an amount greater than 1");
		} else if (i < *indent) {
			for (int j = *indent; i < j; --j) {
				roff_macro(p, "RE", NULL);
			}
		} else if (i == *indent + 1) {
			fprintf(p->output, ".RS 4\n");
		}
	}
	*indent = i;
	return i;
}

static void list_header(struct parser *p, int *num) {
	if (*num == -1) {
		fprintf(p->output, ".IP %s 4\n", "\\(bu");
	} else {
		fprintf(p->output, ".IP %d. 4\n", *num);
		*num = *num + 1;
	}
}

static void parse_list(struct parser *p, int *indent, int num) {
	uint32_t ch;
	if ((ch = parser_getch(p)) != ' ') {
		parser_fatal(p, "Expected space before start of list entry");
	}
	fprintf(p->output, ".PD 0\n");
	list_header(p, &num);
	parse_text(p);
	do {
		parse_indent(p, indent, true);
		if ((ch = parser_getch(p)) == UTF8_INVALID) {
			break;
		}
		switch (ch) {
		case ' ':
			if ((ch = parser_getch(p)) != ' ') {
				parser_fatal(p, "Expected two spaces for list entry continuation");
			}
			parse_text(p);
			break;
		case '-':
		case '.':
			if ((ch = parser_getch(p)) != ' ') {
				parser_fatal(p, "Expected space before start of list entry");
			}
			list_header(p, &num);
			parse_text(p);
			break;
		default:
			roff_macro(p, "PD", NULL);
			parser_pushch(p, ch);
			return;
		}
	} while (ch != UTF8_INVALID);
}

static void parse_literal(struct parser *p, int *indent) {
	uint32_t ch;
	if ((ch = parser_getch(p)) != '`' ||
		(ch = parser_getch(p)) != '`' ||
		(ch = parser_getch(p)) != '\n') {
		parser_fatal(p, "Expected ``` and a newline to begin literal block");
	}
	int stops = 0;
	roff_macro(p, "nf", NULL);
	fprintf(p->output, ".RS 4\n");
	bool check_indent = true;
	do {
		if (check_indent) {
			int _indent = *indent;
			parse_indent(p, &_indent, false);
			if (_indent < *indent) {
				parser_fatal(p, "Cannot deindent in literal block");
			}
			while (_indent > *indent) {
				--_indent;
				fprintf(p->output, "\t");
			}
			check_indent = false;
		}
		if ((ch = parser_getch(p)) == UTF8_INVALID) {
			break;
		}
		if (ch == '`') {
			if (++stops == 3) {
				if ((ch = parser_getch(p)) != '\n') {
					parser_fatal(p, "Expected literal block to end with newline");
				}
				roff_macro(p, "fi", NULL);
				roff_macro(p, "RE", NULL);
				return;
			}
		} else {
			while (stops != 0) {
				fputc('`', p->output);
				--stops;
			}
			switch (ch) {
			case '.':
				fprintf(p->output, "\\&.");
				break;
			case '\'':
				fprintf(p->output, "\\&'");
				break;
			case '\\':
				ch = parser_getch(p);
				if (ch == UTF8_INVALID) {
					parser_fatal(p, "Unexpected EOF");
				} else if (ch == '\\') {
					fprintf(p->output, "\\\\");
				} else {
					utf8_fputch(p->output, ch);
				}
				break;
			case '\n':
				check_indent = true;
				/* fallthrough */
			default:
				utf8_fputch(p->output, ch);
				break;
			}
		}
	} while (ch != UTF8_INVALID);
}

enum table_align {
	ALIGN_LEFT,
	ALIGN_CENTER,
	ALIGN_RIGHT,
	ALIGN_LEFT_EXPAND,
	ALIGN_CENTER_EXPAND,
	ALIGN_RIGHT_EXPAND,
};

struct table_row {
	struct table_cell *cell;
	struct table_row *next;
};

struct table_cell {
	enum table_align align;
	struct str *contents;
	struct table_cell *next;
};

static void parse_table(struct parser *p, uint32_t style) {
	struct table_row *table = NULL;
	struct table_row *currow = NULL, *prevrow = NULL;
	struct table_cell *curcell = NULL;
	int column = 0;
	int numcolumns = -1;
	uint32_t ch;
	parser_pushch(p, '|');

	do {
		if ((ch = parser_getch(p)) == UTF8_INVALID) {
			break;
		}
		switch (ch) {
		case '\n':
			goto commit_table;
		case '|':
			prevrow = currow;
			currow = xcalloc(1, sizeof(struct table_row));
			if (prevrow) {
				if (column != numcolumns && numcolumns != -1) {
					parser_fatal(p, "Each row must have the "
							"same number of columns");
				}
				numcolumns = column;
				column = 0;
				prevrow->next = currow;
			}
			curcell = xcalloc(1, sizeof(struct table_cell));
			currow->cell = curcell;
			if (!table) {
				table = currow;
			}
			break;
		case ':':
			if (!currow) {
				parser_fatal(p, "Cannot start a column without "
						"starting a row first");
			} else {
				struct table_cell *prev = curcell;
				curcell = xcalloc(1, sizeof(struct table_cell));
				if (prev) {
					prev->next = curcell;
				}
				++column;
			}
			break;
		case ' ':
			goto continue_cell;
		default:
			parser_fatal(p, "Expected either '|' or ':'");
			break;
		}
		if ((ch = parser_getch(p)) == UTF8_INVALID) {
			break;
		}
		switch (ch) {
		case '[':
			curcell->align = ALIGN_LEFT;
			break;
		case '-':
			curcell->align = ALIGN_CENTER;
			break;
		case ']':
			curcell->align = ALIGN_RIGHT;
			break;
		case '<':
			curcell->align = ALIGN_LEFT_EXPAND;
			break;
		case '=':
			curcell->align = ALIGN_CENTER_EXPAND;
			break;
		case '>':
			curcell->align = ALIGN_RIGHT_EXPAND;
			break;
		case ' ':
			if (prevrow) {
				struct table_cell *pcell = prevrow->cell;
				for (int i = 0; i <= column && pcell; ++i, pcell = pcell->next) {
					if (i == column) {
						curcell->align = pcell->align;
						break;
					}
				}
			} else {
				parser_fatal(p, "No previous row to infer alignment from");
			}
			break;
		default:
			parser_fatal(p, "Expected one of '[', '-', ']', or ' '");
			break;
		}
		curcell->contents = str_create();
continue_cell:
		switch (ch = parser_getch(p)) {
		case ' ':
			// Read out remainder of the text
			while ((ch = parser_getch(p)) != UTF8_INVALID) {
				switch (ch) {
				case '\n':
					goto commit_cell;
				default:;
					int ret = str_append_ch(curcell->contents, ch);
					assert(ret != -1);
					break;
				}
			}
			break;
		case '\n':
			goto commit_cell;
		default:
			parser_fatal(p, "Expected ' ' or a newline");
			break;
		}
commit_cell:
		if (strstr(curcell->contents->str, "T{")
				|| strstr(curcell->contents->str, "T}")) {
			parser_fatal(p, "Cells cannot contain T{ or T} "
					"due to roff limitations");
		}
	} while (ch != UTF8_INVALID);
commit_table:

	if (ch == UTF8_INVALID) {
		return;
	}

	roff_macro(p, "TS", NULL);

	switch (style) {
	case '[':
		fprintf(p->output, "allbox;");
		break;
	case ']':
		fprintf(p->output, "box;");
		break;
	}

	// Print alignments first
	currow = table;
	while (currow) {
		curcell = currow->cell;
		while (curcell) {
			char *align = "";
			switch (curcell->align) {
			case ALIGN_LEFT:
				align = "l";
				break;
			case ALIGN_CENTER:
				align = "c";
				break;
			case ALIGN_RIGHT:
				align = "r";
				break;
			case ALIGN_LEFT_EXPAND:
				align = "lx";
				break;
			case ALIGN_CENTER_EXPAND:
				align = "cx";
				break;
			case ALIGN_RIGHT_EXPAND:
				align = "rx";
				break;
			}
			fprintf(p->output, "%s%s", align, curcell->next ? " " : "");
			curcell = curcell->next;
		}
		fprintf(p->output, "%s\n", currow->next ? "" : ".");
		currow = currow->next;
	}

	// Then contents
	currow = table;
	while (currow) {
		curcell = currow->cell;
		fprintf(p->output, "T{\n");
		while (curcell) {
			parser_pushstr(p, curcell->contents->str);
			parse_text(p);
			if (curcell->next) {
				fprintf(p->output, "\nT}\tT{\n");
			} else {
				fprintf(p->output, "\nT}");
			}
			struct table_cell *prev = curcell;
			curcell = curcell->next;
			str_free(prev->contents);
			free(prev);
		}
		fprintf(p->output, "\n");
		struct table_row *prev = currow;
		currow = currow->next;
		free(prev);
	}

	roff_macro(p, "TE", NULL);
	fprintf(p->output, ".sp 1\n");
}

static void parse_document(struct parser *p) {
	uint32_t ch;
	int indent = 0;
	do {
		parse_indent(p, &indent, true);
		if ((ch = parser_getch(p)) == UTF8_INVALID) {
			break;
		}
		switch (ch) {
		case ';':
			if ((ch = parser_getch(p)) != ' ') {
				parser_fatal(p, "Expected space after ; to begin comment");
			}
			do {
				ch = parser_getch(p);
			} while (ch != UTF8_INVALID && ch != '\n');
			break;
		case '#':
			if (indent != 0) {
				parser_pushch(p, ch);
				parse_text(p);
				break;
			}
			parse_heading(p);
			break;
		case '-':
			parse_list(p, &indent, -1);
			break;
		case '.':
			if ((ch = parser_getch(p)) == ' ') {
				parser_pushch(p, ch);
				parse_list(p, &indent, 1);
			} else {
				parser_pushch(p, ch);
				parse_text(p);
			}
			break;
		case '`':
			parse_literal(p, &indent);
			break;
		case '[':
		case '|':
		case ']':
			if (indent != 0) {
				parser_fatal(p, "Tables cannot be indented");
			}
			parse_table(p, ch);
			break;
		case ' ':
			parser_fatal(p, "Tabs are required for indentation");
			break;
		case '\n':
			if (p->flags) {
				char error[512];
				snprintf(error, sizeof(error), "Expected %c before starting "
						"new paragraph (began with %c at %d:%d)",
						p->flags == FORMAT_BOLD ? '*' : '_',
						p->flags == FORMAT_BOLD ? '*' : '_',
						p->fmt_line, p->fmt_col);
				parser_fatal(p, error);
			}
			roff_macro(p, "PP", NULL);
			break;
		default:
			parser_pushch(p, ch);
			parse_text(p);
			break;
		}
	} while (ch != UTF8_INVALID);
}

static void output_scdoc_preamble(struct parser *p) {
	fprintf(p->output, ".\\\" Generated by scdoc " VERSION "\n");
	fprintf(p->output, ".\\\" Complete documentation for this program is not "
			"available as a GNU info page\n");
	// Fix weird quotation marks
	// http://bugs.debian.org/507673
	// http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html
	fprintf(p->output, ".ie \\n(.g .ds Aq \\(aq\n");
	fprintf(p->output, ".el       .ds Aq '\n");
	// Disable hyphenation:
	roff_macro(p, "nh", NULL);
	// Disable justification:
	roff_macro(p, "ad l", NULL);
	fprintf(p->output, ".\\\" Begin generated content:\n");
}

int main(int argc, char **argv) {
	if (argc == 2 && strcmp(argv[1], "-v") == 0) {
		printf("scdoc " VERSION "\n");
		return 0;
	} else if (argc > 1) {
		fprintf(stderr, "Usage: scdoc < input.scd > output.roff\n");
		return 1;
	}
	struct parser p = {
		.input = stdin,
		.output = stdout,
		.line = 1,
		.col = 1
	};
	output_scdoc_preamble(&p);
	parse_preamble(&p);
	parse_document(&p);
	return 0;
}
#include <stdint.h>
#include <stddef.h>
#include "unicode.h"

uint8_t masks[] = {
	0x7F,
	0x1F,
	0x0F,
	0x07,
	0x03,
	0x01
};

uint32_t utf8_decode(const char **char_str) {
	uint8_t **s = (uint8_t **)char_str;

	uint32_t cp = 0;
	if (**s < 128) {
		// shortcut
		cp = **s;
		++*s;
		return cp;
	}
	int size = utf8_size((char *)*s);
	if (size == -1) {
		++*s;
		return UTF8_INVALID;
	}
	uint8_t mask = masks[size - 1];
	cp = **s & mask;
	++*s;
	while (--size) {
		cp <<= 6;
		cp |= **s & 0x3f;
		++*s;
	}
	return cp;
}