From a4193949ec755b3848a803a9b02364eeddbb1455 Mon Sep 17 00:00:00 2001
From: Drew DeVault <sir@cmpwn.com>
Date: Sat, 09 Dec 2017 23:18:57 -0500
Subject: [PATCH] Initial commit

---
 .gitignore        |  1 +
 include/string.h  | 17 +++++++++++++++++
 include/unicode.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 include/util.h    | 16 ++++++++++++++++
 meson.build       | 29 +++++++++++++++++++++++++++++
 scdoc.5.scd       | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.c        | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/string.c      | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/utf8_chsize.c | 14 ++++++++++++++
 src/utf8_decode.c | 38 ++++++++++++++++++++++++++++++++++++++
 src/utf8_encode.c | 30 ++++++++++++++++++++++++++++++
 src/utf8_fgetch.c | 21 +++++++++++++++++++++
 src/utf8_fputch.c | 10 ++++++++++
 src/utf8_size.c   | 27 +++++++++++++++++++++++++++
 src/util.c        | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..378eac25d311703f3f2cd456d8036da525cd0366
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+build
diff --git a/include/string.h b/include/string.h
new file mode 100644
index 0000000000000000000000000000000000000000..45d15ec27fa19e498d15c133399c5a9a286c59f7
--- /dev/null
+++ b/include/string.h
@@ -0,0 +1,34 @@
+#ifndef SCDOC_STRING_H
+#define SCDOC_STRING_H
+#include <stddef.h>
+#include <stdint.h>
+
+/**
+ * Growable, NUL-terminated buffer of UTF-8 text.
+ *
+ * str stays NULL until the first character is appended. len is the number of
+ * bytes in use, not counting the terminator; size is the allocated capacity.
+ */
+struct str {
+	char *str;
+	size_t len, size;
+};
+
+typedef struct str str_t;
+
+/** Allocates an empty string. Returns NULL if the allocation fails. */
+str_t *str_create(void);
+
+/** Frees the string and its buffer. Does nothing if str is NULL. */
+void str_free(str_t *str);
+
+/** Truncates the string to zero length, keeping its buffer. */
+void str_reset(str_t *str);
+
+/**
+ * Appends ch, encoded as UTF-8, to the string. Returns the number of bytes
+ * appended, or -1 on failure.
+ */
+int str_append_ch(str_t *str, uint32_t ch);
+
+#endif
diff --git a/include/unicode.h b/include/unicode.h
new file mode 100644
index 0000000000000000000000000000000000000000..96cbd7a1b5dcda964a2b78fb80bbc52483528ea6
--- /dev/null
+++ b/include/unicode.h
@@ -0,0 +1,49 @@
+#ifndef SCDOC_UNICODE_H
+#define SCDOC_UNICODE_H
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+// The original UTF-8 design allowed sequences of up to 6 bytes, but every
+// Unicode code point (the largest is U+10FFFF) fits in 4.
+#define UTF8_MAX_SIZE 4
+
+// Returned by utf8_decode and utf8_fgetch in place of a code point when the
+// input is malformed or, for utf8_fgetch, exhausted.
+#define UTF8_INVALID 0x80
+
+/**
+ * Decodes the UTF-8 character at *str and advances *str past it. If *str does
+ * not begin a valid sequence, skips one byte and returns UTF8_INVALID.
+ */
+uint32_t utf8_decode(const char **str);
+
+/**
+ * Encodes ch as UTF-8 into str and returns the number of bytes written, which
+ * is at most UTF8_MAX_SIZE. No terminating NUL is written.
+ */
+size_t utf8_encode(char *str, uint32_t ch);
+
+/**
+ * Returns the length in bytes of the UTF-8 sequence introduced by the first
+ * byte of str, or -1 if that byte cannot begin a sequence.
+ */
+int utf8_size(const char *str);
+
+/**
+ * Returns the number of bytes needed to encode ch as UTF-8.
+ */
+size_t utf8_chsize(uint32_t ch);
+
+/**
+ * Reads the next UTF-8 character from f and returns its code point, or
+ * UTF8_INVALID at end of file or if the file ends in mid-sequence.
+ */
+uint32_t utf8_fgetch(FILE *f);
+
+/**
+ * Encodes ch as UTF-8, writes it to f and returns the number of bytes written.
+ */
+size_t utf8_fputch(FILE *f, uint32_t ch);
+
+#endif
diff --git a/include/util.h b/include/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..c68405c4e0e44770f5e0f87a284a9c5f571a3266
--- /dev/null
+++ b/include/util.h
@@ -0,0 +1,33 @@
+#ifndef SCDOC_UTIL_H
+#define SCDOC_UTIL_H
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/** Parser state: the two streams plus the current input position. */
+struct parser {
+	FILE *input, *output;
+	int line, col;
+};
+
+/**
+ * Prints err to stderr along with the current line and column, closes both
+ * streams and exits with status 1. Does not return.
+ */
+void parser_fatal(struct parser *parser, const char *err);
+
+/**
+ * Reads the next character from the input and updates line and col. Returns
+ * whatever utf8_fgetch returns, including UTF8_INVALID.
+ */
+uint32_t parser_getch(struct parser *parser);
+
+/**
+ * Writes the roff macro ".cmd" to the output, followed by each argument in
+ * double quotes (embedded quotes are backslash-escaped) and a newline. The
+ * variable arguments are strings and must end with a NULL pointer. Returns
+ * a count of the bytes written.
+ */
+int roff_macro(struct parser *p, char *cmd, ...);
+
+#endif
diff --git a/meson.build b/meson.build
new file mode 100644
index 0000000000000000000000000000000000000000..d2fab6269040c951bd8b98f5c5184a8505fb11db
--- /dev/null
+++ b/meson.build
@@ -0,0 +1,29 @@
+# TODO: Just use a makefile
+project(
+	'scdoc',
+	'c',
+	license: 'MIT',
+	meson_version: '>=0.43.0',
+	default_options: [
+		'c_std=c99',
+		'warning_level=2',
+		'werror=true',
+	],
+)
+
+add_project_arguments('-Wno-unused-parameter', language: 'c')
+
+executable(
+	'scdoc', [
+		'src/main.c',
+		'src/string.c',
+		'src/utf8_chsize.c',
+		'src/utf8_decode.c',
+		'src/utf8_encode.c',
+		'src/utf8_fgetch.c',
+		'src/utf8_fputch.c',
+		'src/utf8_size.c',
+		'src/util.c',
+	],
+	include_directories: include_directories('include')
+)
diff --git a/scdoc.5.scd b/scdoc.5.scd
new file mode 100644
index 0000000000000000000000000000000000000000..3501b78ae43cff40ad29e31f8462381c6b8400c7
--- /dev/null
+++ b/scdoc.5.scd
@@ -0,0 +1,80 @@
+scdoc(5)
+
+# NAME
+
+scdoc - syntax description for scdoc markup language
+
+# DESCRIPTION
+
+scdoc is a tool designed to make the process of writing man pages more
+friendly. It converts scdoc files into roff macros, which can then be converted
+to man pages or a number of other formats. The syntax is inspired by, but not
+directly taken from, markdown. Input files *must* use the UTF-8 encoding.
+
+# PREAMBLE
+
+Each scdoc file must begin with the following preamble:
+
+	*name*(_section_)
+
+The *name* is the name of the man page you are writing, and _section_ is the
+section you're writing for (see *man*(1) for information on manual sections).
+
+# SECTION HEADERS
+
+Each section of your man page should begin with something similar to the
+following:
+
+	# HEADER NAME
+
+Subsection headers are also understood - use two hashes. Each header must have
+an empty line on either side.
+
+# PARAGRAPHS
+
+Begin a new paragraph with an empty line.
+
+# FORMATTING
+
+Text can be made *bold* or _underlined_ with asterisks and underscores: \*bold\*
+or \_underlined\_.
+
+# INDENTATION
+
+You may indent lines with tab characters ("\t") to indent them by 4 spaces in
+the output. Indented lines may not contain headers.
+
+# LISTS
+
+You may start bulleted lists with dashes, like so:
+
+```
+- Item 1
+- Item 2
+- Item 3
+```
+
+You may also use numbered lists like so:
+
+```
+1. Item 1
+2. Item 2
+3. Item 3
+```
+
+# LITERAL TEXT
+
+You may turn off scdoc formatting and output literal text with escape codes and
+literal blocks. Inserting a \\ into your source will cause the subsequent symbol
+to be treated as a literal and copied directly to the output. You may also make
+blocks of literal syntax like so:
+
+```
+\`\`\`
+_This formatting_ will *not* be interpreted by scdoc.
+\`\`\`
+```
+
+These blocks will be indented one level. Note that literal text is shown
+literally in the man viewer - that is, it's not a means for inserting your own
+roff macros into the output.
diff --git a/src/main.c b/src/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..37c64acc8111a1287c1ab9a46e374747a600fdb0
--- /dev/null
+++ b/src/main.c
@@ -0,0 +1,140 @@
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#include "string.h"
+#include "unicode.h"
+#include "util.h"
+
+// Today's date, formatted by main and passed to the .TH macro.
+static char date[256];
+
+// The <ctype.h> functions are undefined for arguments that do not fit in an
+// unsigned char, and a decoded code point can be far larger, so only ASCII
+// values are ever handed to them.
+static int is_ascii_digit(uint32_t ch) {
+	return ch < 0x80 && isdigit((int)ch);
+}
+
+static int is_ascii_alnum(uint32_t ch) {
+	return ch < 0x80 && isalnum((int)ch);
+}
+
+// Allocates an empty string, aborting the parse if memory is exhausted.
+static str_t *create_str(struct parser *p) {
+	str_t *str = str_create();
+	if (!str) {
+		parser_fatal(p, "Out of memory");
+	}
+	return str;
+}
+
+// Appends ch to str, aborting the parse on failure. The call is kept out of
+// assert() so that it still happens when NDEBUG is defined.
+static void append_ch(struct parser *p, str_t *str, uint32_t ch) {
+	if (str_append_ch(str, ch) == -1) {
+		parser_fatal(p, "Out of memory");
+	}
+}
+
+// Parses the rest of "(section)" after the opening parenthesis has been read
+// and returns the section number, 1 through 9. Anything else is reported
+// through parser_fatal, which exits.
+static int parse_section(struct parser *p) {
+	str_t *section = create_str(p);
+	uint32_t ch;
+	while ((ch = parser_getch(p)) != UTF8_INVALID) {
+		if (is_ascii_digit(ch)) {
+			append_ch(p, section, ch);
+		} else if (ch == ')') {
+			if (!section->str) {
+				// Nothing between the parentheses.
+				break;
+			}
+			// Keep strtol's full range so that an absurdly long run of digits
+			// cannot wrap around into 1-9 when narrowed.
+			long sec = strtol(section->str, NULL, 10);
+			if (sec < 1 || sec > 9) {
+				parser_fatal(p, "Expected section between 1 and 9");
+				break;
+			}
+			str_free(section);
+			return (int)sec;
+		} else {
+			parser_fatal(p, "Expected digit or )");
+			break;
+		}
+	}
+	parser_fatal(p, "Expected manual section");
+	return -1;
+}
+
+// Parses the "name(section)" line that opens every document and emits the
+// matching .TH macro once the end of the line is reached. Characters other
+// than ASCII letters, digits, '(' and newline are skipped.
+static void parse_preamble(struct parser *p) {
+	str_t *name = create_str(p);
+	int section = -1;
+	uint32_t ch;
+	do {
+		ch = parser_getch(p);
+		if (is_ascii_alnum(ch)) {
+			append_ch(p, name, ch);
+		} else if (ch == '(') {
+			section = parse_section(p);
+		} else if (ch == '\n') {
+			if (name->len == 0) {
+				parser_fatal(p, "Expected preamble");
+			}
+			if (section == -1) {
+				parser_fatal(p, "Expected manual section");
+			}
+			char sec[2] = { '0' + section, 0 };
+			roff_macro(p, "TH", name->str, sec, date, NULL);
+			break;
+		}
+	} while (ch != UTF8_INVALID);
+	str_free(name);
+}
+
+// Writes the fixed roff prologue that precedes the generated content.
+static void output_preamble(struct parser *p) {
+	// TODO: Add version here
+	fprintf(p->output, ".\\\" Generated by scdoc\n");
+	fprintf(p->output, ".\\\" Fix weird quotation marks:\n");
+	fprintf(p->output, ".\\\" http://bugs.debian.org/507673\n");
+	fprintf(p->output, ".\\\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html\n");
+	fprintf(p->output, ".ie \\n(.g .ds Aq \\(aq\n");
+	fprintf(p->output, ".el       .ds Aq '\n");
+	fprintf(p->output, ".\\\" Disable hyphenation:\n");
+	roff_macro(p, "nh", NULL);
+	fprintf(p->output, ".\\\" Generated content:\n");
+}
+
+// Converts the scdoc document on stdin to roff on stdout. Takes no arguments.
+int main(int argc, char **argv) {
+	if (argc > 1) {
+		fprintf(stderr, "Usage: scdoc < input.scd > output.roff\n");
+		return 1;
+	}
+	time_t now;
+	time(&now);
+	struct tm *now_tm = localtime(&now);
+	// localtime may return NULL, and strftime returns 0 if the result does
+	// not fit; either way there is no date to put in the header.
+	if (!now_tm || strftime(date, sizeof(date), "%F", now_tm) == 0) {
+		fprintf(stderr, "Unable to determine the current date\n");
+		return 1;
+	}
+	struct parser p = {
+		.input = stdin,
+		.output = stdout,
+		.line = 1,
+		.col = 1
+	};
+	output_preamble(&p);
+	parse_preamble(&p);
+	return 0;
+}
diff --git a/src/string.c b/src/string.c
new file mode 100644
index 0000000000000000000000000000000000000000..79beae8e1d08f395d010d294a26a0ae135103990
--- /dev/null
+++ b/src/string.c
@@ -0,0 +1,85 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include "string.h"
+#include "unicode.h"
+
+// Capacity given to a string on its first append.
+#define INITIAL_SIZE 16
+
+// Allocates the buffer the first time a string is written to (str_create
+// leaves it NULL). Returns 1 if the buffer is usable, 0 if malloc failed.
+static int sanity_check(str_t *str) {
+	if (str->str != NULL) {
+		return 1;
+	}
+	char *buf = malloc(INITIAL_SIZE);
+	if (!buf) {
+		return 0;
+	}
+	buf[0] = '\0';
+	str->str = buf;
+	str->size = INITIAL_SIZE;
+	str->len = 0;
+	return 1;
+}
+
+// Grows the buffer until it can hold len bytes plus the terminator. Returns 1
+// on success, or 0 if realloc failed, in which case the string is untouched.
+static int ensure_capacity(str_t *str, size_t len) {
+	size_t size = str->size;
+	// Double repeatedly rather than once, so the result is large enough no
+	// matter how much was asked for.
+	while (len + 1 >= size) {
+		size *= 2;
+	}
+	if (size == str->size) {
+		return 1;
+	}
+	char *new = realloc(str->str, size);
+	if (!new) {
+		return 0;
+	}
+	str->str = new;
+	str->size = size;
+	return 1;
+}
+
+// Returns a new, empty string, or NULL if the allocation fails.
+str_t *str_create(void) {
+	return calloc(1, sizeof(str_t));
+}
+
+// Releases the string and its buffer. Passing NULL is allowed.
+void str_free(str_t *str) {
+	if (!str) return;
+	free(str->str);
+	free(str);
+}
+
+// Empties the string but keeps its buffer for reuse.
+void str_reset(str_t *str) {
+	str->len = 0;
+	// A string that has never been appended to has no buffer to terminate.
+	if (str->str) {
+		str->str[0] = '\0';
+	}
+}
+
+// Appends ch to the string as UTF-8, keeping it NUL-terminated. Returns the
+// number of bytes appended, or -1 if memory could not be allocated.
+int str_append_ch(str_t *str, uint32_t ch) {
+	int size = utf8_chsize(ch);
+	if (size <= 0) {
+		return -1;
+	}
+	if (!sanity_check(str)) {
+		return -1;
+	}
+	if (!ensure_capacity(str, str->len + size)) {
+		return -1;
+	}
+	utf8_encode(&str->str[str->len], ch);
+	str->len += size;
+	str->str[str->len] = '\0';
+	return size;
+}
diff --git a/src/utf8_chsize.c b/src/utf8_chsize.c
new file mode 100644
index 0000000000000000000000000000000000000000..1bff491595fe8a94793b14c1a2657dccb95ebeb2
--- /dev/null
+++ b/src/utf8_chsize.c
@@ -0,0 +1,15 @@
+#include <stdint.h>
+#include <stddef.h>
+#include "unicode.h"
+
+// Number of bytes in the UTF-8 encoding of ch. Everything from U+10000 up is
+// reported as the 4-byte form.
+size_t utf8_chsize(uint32_t ch) {
+	if (ch >= 0x10000) {
+		return 4;
+	}
+	if (ch >= 0x800) {
+		return 3;
+	}
+	return ch >= 0x80 ? 2 : 1;
+}
diff --git a/src/utf8_decode.c b/src/utf8_decode.c
new file mode 100644
index 0000000000000000000000000000000000000000..fcbbb5ed8e9ba45f6441808614eeb2368a9bdcbb
--- /dev/null
+++ b/src/utf8_decode.c
@@ -0,0 +1,47 @@
+#include <stdint.h>
+#include <stddef.h>
+#include "unicode.h"
+
+// Payload bits of the lead byte, indexed by sequence length minus one.
+static const uint8_t masks[] = {
+	0x7F,
+	0x1F,
+	0x0F,
+	0x07,
+};
+
+/**
+ * Decodes the UTF-8 character at *char_str and advances *char_str past it.
+ *
+ * Returns UTF8_INVALID when the input is malformed:
+ * - the first byte cannot begin a sequence of at most UTF8_MAX_SIZE bytes, in
+ *   which case that one byte is skipped;
+ * - a continuation byte is missing, in which case only the bytes before it
+ *   are skipped.
+ */
+uint32_t utf8_decode(const char **char_str) {
+	const uint8_t *s = (const uint8_t *)*char_str;
+
+	if (s[0] < 0x80) {
+		// shortcut for ASCII
+		*char_str += 1;
+		return s[0];
+	}
+	int size = utf8_size(*char_str);
+	if (size < 2 || size > UTF8_MAX_SIZE) {
+		*char_str += 1;
+		return UTF8_INVALID;
+	}
+	uint32_t cp = s[0] & masks[size - 1];
+	for (int i = 1; i < size; ++i) {
+		if ((s[i] & 0xC0) != 0x80) {
+			// Stop in front of the unexpected byte. It may be the string's
+			// terminator, which the caller must still be able to see.
+			*char_str += i;
+			return UTF8_INVALID;
+		}
+		cp = (cp << 6) | (s[i] & 0x3F);
+	}
+	*char_str += size;
+	return cp;
+}
diff --git a/src/utf8_encode.c b/src/utf8_encode.c
new file mode 100644
index 0000000000000000000000000000000000000000..d1c32bbe34fed5b09893d738888e6b8dd3f48dc2
--- /dev/null
+++ b/src/utf8_encode.c
@@ -0,0 +1,33 @@
+#include <stdint.h>
+#include <stddef.h>
+#include "unicode.h"
+
+// Stores the UTF-8 encoding of ch in str, without a terminator, and returns
+// the number of bytes stored (1 to 4).
+size_t utf8_encode(char *str, uint32_t ch) {
+	uint8_t lead;
+	size_t size;
+	if (ch >= 0x10000) {
+		lead = 0xf0;
+		size = 4;
+	} else if (ch >= 0x800) {
+		lead = 0xe0;
+		size = 3;
+	} else if (ch >= 0x80) {
+		lead = 0xc0;
+		size = 2;
+	} else {
+		lead = 0;
+		size = 1;
+	}
+
+	// Work backwards through the continuation bytes, six bits apiece; what
+	// remains of ch afterwards goes into the lead byte.
+	size_t i = size;
+	while (--i > 0) {
+		str[i] = (ch & 0x3f) | 0x80;
+		ch >>= 6;
+	}
+	str[0] = ch | lead;
+	return size;
+}
diff --git a/src/utf8_fgetch.c b/src/utf8_fgetch.c
new file mode 100644
index 0000000000000000000000000000000000000000..8fafa55b318ec7f2870d176bc0b9aa26bac32165
--- /dev/null
+++ b/src/utf8_fgetch.c
@@ -0,0 +1,31 @@
+#include <stdint.h>
+#include <stdio.h>
+#include "unicode.h"
+
+/**
+ * Reads one UTF-8 character from f and returns its code point.
+ *
+ * Returns UTF8_INVALID at end of file, when the first byte read cannot begin
+ * a sequence of at most UTF8_MAX_SIZE bytes, or when the file ends before the
+ * sequence is complete.
+ */
+uint32_t utf8_fgetch(FILE *f) {
+	char buffer[UTF8_MAX_SIZE];
+	int c = fgetc(f);
+	if (c == EOF) {
+		return UTF8_INVALID;
+	}
+	buffer[0] = (char)c;
+	int size = utf8_size(buffer);
+	if (size < 1 || size > UTF8_MAX_SIZE) {
+		// A longer sequence would not fit in buffer, so it must be refused
+		// before anything more is read.
+		return UTF8_INVALID;
+	}
+	size_t rest = (size_t)size - 1;
+	if (rest > 0 && fread(&buffer[1], 1, rest, f) != rest) {
+		return UTF8_INVALID;
+	}
+	const char *ptr = buffer;
+	return utf8_decode(&ptr);
+}
diff --git a/src/utf8_fputch.c b/src/utf8_fputch.c
new file mode 100644
index 0000000000000000000000000000000000000000..650c6285b04363bd61f79f21f291354e00793c8b
--- /dev/null
+++ b/src/utf8_fputch.c
@@ -0,0 +1,11 @@
+#include <stdint.h>
+#include <stdio.h>
+#include "unicode.h"
+
+// Encodes ch as UTF-8 and writes it to f. The result is fwrite's count of the
+// bytes it managed to write.
+size_t utf8_fputch(FILE *f, uint32_t ch) {
+	char encoded[UTF8_MAX_SIZE];
+	size_t len = utf8_encode(encoded, ch);
+	return fwrite(encoded, 1, len, f);
+}
diff --git a/src/utf8_size.c b/src/utf8_size.c
new file mode 100644
index 0000000000000000000000000000000000000000..6ac2d3eec705c1d6d1ccb6be2da27771c2bd3d7d
--- /dev/null
+++ b/src/utf8_size.c
@@ -0,0 +1,35 @@
+#include <stdint.h>
+#include <stddef.h>
+#include "unicode.h"
+
+// Lead-byte patterns: a byte b begins a sequence of `octets` bytes when
+// (b & mask) == result.
+static const struct {
+	uint8_t mask;
+	uint8_t result;
+	int octets;
+} sizes[] = {
+	{ 0x80, 0x00, 1 },
+	{ 0xE0, 0xC0, 2 },
+	{ 0xF0, 0xE0, 3 },
+	{ 0xF8, 0xF0, 4 },
+};
+
+/**
+ * Returns the length in bytes of the UTF-8 sequence introduced by the first
+ * byte of s, or -1 if that byte cannot begin one. Only s[0] is examined.
+ *
+ * Continuation bytes (10xxxxxx) are rejected, and so are the lead bytes of
+ * the obsolete 5- and 6-byte forms (0xF8 and up): a sequence longer than
+ * UTF8_MAX_SIZE does not fit in the buffers used to hold one character.
+ */
+int utf8_size(const char *s) {
+	uint8_t c = (uint8_t)*s;
+	// Element count of the table, not its size in bytes.
+	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); ++i) {
+		if ((c & sizes[i].mask) == sizes[i].result) {
+			return sizes[i].octets;
+		}
+	}
+	return -1;
+}
diff --git a/src/util.c b/src/util.c
new file mode 100644
index 0000000000000000000000000000000000000000..bb9df685fb4baf7e28ccc6e3c0a134abee0f7763
--- /dev/null
+++ b/src/util.c
@@ -0,0 +1,61 @@
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "unicode.h"
+#include "util.h"
+
+// Reports err on stderr together with the parser's current line and column,
+// closes both streams and exits with status 1.
+void parser_fatal(struct parser *parser, const char *err) {
+	int line = parser->line;
+	int col = parser->col;
+	fprintf(stderr, "Error at %d:%d: %s\n", line, col, err);
+	fclose(parser->input);
+	fclose(parser->output);
+	exit(1);
+}
+
+// Reads the next character from the input. A newline starts a new line and
+// puts the column back to 0; any other result moves the column on by one.
+uint32_t parser_getch(struct parser *parser) {
+	uint32_t ch = utf8_fgetch(parser->input);
+	if (ch != '\n') {
+		++parser->col;
+		return ch;
+	}
+	++parser->line;
+	parser->col = 0;
+	return ch;
+}
+
+// Writes ".cmd" to the output, then each string argument in double quotes
+// (a backslash is put in front of any quote inside it), then a newline. The
+// argument list ends at the first NULL. Returns a running count of the bytes
+// written.
+int roff_macro(struct parser *p, char *cmd, ...) {
+	FILE *out = p->output;
+	int written = fprintf(out, ".%s", cmd);
+	va_list args;
+	va_start(args, cmd);
+	for (;;) {
+		const char *arg = va_arg(args, const char *);
+		if (!arg) {
+			break;
+		}
+		fputs(" \"", out);
+		while (*arg) {
+			uint32_t ch = utf8_decode(&arg);
+			if (ch == '"') {
+				fputc('\\', out);
+				++written;
+			}
+			written += utf8_fputch(out, ch);
+		}
+		fputc('"', out);
+		written += 3;
+	}
+	va_end(args);
+	fputc('\n', out);
+	return written + 1;
+}

-- 
2.48.1