From cf413fb1d821d944e0202b16400950191b9c9e13 Mon Sep 17 00:00:00 2001
From: Akuli <akuviljanen17@gmail.com>
Date: Sun, 28 Jul 2024 16:00:43 +0300
Subject: [PATCH] Read assertion from file with offsets

---
 src/jou_compiler.h | 11 +++++-
 src/parse.c        | 90 ++++++++++++++++++++++++++++------------------
 src/tokenize.c     | 14 +++++++-
 3 files changed, 79 insertions(+), 36 deletions(-)

diff --git a/src/jou_compiler.h b/src/jou_compiler.h
index c505e4a4..3284dd49 100644
--- a/src/jou_compiler.h
+++ b/src/jou_compiler.h
@@ -91,7 +91,6 @@ struct Token {
         TOKEN_OPERATOR,
         TOKEN_END_OF_FILE,  // Marks the end of an array of Token
     } type;
-    Location location;
     union {
         int16_t short_value; // TOKEN_SHORT
         int32_t int_value;  // TOKEN_INT
@@ -102,6 +101,16 @@ struct Token {
         char name[100];  // TOKEN_NAME and TOKEN_KEYWORD. Also TOKEN_DOUBLE & TOKEN_FLOAT (LLVM wants a string anyway)
         char operator[4];  // TOKEN_OPERATOR
     } data;
+
+    /*
+    Contains only the line number, not column. In some cases you can use
+    start_offset and end_offset to work around that.
+    */
+    Location location;
+
+    // Number of bytes from start of file to start/end of this token.
+    long start_offset;
+    long end_offset;
 };
 
 // Constants can appear in AST and also compilation steps after AST.
diff --git a/src/parse.c b/src/parse.c
index a821f33a..153ed818 100644
--- a/src/parse.c
+++ b/src/parse.c
@@ -725,42 +725,65 @@ static enum AstStatementKind determine_the_kind_of_a_statement_that_starts_with_
     return AST_STMT_EXPRESSION_STATEMENT;
 }
 
-// TODO: this function is just bad...
-static char *read_assertion_from_file(Location start, Location end)
+// Returns the source text covered by tokens [start, end), lines joined into one string.
+// The caller receives result.ptr (NUL-terminated). On I/O problems, reports via fail().
+static char *read_assertion_from_file(Location error_location, const Token *start, const Token *end)
 {
-    assert(start.filename == end.filename);
-    FILE *f = fopen(start.filename, "rb");
-    assert(f);
-
-    char line[1024];
-    int lineno = 1;
-    while (lineno < start.lineno) {
-        fgets(line, sizeof line, f);
-        lineno++;
-    }
+    FILE *f = fopen(error_location.filename, "rb");
+    if (!f)
+        goto error;
+
+    List(char) result = {0};
+
+    long ostart = 0, oend = 0;  // offsets within file to include
+    for (const Token *t = start; t < end; t++) {
+        assert(t->start_offset < t->end_offset);
+
+        if (t == start || t->location.lineno != t[-1].location.lineno) {
+            // First token of a new line
+            ostart = t->start_offset;
+            oend = t->end_offset;
+        } else {
+            // Include more tokens from the line of code so that this token is added too.
+            // We cannot include the entire line because it might contain comments.
+            assert(oend <= t->start_offset);
+            oend = t->end_offset;
+        }
 
-    List(char) str = {0};
-    while (lineno <= end.lineno) {
-        memset(line, 0, sizeof line);
-        fgets(line, sizeof line, f);
-        lineno++;
-
-        if (strstr(line, "#"))
-            *strstr(line, "#") = '\0';
-        trim_whitespace(line);
-        // Add spaces between the lines, but not after '(' or before ')'
-        if (line[0] != ')' && str.len >= 1 && str.ptr[str.len-1] != '(')
-            AppendStr(&str, " ");
-        AppendStr(&str, line);
+        if (t == end-1 || t[0].location.lineno != t[1].location.lineno) {
+            // Last token of a line. Read code from file.
+            if (fseek(f, ostart, SEEK_SET) < 0)
+                goto error;
+            if (result.len > 0)
+                Append(&result, '\n');
+            for (long i = ostart; i < oend; i++) {
+                int c = fgetc(f);
+                if (c == EOF || c == '\r' || c == '\n')
+                    goto error;
+                Append(&result, (char)c);
+            }
+        }
     }
 
-    fclose(f);
-    Append(&str, '\0');
+    fclose(f);  // everything needed has been copied into result
+    /*
+    Join lines with spaces, but do not put spaces just after '(' or before ')'.
+    This makes multiline asserts nicer, so "assert (\n    foo and bar\n)"
+    shows "foo and bar" as the assert condition.
+    */
+    Append(&result, '\0');
+    char *p;
+    while ((p = strstr(result.ptr, "\n"))) {
+        if ((p > result.ptr && p[-1] == '(') || (p[1] == ')')) {
+            memmove(p, p+1, strlen(p));  // delete newline character at p
+        } else {
+            *p = ' ';  // join lines with a space
+        }
+    }
+    return result.ptr;
 
-    if(!strncmp(str.ptr, "assert",6))
-        memmove(str.ptr, &str.ptr[6], strlen(&str.ptr[6]) + 1);
-    trim_whitespace(str.ptr);
-    return str.ptr;
+error:
+    fail(error_location, "internal error: cannot read assertion text from file");
 }
 
 // does not eat a trailing newline
@@ -777,10 +800,9 @@ static AstStatement parse_oneline_statement(ParserState *ps)
     } else if (is_keyword(ps->tokens, "assert")) {
         ps->tokens++;
         result.kind = AST_STMT_ASSERT;
-        Location start = ps->tokens->location;
+        const Token *condstart = ps->tokens;
         result.data.assertion.condition = parse_expression(ps);
-        Location end = ps->tokens->location;
-        result.data.assertion.condition_str = read_assertion_from_file(start, end);
+        result.data.assertion.condition_str = read_assertion_from_file(result.location, condstart, ps->tokens);
     } else if (is_keyword(ps->tokens, "pass")) {
         ps->tokens++;
         result.kind = AST_STMT_PASS;
diff --git a/src/tokenize.c b/src/tokenize.c
index ba9e2907..70014562 100644
--- a/src/tokenize.c
+++ b/src/tokenize.c
@@ -420,10 +420,20 @@ static void handle_parentheses(struct State *st, const struct Token *t)
     }
 }
 
+// Gives the current position in the file, counted in bytes from the file's start.
+static long get_offset(const struct State *st)
+{
+    const long pos = ftell(st->f);
+    if (pos < 0)
+        fail(st->location, "internal error: ftell() failed");
+    // Presumably pushback holds bytes already read from st->f but not yet consumed.
+    return pos - st->pushback.len;
+}
+
 static Token read_token(struct State *st)
 {
     while(1) {
-        Token t = { .location = st->location };
+        Token t = { .location = st->location, .start_offset = get_offset(st) };
         char c = read_byte(st);
 
         switch(c) {
@@ -482,6 +492,8 @@ static Token read_token(struct State *st)
             }
             break;
         }
+
+        t.end_offset = get_offset(st);
         return t;
     }
 }