Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster lex_identifier #1746

Merged
merged 1 commit into from
Oct 30, 2023
Merged

Conversation

haldun
Copy link
Collaborator

@haldun haldun commented Oct 28, 2023

I have introduced some caching in lex_identifier to make it a bit faster. I tested the changes on parsing a 10M lines of Ruby file. I see a %6 decrease in time.

This is the program that I used to measure

#include <stdio.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/mman.h>
#include <time.h>

#include "prism.h"

int main(int argc, char const *argv[])
{
    __auto_type fd = open("/tmp/b.rb", O_RDONLY);
    struct stat st;
    __auto_type rc = fstat(fd, &st);
    size_t size = st.st_size;
    const uint8_t * region = mmap(
        0, size,
        PROT_READ, MAP_FILE|MAP_PRIVATE,
        fd, 0
    );
    __auto_type parser = malloc(sizeof(pm_parser_t));
    __auto_type start = (float)clock()/CLOCKS_PER_SEC;
    pm_parser_init(parser, region, size, NULL);
    __auto_type node = pm_parse(parser);
    __auto_type end = (float)clock()/CLOCKS_PER_SEC;
    __auto_type timeElapsed = end - start;
    printf("%f\n", timeElapsed);
    __auto_type unmap_result = munmap((void *)region, size);
    close(fd);
    return 0;
}

src/prism.c Outdated
Comment on lines 6060 to 6061
if (parser->current.end < end) {
if (((parser->current.end + 1 >= end) || (parser->current.end[1] != '=')) && (match(parser, '!') || match(parser, '?'))) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this use current_end too? It seems more consistent to always use that variable rather than a mix of it & not.

Copy link
Member

@eregon eregon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting, so the extra field accesses were causing the overhead here?
I wonder why the C compiler doesn't/cannot optimize that.

@haldun
Copy link
Collaborator Author

haldun commented Oct 28, 2023

I will dig deeper to this. I am not totally sure about the difference.

@haldun
Copy link
Collaborator Author

haldun commented Oct 30, 2023

It looks like the difference comes from accessing parser->end and accessing/mutating parser->current.end in the while loop.

Before

lex_identifier:
        push    r14
        push    r13
        mov     r13d, esi
        push    r12
        push    rbp
        mov     rbp, rdi
        push    rbx
        mov     rbx, QWORD PTR [rdi+320]
        mov     r12, QWORD PTR [rdi+272]
        cmp     rbx, r12
        jnb     .L1358
.L1356:
        cmp     BYTE PTR [rbp+666], 0
        jne     .L1549
        movzx   eax, BYTE PTR [rbx]
        test    al, al
        js      .L1363
        movzx   edx, al
        test    BYTE PTR pm_encoding_unicode_table[rdx], 2
        jne     .L1361
        cmp     al, 95
        je      .L1361
.L1362:
        mov     r14, QWORD PTR [rbp+312]
        mov     rdx, rbx
        mov     eax, DWORD PTR [rbp+0]
        sub     rdx, r14
        cmp     rbx, r12
        jnb     .L1472
        lea     rcx, [rbx+1]
        cmp     rcx, r12
        jb      .L1550
.L1365:
        movzx   esi, BYTE PTR [rbx]
        cmp     sil, 33
        je      .L1368
        cmp     sil, 63
        je      .L1368
        test    al, -128
        je      .L1367
        cmp     rcx, r12
        jb      .L1551
.L1378:
        cmp     sil, 61
        je      .L1552
.L1367:
        test    eax, 1032
        je      .L1493
        test    r13b, r13b
        je      .L1380
.L1493:
        test    al, 48
        je      .L1472
.L1380:
        cmp     BYTE PTR [rbx], 58
        jne     .L1472
        cmp     rcx, r12
        jb      .L1553
.L1382:
        mov     DWORD PTR [rbp+0], 2064
        cmp     BYTE PTR [rbx], 58
        jne     .L1375
.L1542:
        mov     QWORD PTR [rbp+320], rcx
        jmp     .L1375
.L1360:
        mov     rbx, QWORD PTR [rbp+320]
        mov     r12, QWORD PTR [rbp+272]
.L1361:
        add     rbx, 1
        mov     QWORD PTR [rbp+320], rbx
        cmp     rbx, r12
        jb      .L1356
.L1358:
        mov     r14, QWORD PTR [rbp+312]
        mov     rdx, rbx
        mov     eax, DWORD PTR [rbp+0]
        sub     rdx, r14
.L1472:
        cmp     eax, 256
        je      .L1385
        cmp     rdx, 12
        ja      .L1385
        jmp     [QWORD PTR .L1387[0+rdx*8]]
.L1387:
        .quad   .L1385
        .quad   .L1385
        .quad   .L1393
        .quad   .L1392
        .quad   .L1391
        .quad   .L1390
        .quad   .L1389
        .quad   .L1385
        .quad   .L1388
        .quad   .L1385
        .quad   .L1385
        .quad   .L1385
        .quad   .L1386
.L1549:
        mov     rsi, r12
        mov     rdi, rbx
        sub     rsi, rbx
        call    [QWORD PTR [rbp+472]]
        test    rax, rax
        jne     .L1360
        movzx   eax, BYTE PTR [rbx]
        mov     r12, QWORD PTR [rbp+272]
        mov     rbx, QWORD PTR [rbp+320]
        cmp     al, 95
        je      .L1361
        test    al, al
        jns     .L1362
        add     rbx, 1
        mov     QWORD PTR [rbp+320], rbx
        cmp     rbx, r12
        jb      .L1356
        jmp     .L1358
.L1363:
        mov     rsi, r12
        mov     rdi, rbx
        sub     rsi, rbx
        call    pm_encoding_utf_8_alnum_char
        mov     rbx, QWORD PTR [rbp+320]
        mov     r12, QWORD PTR [rbp+272]
        add     rbx, 1
        mov     QWORD PTR [rbp+320], rbx
        cmp     rbx, r12
        jb      .L1356
        jmp     .L1358
.L1550:
        cmp     BYTE PTR [rbx+1], 61
        jne     .L1365
        test    al, -128
        je      .L1367
.L1366:
        lea     rsi, [rbx+2]
        cmp     rsi, r12
        jnb     .L1367
        cmp     BYTE PTR [rbx+2], 62
        jne     .L1367
        movzx   esi, BYTE PTR [rbx]
        jmp     .L1378
.L1393:
        lea     rdx, [r14+2]
        cmp     r12, rdx
        jnb     .L1554
.L1385:
        mov     rsi, r12
        mov     rdi, r14
        sub     rsi, r14
        call    [QWORD PTR [rbp+480]]
        mov     edx, 29
        test    al, al
        je      .L1379
.L1355:
        pop     rbx
        mov     eax, edx
        pop     rbp
        pop     r12
        pop     r13
        pop     r14
        ret
.L1552:
        mov     QWORD PTR [rbp+320], rcx
.L1379:
        mov     edx, 55
        pop     rbx
        pop     rbp
        mov     eax, edx
        pop     r12
        pop     r13
        pop     r14
        ret

After

lex_identifier:
        push    r14
        push    r13
        mov     r13d, esi
        push    r12
        push    rbp
        mov     rbp, rdi
        push    rbx
        mov     r12, QWORD PTR [rdi+272]
        mov     rbx, QWORD PTR [rdi+320]
        mov     r14, QWORD PTR [rdi+312]
        cmp     rbx, r12
        jnb     .L1463
.L1272:
        cmp     BYTE PTR [rbp+666], 0
        jne     .L1464
        movzx   eax, BYTE PTR [rbx]
        test    al, al
        js      .L1278
        movzx   edx, al
        test    BYTE PTR pm_encoding_unicode_table[rdx], 2
        jne     .L1279
        cmp     al, 95
        je      .L1279
.L1277:
        mov     rdx, rbx
        mov     QWORD PTR [rbp+320], rbx
        mov     eax, DWORD PTR [rbp+0]
        sub     rdx, r14
        cmp     rbx, r12
        jnb     .L1390
        lea     rcx, [rbx+1]
        cmp     rcx, r12
        jb      .L1465
.L1281:
        mov     rdi, QWORD PTR [rbp+272]
        cmp     rbx, rdi
        jnb     .L1282
        movzx   esi, BYTE PTR [rbx]
        cmp     sil, 33
        je      .L1283
        cmp     sil, 63
        je      .L1283
.L1282:
        test    al, -128
        je      .L1292
        mov     rdi, QWORD PTR [rbp+272]
        mov     r8, rdi
        cmp     rcx, rdi
        jnb     .L1293
        movzx   esi, BYTE PTR [rbx+1]
        mov     r9d, esi
        and     r9d, -65
        cmp     r9b, 62
        je      .L1292
        cmp     sil, 61
        je      .L1466
.L1293:
        cmp     rbx, rdi
        jnb     .L1390
        movzx   esi, BYTE PTR [rbx]
        cmp     sil, 61
        je      .L1467
        test    eax, 1032
        je      .L1388
        test    r13b, r13b
        je      .L1389
.L1388:
        mov     r8, rdi
        test    al, 48
        jne     .L1389
.L1390:
        cmp     eax, 256
        je      .L1301
.L1468:
        cmp     rdx, 12
        ja      .L1301
        jmp     [QWORD PTR .L1303[0+rdx*8]]
.L1303:
        .quad   .L1301
        .quad   .L1301
        .quad   .L1309
        .quad   .L1308
        .quad   .L1307
        .quad   .L1306
        .quad   .L1305
        .quad   .L1301
        .quad   .L1304
        .quad   .L1301
        .quad   .L1301
        .quad   .L1301
        .quad   .L1302
.L1278:
        mov     rsi, QWORD PTR [rbp+272]
        mov     rdi, rbx
        sub     rsi, rbx
        call    pm_encoding_utf_8_alnum_char
.L1279:
        add     rbx, 1
        cmp     r12, rbx
        jne     .L1272
.L1274:
        mov     eax, DWORD PTR [rbp+0]
        mov     rdx, r12
        mov     QWORD PTR [rbp+320], r12
        sub     rdx, r14
        cmp     eax, 256
        jne     .L1468
.L1301:
        mov     rsi, r12
        mov     rdi, r14
        sub     rsi, r14
        call    [QWORD PTR [rbp+480]]
        mov     edx, 29
        test    al, al
        jne     .L1271
.L1295:
        mov     edx, 55
.L1271:
        pop     rbx
        mov     eax, edx
        pop     rbp
        pop     r12
        pop     r13
        pop     r14
        ret

https://godbolt.org/z/cEx7qh5sf

@eregon
Copy link
Member

eregon commented Oct 30, 2023

Interesting, maybe the C compiler thinks assigning parser->current.end can change parser->end.

const uint8_t *end; // the pointer to the end of the source

Would const uint8_t *const end; help there? (not sure if C compilers trust const though)

Or potentially some use of restrict if that's not enough.

In any case I think this is fine to merge given the speedup and little changes needed.

Copy link
Collaborator

@kddnewton kddnewton left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great work! One additional small request please

src/prism.c Outdated
@@ -6163,7 +6168,7 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
}
}

return parser->encoding.isupper_char(parser->current.start, parser->end - parser->current.start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
return parser->encoding.isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One more change that should help - instead of going through the function pointer can you check if the encoding is changed and otherwise use the default encoding function?

Suggested change
return parser->encoding.isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
if (parser->encoding_changed) {
return parser->encoding.isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
} else {
return pm_encoding_utf_8_isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
}

Note this will require making pm_encoding_utf_8_isupper_char non-static.

@haldun haldun force-pushed the haldun/faster-lex-identifier branch from 17898a4 to c44e349 Compare October 30, 2023 14:45
@haldun haldun force-pushed the haldun/faster-lex-identifier branch from c44e349 to e44a9ae Compare October 30, 2023 14:47
@kddnewton kddnewton merged commit 4bdafcc into ruby:main Oct 30, 2023
45 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants