-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #115 from rhpvorderman/simpleasciicheck
Simplify ASCII check
- Loading branch information
Showing
4 changed files
with
44 additions
and
103 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,34 +1,48 @@ | ||
#define ASCII_MASK_8BYTE 0x8080808080808080ULL | ||
#define ASCII_MASK_1BYTE 0x80 | ||
|
||
#include <stddef.h> | ||
#include <stdint.h> | ||
#ifdef __SSE2__ | ||
#include "emmintrin.h" | ||
#endif | ||
|
||
#define ASCII_MASK_8BYTE 0x8080808080808080ULL | ||
#define ASCII_MASK_1BYTE 0x80 | ||
|
||
/** | ||
* @brief Check if a string of given length only contains ASCII characters. | ||
* | ||
* @param string A char pointer to the start of the string. | ||
* @param length The length of the string. This function does not check for | ||
* terminating NUL bytes. | ||
* @returns 1 if the string is ASCII-only, 0 otherwise. | ||
*/ | ||
static int | ||
string_is_ascii(char * string, size_t length) { | ||
size_t n = length; | ||
string_is_ascii(const char * string, size_t length) { | ||
// By performing bitwise OR on all characters in 8-byte chunks (16-byte | ||
// with SSE2) we can | ||
// determine ASCII status in a non-branching (except the loops) fashion. | ||
uint64_t all_chars = 0; | ||
char * char_ptr = string; | ||
// The first loop aligns the memory address. Char_ptr is cast to a size_t | ||
// to obtain the memory address. Uint64_t is 8 bytes long, and the processor | ||
// handles this better when its address is a multiple of 8. This loop | ||
// handles the first few bytes that are not on such a boundary. | ||
while ((size_t)char_ptr % sizeof(uint64_t) && n != 0) { | ||
all_chars |= *char_ptr; | ||
char_ptr += 1; | ||
n -= 1; | ||
const char *cursor = string; | ||
const char *string_end_ptr = string + length; | ||
const char *string_8b_end_ptr = string_end_ptr - sizeof(uint64_t); | ||
int non_ascii_in_vec = 0; | ||
#ifdef __SSE2__ | ||
const char *string_16b_end_ptr = string_end_ptr - sizeof(__m128i); | ||
__m128i vec_all_chars = _mm_setzero_si128(); | ||
while (cursor < string_16b_end_ptr) { | ||
__m128i loaded_chars = _mm_loadu_si128((__m128i *)cursor); | ||
vec_all_chars = _mm_or_si128(loaded_chars, vec_all_chars); | ||
cursor += sizeof(__m128i); | ||
} | ||
uint64_t *longword_ptr = (uint64_t *)char_ptr; | ||
while (n >= sizeof(uint64_t)) { | ||
all_chars |= *longword_ptr; | ||
longword_ptr += 1; | ||
n -= sizeof(uint64_t); | ||
non_ascii_in_vec = _mm_movemask_epi8(vec_all_chars); | ||
#endif | ||
|
||
while (cursor < string_8b_end_ptr) { | ||
all_chars |= *(uint64_t *)cursor; | ||
cursor += sizeof(uint64_t); | ||
} | ||
char_ptr = (char *)longword_ptr; | ||
while (n != 0) { | ||
all_chars |= *char_ptr; | ||
char_ptr += 1; | ||
n -= 1; | ||
while (cursor < string_end_ptr) { | ||
all_chars |= *cursor; | ||
cursor += 1; | ||
} | ||
return !(all_chars & ASCII_MASK_8BYTE); | ||
return !(non_ascii_in_vec + (all_chars & ASCII_MASK_8BYTE)); | ||
} |
This file was deleted.
Oops, something went wrong.