Table of Contents
- Introduction
- Loop Unroll
- Branch Prediction
- Inline C Routines
- Efficient Number Converter
- Efficient Memory Manager
- Table Lookup
- Zero copy
- Inline assemble indication
Introduction
This article will explore the reasons for the high performance of YYJSON.
Loop unroll
#define repeat2(x) { x x }
#define repeat3(x) { x x x }
#define repeat4(x) { x x x x }
#define repeat8(x) { x x x x x x x x }
#define repeat16(x) { x x x x x x x x x x x x x x x x }
#define repeat2_incr(x) { x(0) x(1) }
#define repeat4_incr(x) { x(0) x(1) x(2) x(3) }
#define repeat8_incr(x) { x(0) x(1) x(2) x(3) x(4) x(5) x(6) x(7) }
#define repeat16_incr(x) { x(0) x(1) x(2) x(3) x(4) x(5) x(6) x(7) \
x(8) x(9) x(10) x(11) x(12) x(13) x(14) x(15) }
#define repeat_in_1_18(x) { x(1) x(2) x(3) x(4) x(5) x(6) x(7) x(8) \
x(9) x(10) x(11) x(12) x(13) x(14) x(15) x(16) \
x(17) x(18) }
// see function read_string
static_inline bool read_string(u8 **ptr,
u8 *lst,
bool inv,
yyjson_val *val,
const char **msg) {
...
}
Branch prediction
Manually prediction instructions
/* Macros used to provide branch prediction information for compiler. */
#undef likely
#define likely(x) yyjson_likely(x)
#undef unlikely
#define unlikely(x) yyjson_unlikely(x)
high probability cases first
- In the
read_string
function, we can see that it test the closing"
right after skip ascii. Because this is the most common case. - In the
read_string
function, we can see that it test the three bytes utf8 first, because it is the most common case.
Inline C routines
/* Macros used to provide inline information for compiler. */
#undef static_inline
#define static_inline static yyjson_inline
#undef static_noinline
#define static_noinline static yyjson_noinline
Efficient number converter
/**
Read a JSON number.
1. This function assume that the floating-point number is in IEEE-754 format.
2. This function support uint64/int64/double number. If an integer number
cannot fit in uint64/int64, it will returns as a double number. If a double
number is infinite, the return value is based on flag.
3. This function (with inline attribute) may generate a lot of instructions.
*/
static_inline bool read_number(u8 **ptr,
u8 **pre,
yyjson_read_flag flg,
yyjson_val *val,
const char **msg)
Efficient memory manager
Memory allocation is always is a hot spot of the software. Memory pool is a common skill to reduce the overhead of memory allocation Go to https://github.com/ibireme/yyjson/blob/master/src/yyjson.c#L1036 for more details.
Table lookup
Convert hex char to num
static const u8 hex_conv_table[256] = {
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0
};
/**
Scans an escaped character sequence as a UTF-16 code unit (branchless).
e.g. "\\u005C" should pass "005C" as `cur`.
This requires the string has 4-byte zero padding.
*/
static_inline bool read_hex_u16(const u8 *cur, u16 *val) {
u16 c0, c1, c2, c3, t0, t1;
c0 = hex_conv_table[cur[0]];
c1 = hex_conv_table[cur[1]];
c2 = hex_conv_table[cur[2]];
c3 = hex_conv_table[cur[3]];
t0 = (u16)((c0 << 8) | c2);
t1 = (u16)((c1 << 8) | c3);
*val = (u16)((t0 << 4) | t1);
return ((t0 | t1) & (u16)0xF0F0) == 0;
}
The function is also very interesting. There is not if
when testing if the \uabcd
is valid.
Char type judgement
/** Character type table (generate with misc/make_tables.c) */
static const char_type char_table[256] = {
0x44, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
0x04, 0x05, 0x45, 0x04, 0x04, 0x45, 0x04, 0x04,
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x20,
0x82, 0x82, 0x82, 0x82, 0x82, 0x82, 0x82, 0x82,
0x82, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x10, 0x04, 0x00, 0x00, 0x00,
0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08
};
/** Match a character with specified type. */
static_inline bool char_is_type(u8 c, char_type type) {
return (char_table[c] & type) != 0;
}
/** Match a whitespace: ' ', '\\t', '\\n', '\\r'. */
static_inline bool char_is_space(u8 c) {
return char_is_type(c, (char_type)CHAR_TYPE_SPACE);
}
....
Zero copy
From the function read_string
, we can found that it first try to skip ascii and utf8 characters.
Because most of the time, the string are compose of ascii and characters.
For example: “Hellow, world!”
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
static_inline bool read_string(u8 **ptr,
u8 *lst,
bool inv,
yyjson_val *val,
const char **msg) {
skip_ascii:
/* Most strings have no escaped characters, so we can jump them quickly. */
skip_ascii_begin:
#define expr_jump(i) \
if (likely(!char_is_ascii_stop(src[i]))) {} \
else goto skip_ascii_stop##i;
#define expr_stop(i) \
skip_ascii_stop##i: \
src += i; \
goto skip_ascii_end;
repeat16_incr(expr_jump)
src += 16;
goto skip_ascii_begin;
repeat16_incr(expr_stop)
#undef expr_jump
#undef expr_stop
if (likely(*src == '"')) {
val->tag = ((u64)(src - cur) << YYJSON_TAG_BIT) |
(u64)(YYJSON_TYPE_STR | YYJSON_SUBTYPE_NOESC);
val->uni.str = (const char *)cur;
*src = '\0';
*end = src + 1;
return true;
}
skip_ascii_end:
/*
GCC may store src[i] in a register at each line of expr_jump(i) above.
These instructions are useless and will degrade performance.
This inline asm is a hint for gcc: "the memory has been modified,
do not cache it".
MSVC, Clang, ICC can generate expected instructions without this hint.
*/
#if YYJSON_IS_REAL_GCC
__asm__ volatile("":"=m"(*src));
#endif
if (likely(*src == '"')) {
val->tag = ((u64)(src - cur) << YYJSON_TAG_BIT) |
(u64)(YYJSON_TYPE_STR | YYJSON_SUBTYPE_NOESC);
val->uni.str = (const char *)cur;
*src = '\0';
*end = src + 1;
return true;
}
skip_utf8:
....
// until here
dst = src;
copy_escape:
}
Inline assembler indication
Why can he/she know the behavior of the GCC? Maybe he has lookup the assembler code of the function.
/*
GCC may store src[i] in a register at each line of expr_jump(i) above.
These instructions are useless and will degrade performance.
This inline asm is a hint for gcc: "the memory has been modified,
do not cache it".
MSVC, Clang, ICC can generate expected instructions without this hint.
*/
#if YYJSON_IS_REAL_GCC
__asm__ volatile("":"=m"(*src));
#endif