@@ -48,6 +48,12 @@ let # Feature detection -- copied from ScanByte.jl
48
48
features = split (unsafe_string (features_cstring), ' ,' )
49
49
Libc. free (features_cstring)
50
50
51
+ # prefix_xor works like this: it goes through the bits of the input UInt from least significant to most significant
52
+ # and it xors the current input bit with the previous output bit (a running XOR). This means that it produces 0s until it meets the first 1, then
53
+ # it starts producing 1s until it meets the next 1, then it starts producing 0s again, etc.
54
+ # Example:
55
+ # 0b00001000010
56
+ # -> 0b00000111110
51
57
@eval if _AVOID_PLATFORM_SPECIFIC_LLVM_CODE || ! any (x-> occursin (" clmul" , x), $ (features))
52
58
@inline function prefix_xor (q)
53
59
mask = q ⊻ (q << 1 )
@@ -59,6 +65,7 @@ let # Feature detection -- copied from ScanByte.jl
59
65
return mask
60
66
end
61
67
else
68
+ # Cool explainer on carryless multiplication: https://wunkolo.github.io/post/2020/05/pclmulqdq-tricks/
62
69
function carrylessmul (a:: NTuple{2,VecElement{UInt64}} , b:: NTuple{2,VecElement{UInt64}} )
63
70
ccall (" llvm.x86.pclmulqdq" , llvmcall, NTuple{2 ,VecElement{UInt64}}, (NTuple{2 ,VecElement{UInt64}}, NTuple{2 ,VecElement{UInt64}}, UInt8), a, b, 0 )
64
71
end
@@ -123,13 +130,38 @@ end
123
130
# These must be respected in the `_find_newlines_kernel!` and `_find_newlines_generic!`
124
131
# All other cases are unambiguous, i.e. we can tell if we are inside a string or not,
125
132
# and if we are, we can tell if we are on an escape or not.
133
+ """
134
+ Lexer{E,OQ,CQ,NL,IO_t}
135
+
136
+ Lexer(io, escapechar, openquotechar, closequotechar, newline) -> Lexer{E,OQ,CQ,NL,IO_t}
137
+ Lexer(io, nothing, newline) -> Lexer{Nothing,Nothing,Nothing,NL,IO_t}
138
+
139
+ A stateful lexer type for newline detection. Use with the `find_newlines!` function.
140
+ The type parameters are:
141
+
142
+ - `E`: the escape character
143
+ - `OQ`: the open quote character
144
+ - `CQ`: the close quote character
145
+ - `NL`: the newline character
146
+ - `IO_t`: the type of the IO object, e.g. `IOBuffer` or `IOStream`
147
+
148
+ Either `E`, `OQ`, and `CQ` are all `Nothing`, or they are all single-byte characters.
149
+
150
+ When `E`, `OQ`, and `CQ` are not `Nothing`, the lexer will find all newlines in the input,
151
+ that are not inside a string (between two quotes). This is useful for finding record separators
152
+ in CSVs.
153
+
154
+ If they are all `Nothing`, the lexer will be quote-unaware, and find all newlines in the input,
155
+ regardless of whether they are inside a string or not. You can construct such a lexer with
156
+ `Lexer(io, nothing, newline)`.
157
+ """
126
158
mutable struct Lexer{E,OQ,CQ,NL,IO_t}
127
159
@constfield io:: IO_t
128
160
@constfield escape:: Vec{64, UInt8}
129
161
@constfield quotechar:: Vec{64, UInt8}
130
162
@constfield newline:: Vec{64, UInt8}
131
- prev_escaped:: UInt # 0 or 1
132
- prev_in_string:: UInt # 0 or typemax(UInt)
163
+ prev_escaped:: UInt # 0 or 1, see the tables above
164
+ prev_in_string:: UInt # 0 or typemax(UInt), see the tables above
133
165
done:: Bool # Right now, this is not used but could be set by the caller
134
166
135
167
function Lexer (
192
224
# `possibly_not_in_string` inspects only the state carried over between chunks
# (`prev_in_string` is 0 or typemax(UInt), `prev_escaped` is 0 or 1).

# Quote and escape are the same character: compare the lowest bit of `prev_in_string`
# with `prev_escaped`.
function possibly_not_in_string(l::Lexer{Q,Q,Q}) where {Q}
    in_string_bit = l.prev_in_string & UInt(1)
    return in_string_bit == l.prev_escaped
end

# Every other lexer: true exactly when `prev_in_string` is zero.
function possibly_not_in_string(l::Lexer{E,Q}) where {E,Q}
    return l.prev_in_string == 0
end
194
226
227
+ # This is where we process 64 byte input when the quotechar and escapechar are identical.
228
+ # This is our adaptation of the original `simdjson` implementation which handles the escaping rules
229
+ # common in CSVs.
230
+ #
231
+ # An example showing intermediate results when parsing 64 bytes with one quoted newline in a
232
+ # string and one unquoted newline:
233
+ # Note all the bits are reversed for readability. `*` marks the newlines.
234
+ #
235
+ # "abc,"quoted,field","quoted*newline","escaped"" """" """"",01234*"
236
+ # X 0b0000100000000000010100000000000000101000000011011110111110000000
237
+ # F 0b0000010000000000001010000000000000010100000001101111011111000000
238
+ # SEQ 0b0000100000000000010100000000000000101000000010010000100000000000
239
+ # EB 0b1010101010101010101010101010101010101010101010101010101010101010
240
+ # OS 0b0000000000000000010100000000000000000000000000010000000000000000
241
+ # ES 0b0000100000000000000000000000000000101000000010000000100000000000
242
+ # OC 0b0000000000000000001010000000000000000000000000000000000000000000
243
+ # EC 0b0000010000000000000000000000000000010100000000000000000001000000
244
+ # Q 0b0000100000000000010100000000000000101000000000000000000010000000
245
+ # PX 0b0000111111111111100111111111111111001111111111111111111100000000
246
+ # STR 0b0000111111111111100111111111111111001111111111111111111100000000
247
+ # NL 0b0000000000000000000000000000000000000000000000000000000000000001
248
+ # "abc,"quoted,field","quoted*newline","escaped"" """" """"",01234*
195
249
@inline function _find_newlines_kernel! (l:: Lexer{Q,Q,Q} , input:: Vec{64, UInt8} ) where {Q}
196
250
escape_chars = compress_escapes (l, input)
197
251
follows_escape = escape_chars << 1
@@ -224,7 +278,9 @@ possibly_not_in_string(l::Lexer{E,Q}) where {E,Q} = l.prev_in_string == 0
224
278
# This ignores strings that are entirely made up of quotes, e.g. "", """", etc.
225
279
# But those cannot contain newlines so we don't care
226
280
# Shift by one as carries are always one bit off due to the addition 0b0001 + 0b0001 = 0b0010
227
- quotes = ((even_string_starts | odd_string_starts) >> 1 ) ⊻ l. prev_escaped # TODO : explain the xor and how it works with the prev_escaped
281
+ # When `l.prev_escaped` is set, it means we ended on an unescaped quote, so we need to add
282
+ # it here.
283
+ quotes = ((even_string_starts | odd_string_starts) >> 1 ) ⊻ l. prev_escaped
228
284
in_string = prefix_xor (quotes) ⊻ l. prev_in_string
229
285
newlines = compress_newlines (l, input) & ~ in_string
230
286
@@ -253,6 +309,25 @@ possibly_not_in_string(l::Lexer{E,Q}) where {E,Q} = l.prev_in_string == 0
253
309
return newlines
254
310
end
255
311
312
+ # This is where we process 64 byte input when the quotechar and escapechar are different characters.
313
+ # In this case we follow the implementation from `simdjson`.
314
+ # See section "3.1.1 Identification of the quoted substrings" in https://arxiv.org/pdf/1902.08318.pdf
315
+ #
316
+ # An example showing intermediate results when parsing 64 bytes with one quoted newline in a
317
+ # string and one unquoted newline:
318
+ # Note all the bits are reversed for readability. `*` marks the newlines.
319
+ #
320
+ # "abc,"quoted,field","quoted*newline","escaped\\ \\\\ \"\"",01234*"
321
+ # X 0b0000000000000000000000000000000000000000000011011110101000000000
322
+ # F 0b0000000000000000000000000000000000000000000001101111010100000000 X << 1 | l.prev_escaped
323
+ # EB 0b1010101010101010101010101010101010101010101010101010101010101010
324
+ # OS 0b0000000000000000000000000000000000000000000000010000000000000000 X & ~EB & ~F
325
+ # EC 0b0000000000000000000000000000000000000000000011000001101000000000 X + OS
326
+ # IM 0b0000000000000000000000000000000000000000000001100000110100000000 EC << 1
327
+ # E 0b0000000000000000000000000000000000000000000001001010010100000000 (EB ⊻ IM) & F
328
+ # Q 0b0000100000000000010100000000000000101000000000000000000010000000 quotes & ~E
329
+ # STR 0b0000111111111111100111111111111111001111111111111111111100000000 CLMUL(Q)
330
+ # NL 0b0000000000000000000000000000000000000000000000000000000000000001
256
331
@inline function _find_newlines_kernel! (l:: Lexer{E,Q,Q} , input:: Vec{64, UInt8} ) where {E,Q}
257
332
escape_chars = compress_escapes (l, input) & ~ l. prev_escaped
258
333
follows_escape = escape_chars << 1 | l. prev_escaped
277
352
# "\nIM 0b", bitstring(SIMD.Intrinsics.bitreverse(invert_mask)), " EC << 1",
278
353
# "\nE 0b", bitstring(SIMD.Intrinsics.bitreverse(escaped)), " (EB ⊻ IM) & F",
279
354
# "\nQ 0b", bitstring(SIMD.Intrinsics.bitreverse(quotes)), " quotes & ~E",
280
- # "\nS 0b", bitstring(SIMD.Intrinsics.bitreverse(in_string)), " CLMUL(Q)",
281
- # "\nL 0b", bitstring(SIMD.Intrinsics.bitreverse(newlines)),
355
+ # "\nSTR 0b", bitstring(SIMD.Intrinsics.bitreverse(in_string)), " CLMUL(Q)",
356
+ # "\nNL 0b", bitstring(SIMD.Intrinsics.bitreverse(newlines)),
282
357
# "\n \"", replace(join(map(x->Char(x.value), collect(input.data))), "\n" => "*"), "\"",
283
358
# "\n[E] 0b", bitstring(SIMD.Intrinsics.bitreverse(UInt(_overflowed_odd))),
284
359
# "\n[Q] 0b", bitstring((in_string >> 63) * typemax(UInt)),
289
364
return newlines
290
365
end
291
366
367
+ # Generic fallback for when the open and close quotes differ; it is also used when the buffer, or its trailing bytes, are too small for SIMD (i.e. < 64 bytes).
292
368
function _find_newlines_generic! (l:: Lexer{E,OQ,CQ} , buf, out, curr_pos:: Int = firstindex (buf), end_pos:: Int = lastindex (buf)) where {E,OQ,CQ}
293
- @assert 1 <= curr_pos <= end_pos <= length (buf) <= typemax (Int32)
369
+ @assert ( 1 <= curr_pos <= end_pos <= length (buf) && end_pos <= typemax (Int32) )
294
370
structural_characters = _scanbyte_bytes (l)
295
371
296
372
ptr = pointer (buf)
@@ -388,8 +464,9 @@ function _find_newlines_generic!(l::Lexer{E,OQ,CQ}, buf, out, curr_pos::Int=firs
388
464
return nothing
389
465
end
390
466
467
+ # Quote-unaware lexer we use for trailing bytes of inputs with length that is not a multiple of 64.
391
468
function _find_newlines_quote_unaware_scanbyte! (l:: Lexer{E,OQ,CQ,NL} , buf:: Vector{UInt8} , out:: AbstractVector{Int32} , curr_pos:: Int = firstindex (buf), end_pos:: Int = lastindex (buf)) where {E,OQ,CQ,NL}
392
- @assert 1 <= curr_pos <= end_pos <= length (buf) <= typemax (Int32)
469
+ @assert ( 1 <= curr_pos <= end_pos <= length (buf) && end_pos <= typemax (Int32) )
393
470
ptr = pointer (buf, curr_pos)
394
471
bytes_to_search = end_pos - curr_pos + 1
395
472
base = Int32 (curr_pos - 1 )
@@ -409,8 +486,9 @@ function _find_newlines_quote_unaware_scanbyte!(l::Lexer{E,OQ,CQ,NL}, buf::Vecto
409
486
end
410
487
end
411
488
489
+ # Quote-unaware lexer which processes the input in 64-byte chunks and leaves the remaining trailing bytes to `_find_newlines_quote_unaware_scanbyte!`.
412
490
function _find_newlines_quote_unaware_simd! (l:: Lexer , buf:: Vector{UInt8} , out:: AbstractVector{Int32} , curr_pos:: Int = firstindex (buf), end_pos:: Int = lastindex (buf))
413
- @assert 1 <= curr_pos <= end_pos <= length (buf) <= typemax (Int32)
491
+ @assert ( 1 <= curr_pos <= end_pos <= length (buf) && end_pos <= typemax (Int32) )
414
492
base = unsafe_trunc (Int32, curr_pos)
415
493
@inbounds while curr_pos <= (end_pos - 63 )
416
494
input = vload (Vec{64 , UInt8}, buf, curr_pos)
@@ -429,22 +507,32 @@ function _find_newlines_quote_unaware_simd!(l::Lexer, buf::Vector{UInt8}, out::A
429
507
end
430
508
end
431
509
510
+ """
511
+ find_newlines!(l::Lexer, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf))
512
+
513
+ Find newlines in `buf[curr_pos:end_pos]` and push their positions to `out`. The newline positions are relative to the beginning of `buf`.
514
+ The type of the `Lexer` determines the rules for handling quotes and escapes. See `Lexer` for details.
515
+
516
+ `1 <= curr_pos <= end_pos <= length(buf)` must hold, and `end_pos` must not exceed `typemax(Int32)`; otherwise an `ArgumentError` is thrown.
517
+ """
518
+ function find_newlines! end
519
+
432
520
# Generic fallback for when the open and close quotes differ (it should also be used when
# the buffer is too small for SIMD).
# Validates the requested range and then delegates to `_find_newlines_generic!`.
# Throws an `ArgumentError` unless `1 <= curr_pos <= end_pos <= length(buf)` and
# `end_pos <= typemax(Int32)`.
function find_newlines!(l::Lexer{E,OQ,CQ}, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf)) where {E,OQ,CQ}
    # Only `end_pos` has to fit into an `Int32`; `length(buf)` itself may be larger, so the
    # message must not claim `length(buf) <= typemax(Int32)`.
    if !(1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32))
        throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) and end_pos <= $(typemax(Int32))"))
    end
    _find_newlines_generic!(l, buf, out, curr_pos, end_pos)
    return nothing
end
438
526
# Fast path for when no newlines may appear inside quotes (the lexer has no escape or
# quote characters configured).
# Validates the requested range and then delegates to `_find_newlines_quote_unaware_simd!`.
# Throws an `ArgumentError` unless `1 <= curr_pos <= end_pos <= length(buf)` and
# `end_pos <= typemax(Int32)`.
function find_newlines!(l::Lexer{Nothing,Nothing,Nothing}, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf))
    # Only `end_pos` has to fit into an `Int32`; `length(buf)` itself may be larger, so the
    # message must not claim `length(buf) <= typemax(Int32)`.
    if !(1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32))
        throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) and end_pos <= $(typemax(Int32))"))
    end
    _find_newlines_quote_unaware_simd!(l, buf, out, curr_pos, end_pos)
    return nothing
end
444
532
445
533
# Path for when open and close quote are the same (escape might be different or the same as the quote)
446
534
function find_newlines! (l:: Lexer{E,Q,Q} , buf:: Vector{UInt8} , out:: AbstractVector{Int32} , curr_pos:: Int = firstindex (buf), end_pos:: Int = lastindex (buf)) where {E,Q}
447
- 1 <= curr_pos <= end_pos <= length (buf) <= typemax (Int32) || throw (ArgumentError (" Invalid range: $curr_pos :$end_pos , must be 1 <= curr_pos <= end_pos <= $(length (buf)) <= $(typemax (Int32)) " ))
535
+ ( 1 <= curr_pos <= end_pos <= length (buf) && end_pos <= typemax (Int32) ) || throw (ArgumentError (" Invalid range: $curr_pos :$end_pos , must be 1 <= curr_pos <= end_pos <= $(length (buf)) <= $(typemax (Int32)) " ))
448
536
base = unsafe_trunc (Int32, curr_pos)
449
537
@inbounds while curr_pos <= (end_pos - 63 )
450
538
input = vload (Vec{64 , UInt8}, buf, curr_pos)
0 commit comments