Skip to content

Commit a297b35

Browse files
Merge pull request #8 from JuliaData/td-docs-tweaks
[NFC] Add more comments
2 parents e1bfb0c + e9327b9 commit a297b35

File tree

2 files changed

+100
-12
lines changed

2 files changed

+100
-12
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# NewlineLexers.jl
22

3-
Quote-aware newline finder.
3+
Quote-aware newline finder. By default it uses a branchless algorithm to find newlines, and is able to skip those which appear inside string fields. This is useful for parsing CSV files, for example, where we want to quickly find all newlines that separate individual records.
44

55
```julia
66
julia> data = collect(codeunits(""" abc\n "efg\n" \n """));

src/NewlineLexers.jl

+99-11
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,12 @@ let # Feature detection -- copied from ScanByte.jl
4848
features = split(unsafe_string(features_cstring), ',')
4949
Libc.free(features_cstring)
5050

51+
# prefix_xor works like this: it goes through the bits of the input UInt from least significant to most significant
52+
# and it xors the current bit with the previous output bit (i.e. it computes a running xor). This means that it produces 0s until it meets the first 1, then
53+
# it starts producing 1s until it meets the next 1, then it starts producing 0s again, etc.
54+
# Example:
55+
# 0b00001000010
56+
# -> 0b00000111110
5157
@eval if _AVOID_PLATFORM_SPECIFIC_LLVM_CODE || !any(x->occursin("clmul", x), $(features))
5258
@inline function prefix_xor(q)
5359
mask = q ⊻ (q << 1)
@@ -59,6 +65,7 @@ let # Feature detection -- copied from ScanByte.jl
5965
return mask
6066
end
6167
else
68+
# Cool explainer on carryless multiplication: https://wunkolo.github.io/post/2020/05/pclmulqdq-tricks/
6269
function carrylessmul(a::NTuple{2,VecElement{UInt64}}, b::NTuple{2,VecElement{UInt64}})
6370
ccall("llvm.x86.pclmulqdq", llvmcall, NTuple{2,VecElement{UInt64}}, (NTuple{2,VecElement{UInt64}}, NTuple{2,VecElement{UInt64}}, UInt8), a, b, 0)
6471
end
@@ -123,13 +130,38 @@ end
123130
# These must be respected in the `_find_newlines_kernel!` and `_find_newlines_generic!`
124131
# All other cases are unambiguous, i.e. we can tell if we are inside a string or not,
125132
# and if we are, we can tell if we are on an escape or not.
133+
"""
134+
Lexer{E,OQ,CQ,NL,IO_t}
135+
136+
Lexer(io, escapechar, openquotechar, closequotechar, newline) -> Lexer{E,OQ,CQ,NL,IO_t}
137+
Lexer(io, nothing, newline) -> Lexer{Nothing,Nothing,Nothing,NL,IO_t}
138+
139+
A stateful lexer type for newline detection. Use with the `find_newlines!` function.
140+
The type parameters are:
141+
142+
- `E`: the escape character
143+
- `OQ`: the open quote character
144+
- `CQ`: the close quote character
145+
- `NL`: the newline character
146+
- `IO_t`: the type of the IO object, e.g. `IOBuffer` or `IOStream`
147+
148+
Either `E`, `OQ`, and `CQ` are all `Nothing`, or they are all single-byte characters.
149+
150+
When `E`, `OQ`, and `CQ` are not `Nothing`, the lexer will find all newlines in the input
151+
that are not inside a string (between two quotes). This is useful for finding record separators
152+
in CSVs.
153+
154+
If they are all `Nothing`, the lexer will be quote-unaware, and find all newlines in the input,
155+
regardless of whether they are inside a string or not. You can construct such a lexer with
156+
`Lexer(io, nothing, newline)`.
157+
"""
126158
mutable struct Lexer{E,OQ,CQ,NL,IO_t}
127159
@constfield io::IO_t
128160
@constfield escape::Vec{64, UInt8}
129161
@constfield quotechar::Vec{64, UInt8}
130162
@constfield newline::Vec{64, UInt8}
131-
prev_escaped::UInt # 0 or 1
132-
prev_in_string::UInt # 0 or typemax(UInt)
163+
prev_escaped::UInt # 0 or 1, see the tables above
164+
prev_in_string::UInt # 0 or typemax(UInt), see the tables above
133165
done::Bool # Right now, this is not used but could be set by the caller
134166

135167
function Lexer(
@@ -192,6 +224,28 @@ end
192224
possibly_not_in_string(l::Lexer{Q,Q,Q}) where {Q} = (l.prev_in_string & UInt(1)) == l.prev_escaped
193225
possibly_not_in_string(l::Lexer{E,Q}) where {E,Q} = l.prev_in_string == 0
194226

227+
# This is where we process a 64-byte input when the quotechar and escapechar are identical.
228+
# This is our adaptation of the original `simdjson` implementation which handles the escaping rules
229+
# common in CSVs.
230+
#
231+
# An example showing intermediate results when parsing 64 bytes with one quoted newline in a
232+
# string and one unquoted newline:
233+
# Note all the bits are reversed for readability. `*` marks the newlines.
234+
#
235+
# "abc,"quoted,field","quoted*newline","escaped"" """" """"",01234*"
236+
# X 0b0000100000000000010100000000000000101000000011011110111110000000
237+
# F 0b0000010000000000001010000000000000010100000001101111011111000000
238+
# SEQ 0b0000100000000000010100000000000000101000000010010000100000000000
239+
# EB 0b1010101010101010101010101010101010101010101010101010101010101010
240+
# OS 0b0000000000000000010100000000000000000000000000010000000000000000
241+
# ES 0b0000100000000000000000000000000000101000000010000000100000000000
242+
# OC 0b0000000000000000001010000000000000000000000000000000000000000000
243+
# EC 0b0000010000000000000000000000000000010100000000000000000001000000
244+
# Q 0b0000100000000000010100000000000000101000000000000000000010000000
245+
# PX 0b0000111111111111100111111111111111001111111111111111111100000000
246+
# STR 0b0000111111111111100111111111111111001111111111111111111100000000
247+
# NL 0b0000000000000000000000000000000000000000000000000000000000000001
248+
# "abc,"quoted,field","quoted*newline","escaped"" """" """"",01234*
195249
@inline function _find_newlines_kernel!(l::Lexer{Q,Q,Q}, input::Vec{64, UInt8}) where {Q}
196250
escape_chars = compress_escapes(l, input)
197251
follows_escape = escape_chars << 1
@@ -224,7 +278,9 @@ possibly_not_in_string(l::Lexer{E,Q}) where {E,Q} = l.prev_in_string == 0
224278
# This ignores strings that are entirely made up of quotes, e.g. "", """", etc.
225279
# But those cannot contain newlines so we don't care
226280
# Shift by one as carries are always one bit off due to the addition 0b0001 + 0b0001 = 0b0010
227-
quotes = ((even_string_starts | odd_string_starts) >> 1) ⊻ l.prev_escaped # TODO: explain the xor and how it works with the prev_escaped
281+
# When `l.prev_escaped` is set, it means we ended on an unescaped quote, so we need to add
282+
# it here.
283+
quotes = ((even_string_starts | odd_string_starts) >> 1) ⊻ l.prev_escaped
228284
in_string = prefix_xor(quotes) ⊻ l.prev_in_string
229285
newlines = compress_newlines(l, input) & ~in_string
230286

@@ -253,6 +309,25 @@ possibly_not_in_string(l::Lexer{E,Q}) where {E,Q} = l.prev_in_string == 0
253309
return newlines
254310
end
255311

312+
# This is where we process a 64-byte input when the quotechar and escapechar are different characters.
313+
# In this case we follow the implementation from `simdjson`.
314+
# See section "3.1.1 Identification of the quoted substrings" in https://arxiv.org/pdf/1902.08318.pdf
315+
#
316+
# An example showing intermediate results when parsing 64 bytes with one quoted newline in a
317+
# string and one unquoted newline:
318+
# Note all the bits are reversed for readability. `*` marks the newlines.
319+
#
320+
# "abc,"quoted,field","quoted*newline","escaped\\ \\\\ \"\"",01234*"
321+
# X 0b0000000000000000000000000000000000000000000011011110101000000000
322+
# F 0b0000000000000000000000000000000000000000000001101111010100000000 X << 1 | l.prev_escaped
323+
# EB 0b1010101010101010101010101010101010101010101010101010101010101010
324+
# OS 0b0000000000000000000000000000000000000000000000010000000000000000 X & ~EB & ~F
325+
# EC 0b0000000000000000000000000000000000000000000011000001101000000000 X + OS
326+
# IM 0b0000000000000000000000000000000000000000000001100000110100000000 EC << 1
327+
# E 0b0000000000000000000000000000000000000000000001001010010100000000 (EB ⊻ IM) & F
328+
# Q 0b0000100000000000010100000000000000101000000000000000000010000000 quotes & ~E
329+
# STR 0b0000111111111111100111111111111111001111111111111111111100000000 CLMUL(Q)
330+
# NL 0b0000000000000000000000000000000000000000000000000000000000000001
256331
@inline function _find_newlines_kernel!(l::Lexer{E,Q,Q}, input::Vec{64, UInt8}) where {E,Q}
257332
escape_chars = compress_escapes(l, input) & ~l.prev_escaped
258333
follows_escape = escape_chars << 1 | l.prev_escaped
@@ -277,8 +352,8 @@ end
277352
# "\nIM 0b", bitstring(SIMD.Intrinsics.bitreverse(invert_mask)), " EC << 1",
278353
# "\nE 0b", bitstring(SIMD.Intrinsics.bitreverse(escaped)), " (EB ⊻ IM) & F",
279354
# "\nQ 0b", bitstring(SIMD.Intrinsics.bitreverse(quotes)), " quotes & ~E",
280-
# "\nS 0b", bitstring(SIMD.Intrinsics.bitreverse(in_string)), " CLMUL(Q)",
281-
# "\nL 0b", bitstring(SIMD.Intrinsics.bitreverse(newlines)),
355+
# "\nSTR 0b", bitstring(SIMD.Intrinsics.bitreverse(in_string)), " CLMUL(Q)",
356+
# "\nNL 0b", bitstring(SIMD.Intrinsics.bitreverse(newlines)),
282357
# "\n \"", replace(join(map(x->Char(x.value), collect(input.data))), "\n" => "*"), "\"",
283358
# "\n[E] 0b", bitstring(SIMD.Intrinsics.bitreverse(UInt(_overflowed_odd))),
284359
# "\n[Q] 0b", bitstring((in_string >> 63) * typemax(UInt)),
@@ -289,8 +364,9 @@ end
289364
return newlines
290365
end
291366

367+
# Generic fallback for when the open and close quotes differ, and for when the buffer (or its trailing bytes) is too small for SIMD, i.e. < 64 bytes.
292368
function _find_newlines_generic!(l::Lexer{E,OQ,CQ}, buf, out, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf)) where {E,OQ,CQ}
293-
@assert 1 <= curr_pos <= end_pos <= length(buf) <= typemax(Int32)
369+
@assert (1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32))
294370
structural_characters = _scanbyte_bytes(l)
295371

296372
ptr = pointer(buf)
@@ -388,8 +464,9 @@ function _find_newlines_generic!(l::Lexer{E,OQ,CQ}, buf, out, curr_pos::Int=firs
388464
return nothing
389465
end
390466

467+
# Quote-unaware lexer we use for trailing bytes of inputs with length that is not a multiple of 64.
391468
function _find_newlines_quote_unaware_scanbyte!(l::Lexer{E,OQ,CQ,NL}, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf)) where {E,OQ,CQ,NL}
392-
@assert 1 <= curr_pos <= end_pos <= length(buf) <= typemax(Int32)
469+
@assert (1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32))
393470
ptr = pointer(buf, curr_pos)
394471
bytes_to_search = end_pos - curr_pos + 1
395472
base = Int32(curr_pos - 1)
@@ -409,8 +486,9 @@ function _find_newlines_quote_unaware_scanbyte!(l::Lexer{E,OQ,CQ,NL}, buf::Vecto
409486
end
410487
end
411488

489+
# Quote-unaware lexer which processes the input in 64-byte chunks and leaves the remaining trailing bytes to `_find_newlines_quote_unaware_scanbyte!`.
412490
function _find_newlines_quote_unaware_simd!(l::Lexer, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf))
413-
@assert 1 <= curr_pos <= end_pos <= length(buf) <= typemax(Int32)
491+
@assert (1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32))
414492
base = unsafe_trunc(Int32, curr_pos)
415493
@inbounds while curr_pos <= (end_pos - 63)
416494
input = vload(Vec{64, UInt8}, buf, curr_pos)
@@ -429,22 +507,32 @@ function _find_newlines_quote_unaware_simd!(l::Lexer, buf::Vector{UInt8}, out::A
429507
end
430508
end
431509

510+
"""
511+
find_newlines!(l::Lexer, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf))
512+
513+
Find newlines in `buf[curr_pos:end_pos]` and push their positions to `out`. The newline positions are relative to the beginning of `buf`.
514+
The type of the `Lexer` determines the rules for handling quotes and escapes. See `Lexer` for details.
515+
516+
`1 <= curr_pos <= end_pos <= length(buf)` must hold, and `end_pos` must not exceed `typemax(Int32)`.
517+
"""
518+
function find_newlines! end
519+
432520
# Generic fallback for when the open and close quotes differ (should also be used in case the buffer is too small for SIMD).
433521
function find_newlines!(l::Lexer{E,OQ,CQ}, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf)) where {E,OQ,CQ}
434-
1 <= curr_pos <= end_pos <= length(buf) <= typemax(Int32) || throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) <= $(typemax(Int32))"))
522+
(1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32)) || throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) <= $(typemax(Int32))"))
435523
_find_newlines_generic!(l, buf, out, curr_pos, end_pos)
436524
return nothing
437525
end
438526
# Fast path for when no newlines may appear inside quotes.
439527
function find_newlines!(l::Lexer{Nothing,Nothing,Nothing}, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf))
440-
1 <= curr_pos <= end_pos <= length(buf) <= typemax(Int32) || throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) <= $(typemax(Int32))"))
528+
(1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32)) || throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) <= $(typemax(Int32))"))
441529
_find_newlines_quote_unaware_simd!(l, buf, out, curr_pos, end_pos)
442530
return nothing
443531
end
444532

445533
# Path for when open and close quote are the same (escape might be different or the same as the quote)
446534
function find_newlines!(l::Lexer{E,Q,Q}, buf::Vector{UInt8}, out::AbstractVector{Int32}, curr_pos::Int=firstindex(buf), end_pos::Int=lastindex(buf)) where {E,Q}
447-
1 <= curr_pos <= end_pos <= length(buf) <= typemax(Int32) || throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) <= $(typemax(Int32))"))
535+
(1 <= curr_pos <= end_pos <= length(buf) && end_pos <= typemax(Int32)) || throw(ArgumentError("Invalid range: $curr_pos:$end_pos, must be 1 <= curr_pos <= end_pos <= $(length(buf)) <= $(typemax(Int32))"))
448536
base = unsafe_trunc(Int32, curr_pos)
449537
@inbounds while curr_pos <= (end_pos - 63)
450538
input = vload(Vec{64, UInt8}, buf, curr_pos)

0 commit comments

Comments
 (0)