//
// Notes on standards compliance: https://datatracker.ietf.org/doc/html/rfc8259
// * RFC 8259 requires JSON documents be valid UTF-8,
// but makes an allowance for systems that are "part of a closed ecosystem".
// I have no idea what that's supposed to mean in the context of a standard specification.
// This implementation requires inputs to be valid UTF-8.
// * RFC 8259 contradicts itself regarding whether lowercase is allowed in \u hex digits,
// but this is probably a bug in the spec, and it's clear that lowercase is meant to be allowed.
// (RFC 5234 defines HEXDIG to only allow uppercase.)
// * When RFC 8259 refers to a "character", I assume they really mean a "Unicode scalar value".
// See http://www.unicode.org/glossary/#unicode_scalar_value .
// * RFC 8259 doesn't explicitly disallow unpaired surrogate halves in \u escape sequences,
// but vaguely implies that \u escapes are for encoding Unicode "characters" (i.e. Unicode scalar values?),
// which would mean that unpaired surrogate halves are forbidden.
// By contrast ECMA-404 (a competing(/compatible?) JSON standard, which JavaScript's JSON.parse() conforms to)
// explicitly allows unpaired surrogate halves.
// This implementation forbids unpaired surrogate halves in \u sequences.
// If a high surrogate half appears in a \u sequence,
// then a low surrogate half must immediately follow in \u notation.
// * RFC 8259 allows implementations to "accept non-JSON forms or extensions".
// This implementation does not accept any of that.
// * RFC 8259 allows implementations to put limits on "the size of texts",
// "the maximum depth of nesting", "the range and precision of numbers",
// and "the length and character contents of strings".
// This low-level implementation does not limit these,
// except where noted above, and except that nesting depth requires memory allocation.
// Note that this low-level API does not interpret numbers numerically,
// but simply emits their source form for some higher level code to make sense of.
// * This low-level implementation allows duplicate object keys,
// and key/value pairs are emitted in the order they appear in the input.
const std = @import("std");
const Allocator = std.mem.Allocator;
const ArrayList = std.ArrayList;
const assert = std.debug.assert;
const BitStack = std.BitStack;
// validate(): Scan the input and check for malformed JSON. On … [description truncated in this copy]
/// Reports whether `s` is a well-formed JSON document.
///
/// A `Scanner` is run over the complete input and every token is discarded.
/// Returns `true` once `.end_of_document` is produced, and `false` if the
/// scanner reports `error.SyntaxError` or `error.UnexpectedEndOfInput`.
/// `error.OutOfMemory` from the scanner is passed through unchanged.
pub fn validate(allocator: Allocator, s: []const u8) Allocator.Error!bool {
    var scanner = Scanner.initCompleteInput(allocator, s);
    defer scanner.deinit();

    while (true) {
        const token = scanner.next() catch |err| switch (err) {
            error.OutOfMemory => return error.OutOfMemory,
            // The scanner was given the complete input, so it can never ask for more.
            error.BufferUnderrun => unreachable,
            error.SyntaxError, error.UnexpectedEndOfInput => return false,
        };
        if (token == .end_of_document) return true;
    }
}
// Error: The parsing errors are divided into two categories: … [list truncated in this copy]
/// Errors that describe a problem with the JSON input itself.
/// `validate()` maps both of these to a `false` result.
pub const Error = error{ SyntaxError, UnexpectedEndOfInput };
// reader(): Calls … [description truncated in this copy]
/// Convenience constructor: builds a `Reader` with `default_buffer_size` around
/// `io_reader`, inferring the reader type from the argument.
pub fn reader(allocator: Allocator, io_reader: anytype) Reader(default_buffer_size, @TypeOf(io_reader)) {
    const JsonReader = Reader(default_buffer_size, @TypeOf(io_reader));
    return JsonReader.init(allocator, io_reader);
}
// default_buffer_size: Used by … [description truncated in this copy]
/// Buffer size, in bytes, that `reader()` passes to `Reader()`.
pub const default_buffer_size = 0x1000;
// Token: The tokens emitted by … [description truncated in this copy]
/// A single token produced by `Scanner.next()` / `Reader.next()` and the
/// `nextAlloc*` helpers.
pub const Token = union(enum) {
// Structural tokens: emitted for '{', '}', '[' and ']'.
object_begin,
object_end,
array_begin,
array_end,
// Literal tokens for `true`, `false` and `null`.
true,
false,
null,
// A complete number, or the final piece of a number that was preceded by
// one or more `partial_number` tokens.
number: []const u8,
// A leading piece of a number whose end has not been seen yet.
partial_number: []const u8,
// Produced only by `nextAllocMax()`, from `ArrayList.toOwnedSlice()`.
allocated_number: []u8,
// A complete string, or the final piece of a string that was preceded by
// one or more `partial_string*` tokens.
string: []const u8,
// A leading piece of a string taken verbatim from the input.
partial_string: []const u8,
// Decoded bytes of an escape sequence. `next()` emits the 1-byte form for
// \b, \f, \n, \r and \t; \u escapes go through `partialStringCodepoint`
// (not shown here) — presumably choosing the variant by UTF-8 length.
partial_string_escaped_1: [1]u8,
partial_string_escaped_2: [2]u8,
partial_string_escaped_3: [3]u8,
partial_string_escaped_4: [4]u8,
// Produced only by `nextAllocMax()`, from `ArrayList.toOwnedSlice()`.
allocated_string: []u8,
// Emitted once the top-level value is complete and the input has ended.
end_of_document,
};
// TokenType: This is only used in … [description truncated in this copy]
/// The kind of token that the next call to `next()` will produce, as reported
/// by `peekNextTokenType()`. Partial and allocated variants of `Token` are
/// folded into `.number` / `.string`.
pub const TokenType = enum {
object_begin,
object_end,
array_begin,
array_end,
true,
false,
null,
number,
string,
end_of_document,
};
// Diagnostics: To enable diagnostics, declare … [description truncated in this copy]
/// Position information for a `Scanner`.
///
/// Register an instance with `Scanner.enableDiagnostics()`, which points
/// `cursor_pointer` at the scanner's cursor; the getters must not be called
/// before that because `cursor_pointer` starts out undefined.
/// `Scanner.feedInput()` updates `total_bytes_before_current_input` and
/// `line_start_cursor` each time a new input slice is supplied.
pub const Diagnostics = struct {
    line_number: u64 = 1,
    /// Start just "before" the input buffer to get a 1-based column for line 1.
    line_start_cursor: usize = @as(usize, @bitCast(@as(isize, -1))),
    total_bytes_before_current_input: u64 = 0,
    cursor_pointer: *const usize = undefined,

    /// Starts at 1.
    pub fn getLine(self: *const @This()) u64 {
        return self.line_number;
    }

    /// Starts at 1.
    /// Computed with wrapping subtraction because `line_start_cursor` may lie
    /// "before" the start of the current input slice.
    pub fn getColumn(self: *const @This()) u64 {
        return self.cursor_pointer.* -% self.line_start_cursor;
    }

    /// Starts at 0. Measures the byte offset since the start of the input.
    pub fn getByteOffset(self: *const @This()) u64 {
        return self.total_bytes_before_current_input + self.cursor_pointer.*;
    }
};
// AllocWhen: See the documentation for … [description truncated in this copy]
/// Allocation policy for the `nextAlloc*` / `allocNextIntoArrayList*` functions.
/// With `.alloc_if_needed`, a value that arrives as a single complete token is
/// returned as a slice without being copied into the list; with `.alloc_always`
/// it is always appended to the list.
pub const AllocWhen = enum { alloc_if_needed, alloc_always };
// default_max_value_len: For security, the maximum size allocated to store a single string or number value
// is limited to 4MiB by default. This limit can be specified by calling … [description truncated in this copy]
/// Default upper bound (4 MiB) passed as `max_value_len` by `nextAlloc()` and
/// `allocNextIntoArrayList()`.
pub const default_max_value_len = 4 * 1024 * 1024;
// Reader(): Connects a … [description truncated in this copy]
/// Builds a streaming JSON token reader type.
///
/// The returned struct pulls bytes from a `ReaderType` through a fixed
/// `buffer_size`-byte buffer and feeds them to a `Scanner`. Every scanning
/// method retries after refilling the buffer whenever the scanner reports
/// `error.BufferUnderrun`, so that error never reaches the caller.
pub fn Reader(comptime buffer_size: usize, comptime ReaderType: type) type {
    return struct {
        scanner: Scanner,
        reader: ReaderType,
        buffer: [buffer_size]u8 = undefined,

        /// Wraps `io_reader`. `allocator` is forwarded to `Scanner.initStreaming()`.
        pub fn init(allocator: Allocator, io_reader: ReaderType) @This() {
            return .{
                .scanner = Scanner.initStreaming(allocator),
                .reader = io_reader,
            };
        }

        /// Releases the scanner and invalidates `self`.
        pub fn deinit(self: *@This()) void {
            self.scanner.deinit();
            self.* = undefined;
        }

        /// Calls `Scanner.enableDiagnostics()` on the wrapped scanner.
        pub fn enableDiagnostics(self: *@This(), diagnostics: *Diagnostics) void {
            self.scanner.enableDiagnostics(diagnostics);
        }

        pub const NextError = ReaderType.Error || Error || Allocator.Error;
        pub const SkipError = NextError;
        pub const AllocError = NextError || error{ValueTooLong};
        pub const PeekError = ReaderType.Error || Error;

        /// Equivalent to `nextAllocMax(allocator, when, default_max_value_len)`.
        pub fn nextAlloc(self: *@This(), allocator: Allocator, when: AllocWhen) AllocError!Token {
            return self.nextAllocMax(allocator, when, default_max_value_len);
        }

        /// Returns the next token with strings and numbers delivered whole.
        ///
        /// For a number or string the value is gathered through
        /// `allocNextIntoArrayListMax()`. If that yields a slice, it is returned
        /// as `.number` / `.string`; otherwise the accumulated bytes are handed
        /// over with `toOwnedSlice()` as `.allocated_number` / `.allocated_string`.
        /// All other token kinds are forwarded from `next()`.
        pub fn nextAllocMax(self: *@This(), allocator: Allocator, when: AllocWhen, max_value_len: usize) AllocError!Token {
            const token_type = try self.peekNextTokenType();
            switch (token_type) {
                .number, .string => {
                    var value_list = ArrayList(u8).init(allocator);
                    // Only free the accumulated bytes when we fail; on success
                    // they are either unused or transferred to the caller.
                    errdefer {
                        value_list.deinit();
                    }
                    if (try self.allocNextIntoArrayListMax(&value_list, when, max_value_len)) |slice| {
                        return if (token_type == .number)
                            Token{ .number = slice }
                        else
                            Token{ .string = slice };
                    } else {
                        return if (token_type == .number)
                            Token{ .allocated_number = try value_list.toOwnedSlice() }
                        else
                            Token{ .allocated_string = try value_list.toOwnedSlice() };
                    }
                },

                // Simple tokens never alloc.
                .object_begin,
                .object_end,
                .array_begin,
                .array_end,
                .true,
                .false,
                .null,
                .end_of_document,
                => return try self.next(),
            }
        }

        /// Equivalent to `allocNextIntoArrayListMax(value_list, when, default_max_value_len)`.
        pub fn allocNextIntoArrayList(self: *@This(), value_list: *ArrayList(u8), when: AllocWhen) AllocError!?[]const u8 {
            return self.allocNextIntoArrayListMax(value_list, when, default_max_value_len);
        }

        /// Calls `Scanner.allocNextIntoArrayListMax()`, refilling the buffer and
        /// retrying each time the scanner reports `error.BufferUnderrun`.
        pub fn allocNextIntoArrayListMax(self: *@This(), value_list: *ArrayList(u8), when: AllocWhen, max_value_len: usize) AllocError!?[]const u8 {
            while (true) {
                return self.scanner.allocNextIntoArrayListMax(value_list, when, max_value_len) catch |err| switch (err) {
                    error.BufferUnderrun => {
                        try self.refillBuffer();
                        continue;
                    },
                    else => |other_err| return other_err,
                };
            }
        }

        /// Consumes one complete value without returning it.
        /// Objects and arrays are skipped through their matching end token.
        /// Calling this when the next token is `.object_end`, `.array_end` or
        /// `.end_of_document` is a programming error (`unreachable`).
        pub fn skipValue(self: *@This()) SkipError!void {
            switch (try self.peekNextTokenType()) {
                .object_begin, .array_begin => {
                    try self.skipUntilStackHeight(self.stackHeight());
                },
                .number, .string => {
                    // Drain partial pieces until the terminating token arrives.
                    while (true) {
                        switch (try self.next()) {
                            .partial_number,
                            .partial_string,
                            .partial_string_escaped_1,
                            .partial_string_escaped_2,
                            .partial_string_escaped_3,
                            .partial_string_escaped_4,
                            => continue,

                            .number, .string => break,

                            else => unreachable,
                        }
                    }
                },
                .true, .false, .null => {
                    _ = try self.next();
                },

                .object_end, .array_end, .end_of_document => unreachable, // Attempt to skip a non-value token.
            }
        }

        /// Calls `Scanner.skipUntilStackHeight()`, refilling the buffer and
        /// retrying each time the scanner reports `error.BufferUnderrun`.
        pub fn skipUntilStackHeight(self: *@This(), terminal_stack_height: usize) NextError!void {
            while (true) {
                return self.scanner.skipUntilStackHeight(terminal_stack_height) catch |err| switch (err) {
                    error.BufferUnderrun => {
                        try self.refillBuffer();
                        continue;
                    },
                    else => |other_err| return other_err,
                };
            }
        }

        /// Calls `Scanner.stackHeight()`.
        pub fn stackHeight(self: *const @This()) usize {
            return self.scanner.stackHeight();
        }

        /// Calls `Scanner.ensureTotalStackCapacity()`.
        pub fn ensureTotalStackCapacity(self: *@This(), height: usize) Allocator.Error!void {
            try self.scanner.ensureTotalStackCapacity(height);
        }

        /// Calls `Scanner.next()`, refilling the buffer and retrying each time
        /// the scanner reports `error.BufferUnderrun`.
        pub fn next(self: *@This()) NextError!Token {
            while (true) {
                return self.scanner.next() catch |err| switch (err) {
                    error.BufferUnderrun => {
                        try self.refillBuffer();
                        continue;
                    },
                    else => |other_err| return other_err,
                };
            }
        }

        /// Calls `Scanner.peekNextTokenType()`, refilling the buffer and
        /// retrying each time the scanner reports `error.BufferUnderrun`.
        pub fn peekNextTokenType(self: *@This()) PeekError!TokenType {
            while (true) {
                return self.scanner.peekNextTokenType() catch |err| switch (err) {
                    error.BufferUnderrun => {
                        try self.refillBuffer();
                        continue;
                    },
                    else => |other_err| return other_err,
                };
            }
        }

        /// Reads the next chunk from the underlying reader into `buffer` and
        /// feeds it to the scanner. A zero-length read marks the end of input.
        fn refillBuffer(self: *@This()) ReaderType.Error!void {
            const input = self.buffer[0..try self.reader.read(self.buffer[0..])];
            if (input.len > 0) {
                self.scanner.feedInput(input);
            } else {
                self.scanner.endInput();
            }
        }
    };
}
// Scanner: The lowest level parsing API in this package; supports streaming input with a low memory footprint.
// The memory requirement is … [description truncated in this copy]
pub const Scanner = struct {
state: State = .value,
string_is_object_key: bool = false,
stack: BitStack,
value_start: usize = undefined,
utf16_code_units: [2]u16 = undefined,
input: []const u8 = "",
cursor: usize = 0,
is_end_of_input: bool = false,
diagnostics: ?*Diagnostics = null,
// initStreaming(): The allocator is only used to track … [description truncated in this copy]
/// Creates a scanner with no input yet. Supply data with `feedInput()` and
/// signal the end with `endInput()`.
/// `allocator` backs the nesting `BitStack`.
pub fn initStreaming(allocator: Allocator) @This() {
    const nesting = BitStack.init(allocator);
    return .{ .stack = nesting };
}
// initCompleteInput(): Use this if your input is a single slice.
// This is effectively equivalent to:
// ```
// initStreaming(allocator);
// feedInput(complete_input);
// endInput();
// ```
/// Creates a scanner over a single, complete input slice.
/// The scanner starts with `complete_input` as its input and already marked
/// as end-of-input. `allocator` backs the nesting `BitStack`.
pub fn initCompleteInput(allocator: Allocator, complete_input: []const u8) @This() {
    var scanner: @This() = .{ .stack = BitStack.init(allocator) };
    scanner.input = complete_input;
    scanner.is_end_of_input = true;
    return scanner;
}
// deinit()
/// Frees the nesting stack and invalidates the scanner.
pub fn deinit(self: *@This()) void {
self.stack.deinit();
// Poison the struct so any later use is caught in safe builds.
self.* = undefined;
}
// enableDiagnostics()
/// Attaches `diagnostics` to this scanner.
/// The diagnostics object keeps a pointer to this scanner's `cursor` field,
/// so the scanner must not be moved while the diagnostics are in use.
pub fn enableDiagnostics(self: *@This(), diagnostics: *Diagnostics) void {
    self.diagnostics = diagnostics;
    diagnostics.cursor_pointer = &self.cursor;
}
// feedInput(): Call this whenever you get … [description truncated in this copy]
/// Replaces the current input slice with `input` and rewinds the cursor.
/// The previous slice must have been consumed completely (asserted).
/// When diagnostics are enabled, their byte and column bookkeeping is
/// carried over to the new slice.
pub fn feedInput(self: *@This(), input: []const u8) void {
    assert(self.cursor == self.input.len); // Not done with the last input slice.

    if (self.diagnostics) |diag| {
        // This usually goes "negative" to measure how far before the beginning
        // of the new buffer the current line started.
        diag.line_start_cursor -%= self.cursor;
        diag.total_bytes_before_current_input += self.input.len;
    }

    self.value_start = 0;
    self.cursor = 0;
    self.input = input;
}
// endInput(): Call this when you will no longer call … [description truncated in this copy]
/// Marks the input as complete. After this, running out of bytes in the
/// middle of a string yields `error.UnexpectedEndOfInput` instead of
/// `error.BufferUnderrun`.
pub fn endInput(self: *@This()) void {
self.is_end_of_input = true;
}
// Error sets of the scanner's public functions.
// Returned by next() and skipUntilStackHeight().
pub const NextError = Error || Allocator.Error || error{BufferUnderrun};
// Returned by nextAlloc() and nextAllocMax().
pub const AllocError = Error || Allocator.Error || error{ValueTooLong};
// Returned by peekNextTokenType().
pub const PeekError = Error || error{BufferUnderrun};
// Returned by skipValue().
pub const SkipError = Error || Allocator.Error;
// Returned by allocNextIntoArrayList() and allocNextIntoArrayListMax().
pub const AllocIntoArrayListError = AllocError || error{BufferUnderrun};
// nextAlloc(): Equivalent to … [description truncated in this copy]
/// Equivalent to `nextAllocMax(allocator, when, default_max_value_len)`.
pub fn nextAlloc(self: *@This(), allocator: Allocator, when: AllocWhen) AllocError!Token {
return self.nextAllocMax(allocator, when, default_max_value_len);
}
// nextAllocMax(): This function is only available after … [description truncated in this copy]
/// Returns the next token with strings and numbers delivered whole.
///
/// Only usable once the scanner has all of its input (asserted via
/// `is_end_of_input`), which is why `error.BufferUnderrun` is treated as
/// impossible throughout.
///
/// For a number or string the value is gathered with
/// `allocNextIntoArrayListMax()`. If that returns a slice, the token is
/// `.number` / `.string`; otherwise the accumulated bytes are handed over with
/// `toOwnedSlice()` as `.allocated_number` / `.allocated_string`.
/// Every other token kind is forwarded from `next()`.
pub fn nextAllocMax(self: *@This(), allocator: Allocator, when: AllocWhen, max_value_len: usize) AllocError!Token {
    assert(self.is_end_of_input); // This function is not available in streaming mode.

    const upcoming = self.peekNextTokenType() catch |err| switch (err) {
        error.BufferUnderrun => unreachable,
        else => |other| return other,
    };

    const is_number = switch (upcoming) {
        .number => true,
        .string => false,
        // Simple tokens never alloc.
        else => return self.next() catch |err| switch (err) {
            error.BufferUnderrun => unreachable,
            else => |other| return other,
        },
    };

    var buffer = ArrayList(u8).init(allocator);
    errdefer buffer.deinit();

    const borrowed = self.allocNextIntoArrayListMax(&buffer, when, max_value_len) catch |err| switch (err) {
        error.BufferUnderrun => unreachable,
        else => |other| return other,
    };

    if (borrowed) |slice| {
        if (is_number) return Token{ .number = slice };
        return Token{ .string = slice };
    }

    if (is_number) return Token{ .allocated_number = try buffer.toOwnedSlice() };
    return Token{ .allocated_string = try buffer.toOwnedSlice() };
}
// allocNextIntoArrayList(): Equivalent to … [description truncated in this copy]
/// Equivalent to `allocNextIntoArrayListMax(value_list, when, default_max_value_len)`.
pub fn allocNextIntoArrayList(self: *@This(), value_list: *ArrayList(u8), when: AllocWhen) AllocIntoArrayListError!?[]const u8 {
return self.allocNextIntoArrayListMax(value_list, when, default_max_value_len);
}
// allocNextIntoArrayListMax(): The next token type must be either … [description truncated in this copy]
/// Reads one complete number or string, collecting its pieces in `value_list`.
///
/// The next token must be a number or a string; any other token kind is a
/// programming error (`unreachable`) — check `peekNextTokenType()` first.
///
/// Returns the value's slice directly, leaving `value_list` untouched, when
/// `when == .alloc_if_needed` and the value arrived as a single complete
/// token. Otherwise every piece is appended to `value_list` (subject to
/// `max_value_len`, enforced by `appendSlice`) and `null` is returned.
/// Errors from `next()`, including `error.BufferUnderrun`, are propagated;
/// pieces appended so far remain in `value_list`.
pub fn allocNextIntoArrayListMax(self: *@This(), value_list: *ArrayList(u8), when: AllocWhen, max_value_len: usize) AllocIntoArrayListError!?[]const u8 {
    while (true) {
        switch (try self.next()) {
            // Pieces of an unfinished value: keep accumulating.
            .partial_number, .partial_string => |piece| try appendSlice(value_list, piece, max_value_len),
            inline .partial_string_escaped_1,
            .partial_string_escaped_2,
            .partial_string_escaped_3,
            .partial_string_escaped_4,
            => |bytes| try appendSlice(value_list, bytes[0..], max_value_len),

            // The terminating piece of the value.
            .number, .string => |tail| {
                const nothing_buffered = value_list.items.len == 0;
                if (when == .alloc_if_needed and nothing_buffered) {
                    // No alloc necessary.
                    return tail;
                }
                try appendSlice(value_list, tail, max_value_len);
                // The token is complete.
                return null;
            },

            // Only .number and .string token types are allowed here. Check peekNextTokenType() before calling this.
            .object_begin,
            .object_end,
            .array_begin,
            .array_end,
            .true,
            .false,
            .null,
            .end_of_document,
            => unreachable,

            // next() never produces the allocated variants.
            .allocated_number, .allocated_string => unreachable,
        }
    }
}
// skipValue(): This function is only available after … [description truncated in this copy]
/// Consumes one complete value without returning it.
///
/// Only usable once the scanner has all of its input (asserted via
/// `is_end_of_input`), so `error.BufferUnderrun` is treated as impossible.
/// Objects and arrays are skipped through their matching end token.
/// Calling this when the next token is `.object_end`, `.array_end` or
/// `.end_of_document` is a programming error (`unreachable`).
pub fn skipValue(self: *@This()) SkipError!void {
    assert(self.is_end_of_input); // This function is not available in streaming mode.

    const upcoming = self.peekNextTokenType() catch |err| switch (err) {
        error.BufferUnderrun => unreachable,
        else => |other| return other,
    };

    switch (upcoming) {
        .object_end, .array_end, .end_of_document => unreachable, // Attempt to skip a non-value token.

        .true, .false, .null => {
            _ = self.next() catch |err| switch (err) {
                error.BufferUnderrun => unreachable,
                else => |other| return other,
            };
        },

        .number, .string => {
            // Drain partial pieces until the terminating token arrives.
            while (true) {
                const token = self.next() catch |err| switch (err) {
                    error.BufferUnderrun => unreachable,
                    else => |other| return other,
                };
                switch (token) {
                    .number, .string => return,
                    .partial_number,
                    .partial_string,
                    .partial_string_escaped_1,
                    .partial_string_escaped_2,
                    .partial_string_escaped_3,
                    .partial_string_escaped_4,
                    => {},
                    else => unreachable,
                }
            }
        },

        .object_begin, .array_begin => {
            // The container has not been opened yet, so the current height is
            // exactly the height we are back at after its closing token.
            const height_before = self.stackHeight();
            self.skipUntilStackHeight(height_before) catch |err| switch (err) {
                error.BufferUnderrun => unreachable,
                else => |other| return other,
            };
        },
    }
}
// skipUntilStackHeight(): Skip tokens until an … [description truncated in this copy]
/// Discards tokens until an `.object_end` or `.array_end` leaves the nesting
/// depth at `terminal_stack_height`.
/// Reaching `.end_of_document` first is a programming error (`unreachable`).
/// Errors from `next()` are propagated.
pub fn skipUntilStackHeight(self: *@This(), terminal_stack_height: usize) NextError!void {
    while (true) {
        const token = try self.next();
        switch (token) {
            .end_of_document => unreachable,
            .object_end, .array_end => {
                if (self.stackHeight() == terminal_stack_height) return;
            },
            else => {},
        }
    }
}
// stackHeight(): The depth of … [description truncated in this copy]
/// Current nesting depth: the number of entries on the object/array stack.
/// `next()` pushes one entry for each '{' or '[' and pops one for each '}' or ']'.
pub fn stackHeight(self: *const @This()) usize {
return self.stack.bit_len;
}
// ensureTotalStackCapacity(): Pre-allocate memory to hold the given number of nesting levels.
/// Reserves room on the nesting stack for `height` levels up front.
/// Returns the allocator's error if the reservation fails.
pub fn ensureTotalStackCapacity(self: *@This(), height: usize) Allocator.Error!void {
    return self.stack.ensureTotalCapacity(height);
}
// next(): See … [description truncated in this copy]
/// Returns the next token, advancing `cursor` through `input`.
///
/// This is a byte-at-a-time state machine: `state` records where scanning
/// stopped, so a call can resume in the middle of a value after more input
/// has been fed. Strings and numbers may therefore arrive in pieces
/// (`partial_*` tokens) followed by a final `.string` / `.number`.
///
/// Errors:
/// * `error.SyntaxError` for malformed JSON, invalid UTF-8 inside a string,
///   or an unpaired surrogate half in a \u escape.
/// * `error.UnexpectedEndOfInput` when the input ends inside a string
///   (other states report end-of-input through helpers not shown here).
/// * `error.BufferUnderrun` when the current slice is exhausted but more
///   input may follow.
/// * Allocator errors from pushing onto the nesting stack.
pub fn next(self: *@This()) NextError!Token {
state_loop: while (true) {
switch (self.state) {
// Expecting the first byte of a value.
.value => {
switch (try self.skipWhitespaceExpectByte()) {
// Object, Array
'{' => {
try self.stack.push(OBJECT_MODE);
self.cursor += 1;
self.state = .object_start;
return .object_begin;
},
'[' => {
try self.stack.push(ARRAY_MODE);
self.cursor += 1;
self.state = .array_start;
return .array_begin;
},
// String
'"' => {
self.cursor += 1;
// The string's content starts after the opening quote.
self.value_start = self.cursor;
self.state = .string;
continue :state_loop;
},
// Number
'1'...'9' => {
self.value_start = self.cursor;
self.cursor += 1;
self.state = .number_int;
continue :state_loop;
},
'0' => {
self.value_start = self.cursor;
self.cursor += 1;
self.state = .number_leading_zero;
continue :state_loop;
},
'-' => {
self.value_start = self.cursor;
self.cursor += 1;
self.state = .number_minus;
continue :state_loop;
},
// literal values
't' => {
self.cursor += 1;
self.state = .literal_t;
continue :state_loop;
},
'f' => {
self.cursor += 1;
self.state = .literal_f;
continue :state_loop;
},
'n' => {
self.cursor += 1;
self.state = .literal_n;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
// A value (or object key) has just been completed.
.post_value => {
if (try self.skipWhitespaceCheckEnd()) return .end_of_document;
const c = self.input[self.cursor];
// An object key must be followed by ':' and then a value.
if (self.string_is_object_key) {
self.string_is_object_key = false;
switch (c) {
':' => {
self.cursor += 1;
self.state = .value;
continue :state_loop;
},
else => return error.SyntaxError,
}
}
switch (c) {
'}' => {
// The closing bracket must match the innermost open container.
if (self.stack.pop() != OBJECT_MODE) return error.SyntaxError;
self.cursor += 1;
// stay in .post_value state.
return .object_end;
},
']' => {
if (self.stack.pop() != ARRAY_MODE) return error.SyntaxError;
self.cursor += 1;
// stay in .post_value state.
return .array_end;
},
',' => {
// What may follow a comma depends on the enclosing container.
switch (self.stack.peek()) {
OBJECT_MODE => {
self.state = .object_post_comma;
},
ARRAY_MODE => {
self.state = .value;
},
}
self.cursor += 1;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
// Just after '{': either a key or an immediate '}'.
.object_start => {
switch (try self.skipWhitespaceExpectByte()) {
'"' => {
self.cursor += 1;
self.value_start = self.cursor;
self.state = .string;
self.string_is_object_key = true;
continue :state_loop;
},
'}' => {
self.cursor += 1;
_ = self.stack.pop();
self.state = .post_value;
return .object_end;
},
else => return error.SyntaxError,
}
},
// After ',' inside an object: only a key is allowed.
.object_post_comma => {
switch (try self.skipWhitespaceExpectByte()) {
'"' => {
self.cursor += 1;
self.value_start = self.cursor;
self.state = .string;
self.string_is_object_key = true;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
// Just after '[': either an immediate ']' or the first element.
.array_start => {
switch (try self.skipWhitespaceExpectByte()) {
']' => {
self.cursor += 1;
_ = self.stack.pop();
self.state = .post_value;
return .array_end;
},
else => {
// Do not consume the byte; let the .value state handle it.
self.state = .value;
continue :state_loop;
},
}
},
// Number states. The boolean passed to endOfBufferInNumber() is `true`
// in the states where the digits seen so far already form a valid
// number and `false` where at least one more character is required.
.number_minus => {
if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
switch (self.input[self.cursor]) {
'0' => {
self.cursor += 1;
self.state = .number_leading_zero;
continue :state_loop;
},
'1'...'9' => {
self.cursor += 1;
self.state = .number_int;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
.number_leading_zero => {
if (self.cursor >= self.input.len) return self.endOfBufferInNumber(true);
switch (self.input[self.cursor]) {
'.' => {
self.cursor += 1;
self.state = .number_post_dot;
continue :state_loop;
},
'e', 'E' => {
self.cursor += 1;
self.state = .number_post_e;
continue :state_loop;
},
else => {
// Any other byte ends the number; it is left for .post_value to examine.
self.state = .post_value;
return Token{ .number = self.takeValueSlice() };
},
}
},
.number_int => {
while (self.cursor < self.input.len) : (self.cursor += 1) {
switch (self.input[self.cursor]) {
'0'...'9' => continue,
'.' => {
self.cursor += 1;
self.state = .number_post_dot;
continue :state_loop;
},
'e', 'E' => {
self.cursor += 1;
self.state = .number_post_e;
continue :state_loop;
},
else => {
self.state = .post_value;
return Token{ .number = self.takeValueSlice() };
},
}
}
return self.endOfBufferInNumber(true);
},
.number_post_dot => {
if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
switch (try self.expectByte()) {
'0'...'9' => {
self.cursor += 1;
self.state = .number_frac;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
.number_frac => {
while (self.cursor < self.input.len) : (self.cursor += 1) {
switch (self.input[self.cursor]) {
'0'...'9' => continue,
'e', 'E' => {
self.cursor += 1;
self.state = .number_post_e;
continue :state_loop;
},
else => {
self.state = .post_value;
return Token{ .number = self.takeValueSlice() };
},
}
}
return self.endOfBufferInNumber(true);
},
.number_post_e => {
if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
switch (self.input[self.cursor]) {
'0'...'9' => {
self.cursor += 1;
self.state = .number_exp;
continue :state_loop;
},
'+', '-' => {
self.cursor += 1;
self.state = .number_post_e_sign;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
.number_post_e_sign => {
if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
switch (self.input[self.cursor]) {
'0'...'9' => {
self.cursor += 1;
self.state = .number_exp;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
.number_exp => {
while (self.cursor < self.input.len) : (self.cursor += 1) {
switch (self.input[self.cursor]) {
'0'...'9' => continue,
else => {
self.state = .post_value;
return Token{ .number = self.takeValueSlice() };
},
}
}
return self.endOfBufferInNumber(true);
},
// Inside a string, between the quotes.
.string => {
while (self.cursor < self.input.len) : (self.cursor += 1) {
switch (self.input[self.cursor]) {
0...0x1f => return error.SyntaxError, // Bare ASCII control code in string.
// ASCII plain text.
0x20...('"' - 1), ('"' + 1)...('\\' - 1), ('\\' + 1)...0x7F => continue,
// Special characters.
'"' => {
// Take the slice before stepping over the closing quote so the quote is excluded.
const result = Token{ .string = self.takeValueSlice() };
self.cursor += 1;
self.state = .post_value;
return result;
},
'\\' => {
// Emit the plain text collected so far (if any) before decoding the escape.
const slice = self.takeValueSlice();
self.cursor += 1;
self.state = .string_backslash;
if (slice.len > 0) return Token{ .partial_string = slice };
continue :state_loop;
},
// UTF-8 validation.
// See http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
0xC2...0xDF => {
self.cursor += 1;
self.state = .string_utf8_last_byte;
continue :state_loop;
},
0xE0 => {
self.cursor += 1;
self.state = .string_utf8_second_to_last_byte_guard_against_overlong;
continue :state_loop;
},
0xE1...0xEC, 0xEE...0xEF => {
self.cursor += 1;
self.state = .string_utf8_second_to_last_byte;
continue :state_loop;
},
0xED => {
self.cursor += 1;
self.state = .string_utf8_second_to_last_byte_guard_against_surrogate_half;
continue :state_loop;
},
0xF0 => {
self.cursor += 1;
self.state = .string_utf8_third_to_last_byte_guard_against_overlong;
continue :state_loop;
},
0xF1...0xF3 => {
self.cursor += 1;
self.state = .string_utf8_third_to_last_byte;
continue :state_loop;
},
0xF4 => {
self.cursor += 1;
self.state = .string_utf8_third_to_last_byte_guard_against_too_large;
continue :state_loop;
},
0x80...0xC1, 0xF5...0xFF => return error.SyntaxError, // Invalid UTF-8.
}
}
// Ran out of bytes before the closing quote.
if (self.is_end_of_input) return error.UnexpectedEndOfInput;
// Hand over what we have so the caller can replace the buffer.
const slice = self.takeValueSlice();
if (slice.len > 0) return Token{ .partial_string = slice };
return error.BufferUnderrun;
},
// Just after a backslash inside a string.
.string_backslash => {
switch (try self.expectByte()) {
'"', '\\', '/' => {
// Since these characters now represent themselves literally,
// we can simply begin the next plaintext slice here.
self.value_start = self.cursor;
self.cursor += 1;
self.state = .string;
continue :state_loop;
},
'b' => {
self.cursor += 1;
self.value_start = self.cursor;
self.state = .string;
return Token{ .partial_string_escaped_1 = [_]u8{0x08} };
},
'f' => {
self.cursor += 1;
self.value_start = self.cursor;
self.state = .string;
return Token{ .partial_string_escaped_1 = [_]u8{0x0c} };
},
'n' => {
self.cursor += 1;
self.value_start = self.cursor;
self.state = .string;
return Token{ .partial_string_escaped_1 = [_]u8{'\n'} };
},
'r' => {
self.cursor += 1;
self.value_start = self.cursor;
self.state = .string;
return Token{ .partial_string_escaped_1 = [_]u8{'\r'} };
},
't' => {
self.cursor += 1;
self.value_start = self.cursor;
self.state = .string;
return Token{ .partial_string_escaped_1 = [_]u8{'\t'} };
},
'u' => {
self.cursor += 1;
self.state = .string_backslash_u;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
// The four hex digits of a \uXXXX escape, most significant first.
// Each state ORs one nibble into utf16_code_units[0].
.string_backslash_u => {
const c = try self.expectByte();
switch (c) {
'0'...'9' => {
self.utf16_code_units[0] = @as(u16, c - '0') << 12;
},
'A'...'F' => {
self.utf16_code_units[0] = @as(u16, c - 'A' + 10) << 12;
},
'a'...'f' => {
self.utf16_code_units[0] = @as(u16, c - 'a' + 10) << 12;
},
else => return error.SyntaxError,
}
self.cursor += 1;
self.state = .string_backslash_u_1;
continue :state_loop;
},
.string_backslash_u_1 => {
const c = try self.expectByte();
switch (c) {
'0'...'9' => {
self.utf16_code_units[0] |= @as(u16, c - '0') << 8;
},
'A'...'F' => {
self.utf16_code_units[0] |= @as(u16, c - 'A' + 10) << 8;
},
'a'...'f' => {
self.utf16_code_units[0] |= @as(u16, c - 'a' + 10) << 8;
},
else => return error.SyntaxError,
}
self.cursor += 1;
self.state = .string_backslash_u_2;
continue :state_loop;
},
.string_backslash_u_2 => {
const c = try self.expectByte();
switch (c) {
'0'...'9' => {
self.utf16_code_units[0] |= @as(u16, c - '0') << 4;
},
'A'...'F' => {
self.utf16_code_units[0] |= @as(u16, c - 'A' + 10) << 4;
},
'a'...'f' => {
self.utf16_code_units[0] |= @as(u16, c - 'a' + 10) << 4;
},
else => return error.SyntaxError,
}
self.cursor += 1;
self.state = .string_backslash_u_3;
continue :state_loop;
},
.string_backslash_u_3 => {
const c = try self.expectByte();
switch (c) {
'0'...'9' => {
self.utf16_code_units[0] |= c - '0';
},
'A'...'F' => {
self.utf16_code_units[0] |= c - 'A' + 10;
},
'a'...'f' => {
self.utf16_code_units[0] |= c - 'a' + 10;
},
else => return error.SyntaxError,
}
self.cursor += 1;
if (std.unicode.utf16IsHighSurrogate(self.utf16_code_units[0])) {
// A high surrogate must be completed by a following \u low surrogate.
self.state = .string_surrogate_half;
continue :state_loop;
} else if (std.unicode.utf16IsLowSurrogate(self.utf16_code_units[0])) {
return error.SyntaxError; // Unexpected low surrogate half.
} else {
self.value_start = self.cursor;
self.state = .string;
return partialStringCodepoint(self.utf16_code_units[0]);
}
},
// After a high surrogate: require "\u" followed by a low surrogate,
// i.e. hex digits D, then C-F, then any two digits.
.string_surrogate_half => {
switch (try self.expectByte()) {
'\\' => {
self.cursor += 1;
self.state = .string_surrogate_half_backslash;
continue :state_loop;
},
else => return error.SyntaxError, // Expected low surrogate half.
}
},
.string_surrogate_half_backslash => {
switch (try self.expectByte()) {
'u' => {
self.cursor += 1;
self.state = .string_surrogate_half_backslash_u;
continue :state_loop;
},
else => return error.SyntaxError, // Expected low surrogate half.
}
},
.string_surrogate_half_backslash_u => {
switch (try self.expectByte()) {
'D', 'd' => {
self.cursor += 1;
self.utf16_code_units[1] = 0xD << 12;
self.state = .string_surrogate_half_backslash_u_1;
continue :state_loop;
},
else => return error.SyntaxError, // Expected low surrogate half.
}
},
.string_surrogate_half_backslash_u_1 => {
const c = try self.expectByte();
switch (c) {
'C'...'F' => {
self.cursor += 1;
self.utf16_code_units[1] |= @as(u16, c - 'A' + 10) << 8;
self.state = .string_surrogate_half_backslash_u_2;
continue :state_loop;
},
'c'...'f' => {
self.cursor += 1;
self.utf16_code_units[1] |= @as(u16, c - 'a' + 10) << 8;
self.state = .string_surrogate_half_backslash_u_2;
continue :state_loop;
},
else => return error.SyntaxError, // Expected low surrogate half.
}
},
.string_surrogate_half_backslash_u_2 => {
const c = try self.expectByte();
switch (c) {
'0'...'9' => {
self.cursor += 1;
self.utf16_code_units[1] |= @as(u16, c - '0') << 4;
self.state = .string_surrogate_half_backslash_u_3;
continue :state_loop;
},
'A'...'F' => {
self.cursor += 1;
self.utf16_code_units[1] |= @as(u16, c - 'A' + 10) << 4;
self.state = .string_surrogate_half_backslash_u_3;
continue :state_loop;
},
'a'...'f' => {
self.cursor += 1;
self.utf16_code_units[1] |= @as(u16, c - 'a' + 10) << 4;
self.state = .string_surrogate_half_backslash_u_3;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
.string_surrogate_half_backslash_u_3 => {
const c = try self.expectByte();
switch (c) {
'0'...'9' => {
self.utf16_code_units[1] |= c - '0';
},
'A'...'F' => {
self.utf16_code_units[1] |= c - 'A' + 10;
},
'a'...'f' => {
self.utf16_code_units[1] |= c - 'a' + 10;
},
else => return error.SyntaxError,
}
self.cursor += 1;
self.value_start = self.cursor;
self.state = .string;
// The preceding states only admit a high surrogate followed by a low
// surrogate, so decoding the pair is not expected to fail.
const code_point = std.unicode.utf16DecodeSurrogatePair(&self.utf16_code_units) catch unreachable;
return partialStringCodepoint(code_point);
},
// UTF-8 continuation-byte states. The state name says how many bytes of
// the sequence remain; the guarded variants narrow the accepted range
// of the next byte as their names indicate.
.string_utf8_last_byte => {
switch (try self.expectByte()) {
0x80...0xBF => {
self.cursor += 1;
self.state = .string;
continue :state_loop;
},
else => return error.SyntaxError, // Invalid UTF-8.
}
},
.string_utf8_second_to_last_byte => {
switch (try self.expectByte()) {
0x80...0xBF => {
self.cursor += 1;
self.state = .string_utf8_last_byte;
continue :state_loop;
},
else => return error.SyntaxError, // Invalid UTF-8.
}
},
.string_utf8_second_to_last_byte_guard_against_overlong => {
switch (try self.expectByte()) {
0xA0...0xBF => {
self.cursor += 1;
self.state = .string_utf8_last_byte;
continue :state_loop;
},
else => return error.SyntaxError, // Invalid UTF-8.
}
},
.string_utf8_second_to_last_byte_guard_against_surrogate_half => {
switch (try self.expectByte()) {
0x80...0x9F => {
self.cursor += 1;
self.state = .string_utf8_last_byte;
continue :state_loop;
},
else => return error.SyntaxError, // Invalid UTF-8.
}
},
.string_utf8_third_to_last_byte => {
switch (try self.expectByte()) {
0x80...0xBF => {
self.cursor += 1;
self.state = .string_utf8_second_to_last_byte;
continue :state_loop;
},
else => return error.SyntaxError, // Invalid UTF-8.
}
},
.string_utf8_third_to_last_byte_guard_against_overlong => {
switch (try self.expectByte()) {
0x90...0xBF => {
self.cursor += 1;
self.state = .string_utf8_second_to_last_byte;
continue :state_loop;
},
else => return error.SyntaxError, // Invalid UTF-8.
}
},
.string_utf8_third_to_last_byte_guard_against_too_large => {
switch (try self.expectByte()) {
0x80...0x8F => {
self.cursor += 1;
self.state = .string_utf8_second_to_last_byte;
continue :state_loop;
},
else => return error.SyntaxError, // Invalid UTF-8.
}
},
// Literal states: one state per remaining character of `true`,
// `false` and `null`.
.literal_t => {
switch (try self.expectByte()) {
'r' => {
self.cursor += 1;
self.state = .literal_tr;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
.literal_tr => {
switch (try self.expectByte()) {
'u' => {
self.cursor += 1;
self.state = .literal_tru;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
.literal_tru => {
switch (try self.expectByte()) {
'e' => {
self.cursor += 1;
self.state = .post_value;
return .true;
},
else => return error.SyntaxError,
}
},
.literal_f => {
switch (try self.expectByte()) {
'a' => {
self.cursor += 1;
self.state = .literal_fa;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
.literal_fa => {
switch (try self.expectByte()) {
'l' => {
self.cursor += 1;
self.state = .literal_fal;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
.literal_fal => {
switch (try self.expectByte()) {
's' => {
self.cursor += 1;
self.state = .literal_fals;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
.literal_fals => {
switch (try self.expectByte()) {
'e' => {
self.cursor += 1;
self.state = .post_value;
return .false;
},
else => return error.SyntaxError,
}
},
.literal_n => {
switch (try self.expectByte()) {
'u' => {
self.cursor += 1;
self.state = .literal_nu;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
.literal_nu => {
switch (try self.expectByte()) {
'l' => {
self.cursor += 1;
self.state = .literal_nul;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
.literal_nul => {
switch (try self.expectByte()) {
'l' => {
self.cursor += 1;
self.state = .post_value;
return .null;
},
else => return error.SyntaxError,
}
},
}
// Every state above either returns or continues the loop.
unreachable;
}
}
// peekNextTokenType(): Seeks ahead in the input until the first byte of the next token (or the end of the input)
// determines which type of token will be returned from the next … [description truncated in this copy]
/// Reports which kind of token the next call to `next()` will produce,
/// without consuming that token.
///
/// Whitespace is skipped, and the ':' after an object key and the ','
/// between values are consumed along the way, because those bytes do not
/// produce tokens of their own. The first byte of the upcoming token is left
/// in place. If scanning is currently in the middle of a number, string or
/// literal, the kind of that value is returned.
///
/// Errors: `error.SyntaxError` when the next byte cannot start a valid token,
/// plus whatever the whitespace-skipping helpers report (they are not shown
/// here; `PeekError` allows `error.UnexpectedEndOfInput` and
/// `error.BufferUnderrun`).
pub fn peekNextTokenType(self: *@This()) PeekError!TokenType {
state_loop: while (true) {
switch (self.state) {
// Expecting a value: classify by its first byte.
.value => {
switch (try self.skipWhitespaceExpectByte()) {
'{' => return .object_begin,
'[' => return .array_begin,
'"' => return .string,
'-', '0'...'9' => return .number,
't' => return .true,
'f' => return .false,
'n' => return .null,
else => return error.SyntaxError,
}
},
// A value (or object key) has just been completed.
.post_value => {
if (try self.skipWhitespaceCheckEnd()) return .end_of_document;
const c = self.input[self.cursor];
// Same handling as in next(): a key must be followed by ':'.
if (self.string_is_object_key) {
self.string_is_object_key = false;
switch (c) {
':' => {
self.cursor += 1;
self.state = .value;
continue :state_loop;
},
else => return error.SyntaxError,
}
}
switch (c) {
// Unlike next(), the stack is not popped and the mode is not checked here;
// next() performs that check when the token is actually consumed.
'}' => return .object_end,
']' => return .array_end,
',' => {
switch (self.stack.peek()) {
OBJECT_MODE => {
self.state = .object_post_comma;
},
ARRAY_MODE => {
self.state = .value;
},
}
self.cursor += 1;
continue :state_loop;
},
else => return error.SyntaxError,
}
},
.object_start => {
switch (try self.skipWhitespaceExpectByte()) {
'"' => return .string,
'}' => return .object_end,
else => return error.SyntaxError,
}
},
.object_post_comma => {
switch (try self.skipWhitespaceExpectByte()) {
'"' => return .string,
else => return error.SyntaxError,
}
},
.array_start => {
switch (try self.skipWhitespaceExpectByte()) {
']' => return .array_end,
else => {
self.state = .value;
continue :state_loop;
},
}
},
// Already inside a value: report the kind of the value in progress.
.number_minus,
.number_leading_zero,
.number_int,
.number_post_dot,
.number_frac,
.number_post_e,
.number_post_e_sign,
.number_exp,
=> return .number,
.string,
.string_backslash,
.string_backslash_u,
.string_backslash_u_1,
.string_backslash_u_2,
.string_backslash_u_3,
.string_surrogate_half,
.string_surrogate_half_backslash,
.string_surrogate_half_backslash_u,
.string_surrogate_half_backslash_u_1,
.string_surrogate_half_backslash_u_2,
.string_surrogate_half_backslash_u_3,
=> return .string,
.string_utf8_last_byte,
.string_utf8_second_to_last_byte,
.string_utf8_second_to_last_byte_guard_against_overlong,
.string_utf8_second_to_last_byte_guard_against_surrogate_half,
.string_utf8_third_to_last_byte,
.string_utf8_third_to_last_byte_guard_against_overlong,
.string_utf8_third_to_last_byte_guard_against_too_large,
=> return .string,
.literal_t,
.literal_tr,
.literal_tru,
=> return .true,
.literal_f,
.literal_fa,
.literal_fal,
.literal_fals,
=> return .false,
.literal_n,
.literal_nu,
.literal_nul,
=> return .null,
}
// Every state above either returns or continues the loop.
unreachable;
}
}
/// Position of the scanner within the JSON grammar.
/// `peekNextTokenType()` switches on this to decide which token type comes next.
const State = enum {
// States in which `peekNextTokenType()` skips whitespace and inspects the next byte.
value,
post_value,
object_start,
object_post_comma,
array_start,
// Partway through a number; `peekNextTokenType()` reports `.number` for all of these.
number_minus,
number_leading_zero,
number_int,
number_post_dot,
number_frac,
number_post_e,
number_post_e_sign,
number_exp,
// Partway through a string; `peekNextTokenType()` reports `.string` for all of these.
string,
string_backslash,
string_backslash_u,
string_backslash_u_1,
string_backslash_u_2,
string_backslash_u_3,
string_surrogate_half,
string_surrogate_half_backslash,
string_surrogate_half_backslash_u,
string_surrogate_half_backslash_u_1,
string_surrogate_half_backslash_u_2,
string_surrogate_half_backslash_u_3,
// UTF-8 validation states inside a string (also reported as `.string`).
// From http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
string_utf8_last_byte, // State A
string_utf8_second_to_last_byte, // State B
string_utf8_second_to_last_byte_guard_against_overlong, // State C
string_utf8_second_to_last_byte_guard_against_surrogate_half, // State D
string_utf8_third_to_last_byte, // State E
string_utf8_third_to_last_byte_guard_against_overlong, // State F
string_utf8_third_to_last_byte_guard_against_too_large, // State G
// Partway through a `true`, `false`, or `null` literal.
// The suffix spells the characters matched so far, e.g. `literal_tru` only accepts 'e' next.
literal_t,
literal_tr,
literal_tru,
literal_f,
literal_fa,
literal_fal,
literal_fals,
literal_n,
literal_nu,
literal_nul,
};
/// Returns the byte at the cursor without advancing past it.
///
/// When the cursor is at or beyond the end of the current input:
/// * `error.UnexpectedEndOfInput` if `is_end_of_input` is set,
/// * `error.BufferUnderrun` otherwise.
fn expectByte(self: *const @This()) !u8 {
    const has_byte = self.cursor < self.input.len;
    if (!has_byte) {
        // Nothing left in this buffer; the error depends on whether more input may still come.
        if (!self.is_end_of_input) return error.BufferUnderrun;
        return error.UnexpectedEndOfInput;
    }
    return self.input[self.cursor];
}
/// Advances the cursor past any run of ' ', '\t', '\r', and '\n',
/// stopping at the first other byte or at the end of the current input.
/// When diagnostics are attached, each '\n' bumps the line number
/// and records where that newline was seen.
fn skipWhitespace(self: *@This()) void {
    while (self.cursor < self.input.len) {
        const byte = self.input[self.cursor];
        if (byte == '\n') {
            if (self.diagnostics) |diag| {
                diag.line_number += 1;
                // Pointing at the newline itself (not the byte after it) means that
                // a plain subtraction from a later cursor yields a 1-based column number.
                diag.line_start_cursor = self.cursor;
            }
        } else if (byte != ' ' and byte != '\t' and byte != '\r') {
            // First non-whitespace byte: leave the cursor on it.
            return;
        }
        self.cursor += 1;
    }
}
/// Skips whitespace, then returns the byte now under the cursor without consuming it.
/// Fails with the same errors as `expectByte()` when no byte is available.
fn skipWhitespaceExpectByte(self: *@This()) !u8 {
    self.skipWhitespace();
    const next_byte = try self.expectByte();
    return next_byte;
}
/// Skips whitespace and reports whether the document has ended.
///
/// Returns `true` when the input is exhausted, `is_end_of_input` is set,
/// and `stackHeight()` is zero. Returns `false` when a byte is available
/// and `stackHeight()` is non-zero.
///
/// Errors:
/// * `error.SyntaxError` if a byte is available while `stackHeight()` is zero,
/// * `error.UnexpectedEndOfInput` if all input is consumed while `stackHeight()` is non-zero,
/// * `error.BufferUnderrun` if this buffer is consumed but `is_end_of_input` is not set.
fn skipWhitespaceCheckEnd(self: *@This()) !bool {
    self.skipWhitespace();

    if (self.cursor < self.input.len) {
        // There is a non-whitespace byte waiting.
        if (self.stackHeight() == 0) return error.SyntaxError;
        return false;
    }

    // The current buffer is used up.
    if (!self.is_end_of_input) return error.BufferUnderrun;

    // No more input will arrive.
    if (self.stackHeight() != 0) return error.UnexpectedEndOfInput;
    return true;
}
/// Returns the input bytes from `value_start` up to (not including) the cursor,
/// then moves `value_start` to the cursor so the same bytes are not returned twice.
fn takeValueSlice(self: *@This()) []const u8 {
    // The deferred update runs after the slice expression below has been evaluated.
    defer self.value_start = self.cursor;
    return self.input[self.value_start..self.cursor];
}
/// Emits the pending bytes of the number being scanned.
///
/// `allow_end` says whether the number may legally stop at this point.
///
/// * More input may follow: returns `.partial_number` with the pending bytes,
///   or `error.BufferUnderrun` when there are none.
/// * Input is finished: returns `.number` and moves to `.post_value`,
///   or `error.UnexpectedEndOfInput` when `allow_end` is false.
///
/// The pending slice is taken (and `value_start` advanced) in every case.
fn endOfBufferInNumber(self: *@This(), allow_end: bool) !Token {
    const pending = self.takeValueSlice();

    if (!self.is_end_of_input) {
        if (pending.len == 0) return error.BufferUnderrun;
        return Token{ .partial_number = pending };
    }

    if (!allow_end) return error.UnexpectedEndOfInput;
    self.state = .post_value;
    return Token{ .number = pending };
}
/// Encodes `code_point` as UTF-8 and wraps the bytes in the
/// `partial_string_escaped_N` token whose N equals the encoded length (1 to 4).
/// An encoding failure or any other length is treated as unreachable.
fn partialStringCodepoint(code_point: u21) Token {
    var encoded: [4]u8 = undefined;
    const encoded_len = std.unicode.utf8Encode(code_point, &encoded) catch unreachable;
    return switch (encoded_len) {
        1 => Token{ .partial_string_escaped_1 = encoded[0..1].* },
        2 => Token{ .partial_string_escaped_2 = encoded[0..2].* },
        3 => Token{ .partial_string_escaped_3 = encoded[0..3].* },
        4 => Token{ .partial_string_escaped_4 = encoded[0..4].* },
        else => unreachable,
    };
}
};
// Values matched against `self.stack.peek()` by the scanner to tell whether
// a comma separates object members (OBJECT_MODE) or array elements (ARRAY_MODE).
const OBJECT_MODE = 0;
const ARRAY_MODE = 1;
/// Appends `buf` to `list`, refusing to let the list grow beyond `max_value_len` bytes.
///
/// Returns `error.ValueTooLong` if the combined length would exceed `max_value_len`
/// (or overflow `usize`); the list is left untouched in that case.
/// Errors from the list's own `appendSlice` are passed through.
fn appendSlice(list: *std.ArrayList(u8), buf: []const u8, max_value_len: usize) !void {
    // Checked addition so an absurdly large `buf` cannot wrap around the limit check.
    const combined_len = std.math.add(usize, list.items.len, buf.len) catch {
        return error.ValueTooLong;
    };
    if (combined_len <= max_value_len) {
        return list.appendSlice(buf);
    }
    return error.ValueTooLong;
}
|
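isNumberFormattedLikeAnInteger() For the slice you get from a number token (e.g. `Token.number`), this function returns true if the number doesn't contain any fraction or exponent components (no '.', 'e', or 'E'), and is not `-0`.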
/// Returns true if `value` is written without a fraction or exponent part
/// (contains none of '.', 'e', 'E') and is not exactly `-0`.
/// Only the formatting is inspected; the text is not parsed or validated as a number.
pub fn isNumberFormattedLikeAnInteger(value: []const u8) bool {
    for (value) |byte| {
        switch (byte) {
            '.', 'e', 'E' => return false,
            else => {},
        }
    }
    // Negative zero is excluded even though it has no fraction or exponent.
    return !std.mem.eql(u8, value, "-0");
}
// References scanner_test.zig so that the tests declared there are included
// when this file's tests are built.
test {
_ = @import("./scanner_test.zig");
}
|
| Generated by zstd-browse2 on 2023-11-04 14:12:19 -0400. |