|
// Notes on standards compliance: https://datatracker.ietf.org/doc/html/rfc8259 // * RFC 8259 requires JSON documents be valid UTF-8, // but makes an allowance for systems that are "part of a closed ecosystem". // I have no idea what that's supposed to mean in the context of a standard specification. // This implementation requires inputs to be valid UTF-8. // * RFC 8259 contradicts itself regarding whether lowercase is allowed in \u hex digits, // but this is probably a bug in the spec, and it's clear that lowercase is meant to be allowed. // (RFC 5234 defines HEXDIG to only allow uppercase.) // * When RFC 8259 refers to a "character", I assume they really mean a "Unicode scalar value". // See http://www.unicode.org/glossary/#unicode_scalar_value . // * RFC 8259 doesn't explicitly disallow unpaired surrogate halves in \u escape sequences, // but vaguely implies that \u escapes are for encoding Unicode "characters" (i.e. Unicode scalar values?), // which would mean that unpaired surrogate halves are forbidden. // By contrast ECMA-404 (a competing(/compatible?) JSON standard, which JavaScript's JSON.parse() conforms to) // explicitly allows unpaired surrogate halves. // This implementation forbids unpaired surrogate halves in \u sequences. // If a high surrogate half appears in a \u sequence, // then a low surrogate half must immediately follow in \u notation. // * RFC 8259 allows implementations to "accept non-JSON forms or extensions". // This implementation does not accept any of that. // * RFC 8259 allows implementations to put limits on "the size of texts", // "the maximum depth of nesting", "the range and precision of numbers", // and "the length and character contents of strings". // This low-level implementation does not limit these, // except where noted above, and except that nesting depth requires memory allocation. 
// Note that this low-level API does not interpret numbers numerically, // but simply emits their source form for some higher level code to make sense of. // * This low-level implementation allows duplicate object keys, // and key/value pairs are emitted in the order they appear in the input. const std = @import("std"); const Allocator = std.mem.Allocator; const ArrayList = std.ArrayList; const assert = std.debug.assert; const BitStack = std.BitStack; |
validate() Scan the input and check for malformed JSON. On |
/// Scans all of `s` and reports whether it is a well-formed JSON document.
///
/// Returns `true` once the scanner reaches `.end_of_document`, and `false` if the
/// scanner reports `error.SyntaxError` or `error.UnexpectedEndOfInput`.
/// `error.OutOfMemory` is passed through to the caller.
/// The scanner created here is released before returning.
pub fn validate(allocator: Allocator, s: []const u8) Allocator.Error!bool {
    var scanner = Scanner.initCompleteInput(allocator, s);
    defer scanner.deinit();

    while (true) {
        const token = scanner.next() catch |err| switch (err) {
            error.OutOfMemory => return error.OutOfMemory,
            // The scanner was initialized with the complete input, so it is not
            // expected to ask for more.
            error.BufferUnderrun => unreachable,
            error.SyntaxError, error.UnexpectedEndOfInput => return false,
        };
        if (token == .end_of_document) return true;
    }
}
Error The parsing errors are divided into two categories: * |
/// Errors that describe a problem with the JSON text itself
/// (as opposed to allocation or buffering conditions).
pub const Error = error{
    SyntaxError,
    UnexpectedEndOfInput,
};
reader() Calls |
/// Convenience constructor: builds a `Reader` with `default_buffer_size` whose
/// underlying reader type is inferred from `io_reader`, and initializes it.
pub fn reader(allocator: Allocator, io_reader: anytype) Reader(default_buffer_size, @TypeOf(io_reader)) {
    const JsonReader = Reader(default_buffer_size, @TypeOf(io_reader));
    return JsonReader.init(allocator, io_reader);
}
default_buffer_size Used by |
/// Buffer size, in bytes, that `reader()` passes to `Reader` (4 KiB).
pub const default_buffer_size = 4 * 1024;
Token The tokens emitted by |
/// A single token produced by the scanner.
///
/// `number` and `string` carry the final (or only) piece of a value.
/// The `partial_*` variants carry an earlier piece of a value that is delivered
/// in several parts, for example when a string is interrupted by an escape
/// sequence or by the end of the current input buffer.
/// The `allocated_*` variants are produced by the `nextAlloc*` functions when the
/// value had to be copied into caller-provided allocator memory.
pub const Token = union(enum) {
    object_begin,
    object_end,
    array_begin,
    array_end,

    true,
    false,
    null,

    number: []const u8,
    partial_number: []const u8,
    allocated_number: []u8,

    string: []const u8,
    partial_string: []const u8,
    /// Decoded bytes of one escape sequence; the suffix is the byte count.
    partial_string_escaped_1: [1]u8,
    partial_string_escaped_2: [2]u8,
    partial_string_escaped_3: [3]u8,
    partial_string_escaped_4: [4]u8,
    allocated_string: []u8,

    end_of_document,
};
TokenType This is only used in |
/// The coarse kind of the upcoming token, as reported by `peekNextTokenType()`.
/// Partial and allocated token variants are not distinguished here:
/// any piece of a number is `.number` and any piece of a string is `.string`.
pub const TokenType = enum {
    object_begin,
    object_end,
    array_begin,
    array_end,
    true,
    false,
    null,
    number,
    string,
    end_of_document,
};
Diagnostics To enable diagnostics, declare |
/// Position information for the scanner's cursor.
/// Attach an instance with `enableDiagnostics()`, which points `cursor_pointer`
/// at the scanner's cursor; the getters must not be called before that.
pub const Diagnostics = struct {
    line_number: u64 = 1,
    /// Cursor value at which the current line started.
    /// Initialized to the wrapped equivalent of -1, i.e. just "before" the input
    /// buffer, so that the wrapping subtraction in `getColumn()` yields a 1-based
    /// column on line 1.
    line_start_cursor: usize = std.math.maxInt(usize),
    total_bytes_before_current_input: u64 = 0,
    cursor_pointer: *const usize = undefined,

    /// Current line number. Starts at 1.
    pub fn getLine(self: *const @This()) u64 {
        return self.line_number;
    }

    /// Current column within the line. Starts at 1.
    pub fn getColumn(self: *const @This()) u64 {
        const cursor = self.cursor_pointer.*;
        return cursor -% self.line_start_cursor;
    }

    /// Byte offset since the start of the input. Starts at 0.
    pub fn getByteOffset(self: *const @This()) u64 {
        const cursor = self.cursor_pointer.*;
        return self.total_bytes_before_current_input + cursor;
    }
};
AllocWhen See the documentation for |
/// Allocation policy for the `nextAlloc*` / `allocNextIntoArrayList*` functions:
/// `.alloc_if_needed` lets a value that arrives in one piece be returned without
/// copying; `.alloc_always` always copies the value into the caller's list.
pub const AllocWhen = enum {
    alloc_if_needed,
    alloc_always,
};
default_max_value_len For security, the maximum size allocated to store a single string or number value is limited to 4MiB by default. This limit can be specified by calling |
/// Default upper bound, in bytes, on a single accumulated string or number
/// value (4 MiB). The `*Max` function variants accept a different limit.
pub const default_max_value_len = 4 << 20;
Reader() Connects a |
/// Returns a type that connects an I/O reader of type `ReaderType` to a `Scanner`.
/// Input is pulled from the reader in chunks of at most `buffer_size` bytes, and
/// the scanner's `error.BufferUnderrun` is handled internally by refilling.
pub fn Reader(comptime buffer_size: usize, comptime ReaderType: type) type {
    return struct {
        scanner: Scanner,
        reader: ReaderType,
        /// Holds the chunk of input most recently handed to the scanner.
        buffer: [buffer_size]u8 = undefined,

        /// Creates a reader in streaming mode. `allocator` is forwarded to the scanner.
        pub fn init(allocator: Allocator, io_reader: ReaderType) @This() {
            return .{
                .reader = io_reader,
                .scanner = Scanner.initStreaming(allocator),
            };
        }

        /// Releases the scanner and invalidates this reader.
        pub fn deinit(self: *@This()) void {
            self.scanner.deinit();
            self.* = undefined;
        }

        /// Forwards to `Scanner.enableDiagnostics()`.
        pub fn enableDiagnostics(self: *@This(), diagnostics: *Diagnostics) void {
            self.scanner.enableDiagnostics(diagnostics);
        }

        pub const NextError = ReaderType.Error || Error || Allocator.Error;
        pub const SkipError = NextError;
        pub const AllocError = NextError || error{ValueTooLong};
        pub const PeekError = ReaderType.Error || Error;

        /// Same as `nextAllocMax()` with `default_max_value_len` as the limit.
        pub fn nextAlloc(self: *@This(), allocator: Allocator, when: AllocWhen) AllocError!Token {
            const limit = default_max_value_len;
            return self.nextAllocMax(allocator, when, limit);
        }

        /// Returns the next token as one complete token.
        /// Numbers and strings are accumulated; the result is `.number` / `.string`
        /// when the scanner could return the value without copying, and
        /// `.allocated_number` / `.allocated_string` (owned by the caller,
        /// allocated with `allocator`) otherwise.
        pub fn nextAllocMax(self: *@This(), allocator: Allocator, when: AllocWhen, max_value_len: usize) AllocError!Token {
            const is_number = switch (try self.peekNextTokenType()) {
                .number => true,
                .string => false,
                // Simple tokens never alloc.
                .object_begin,
                .object_end,
                .array_begin,
                .array_end,
                .true,
                .false,
                .null,
                .end_of_document,
                => return try self.next(),
            };

            var value_list = ArrayList(u8).init(allocator);
            errdefer value_list.deinit();

            const borrowed = try self.allocNextIntoArrayListMax(&value_list, when, max_value_len);
            if (borrowed) |slice| {
                if (is_number) return Token{ .number = slice };
                return Token{ .string = slice };
            }

            const owned = try value_list.toOwnedSlice();
            if (is_number) return Token{ .allocated_number = owned };
            return Token{ .allocated_string = owned };
        }

        /// Same as `allocNextIntoArrayListMax()` with `default_max_value_len` as the limit.
        pub fn allocNextIntoArrayList(self: *@This(), value_list: *ArrayList(u8), when: AllocWhen) AllocError!?[]const u8 {
            const limit = default_max_value_len;
            return self.allocNextIntoArrayListMax(value_list, when, limit);
        }

        /// Calls `Scanner.allocNextIntoArrayListMax()`, refilling the buffer and
        /// retrying whenever the scanner reports `error.BufferUnderrun`.
        pub fn allocNextIntoArrayListMax(self: *@This(), value_list: *ArrayList(u8), when: AllocWhen, max_value_len: usize) AllocError!?[]const u8 {
            while (true) {
                if (self.scanner.allocNextIntoArrayListMax(value_list, when, max_value_len)) |result| {
                    return result;
                } else |err| switch (err) {
                    error.BufferUnderrun => try self.refillBuffer(),
                    else => |other_err| return other_err,
                }
            }
        }

        /// Consumes the next value without returning it.
        /// Containers are skipped through their matching end token; numbers and
        /// strings are consumed piece by piece until the final piece.
        /// It is illegal to call this when the next token is not the start of a value.
        pub fn skipValue(self: *@This()) SkipError!void {
            const upcoming = try self.peekNextTokenType();
            switch (upcoming) {
                .object_begin, .array_begin => {
                    const base_height = self.stackHeight();
                    try self.skipUntilStackHeight(base_height);
                },
                .number, .string => {
                    while (true) {
                        const token = try self.next();
                        switch (token) {
                            .partial_number,
                            .partial_string,
                            .partial_string_escaped_1,
                            .partial_string_escaped_2,
                            .partial_string_escaped_3,
                            .partial_string_escaped_4,
                            => {},
                            .number, .string => return,
                            else => unreachable,
                        }
                    }
                },
                .true, .false, .null => {
                    _ = try self.next();
                },
                // Attempt to skip a non-value token.
                .object_end, .array_end, .end_of_document => unreachable,
            }
        }

        /// Calls `Scanner.skipUntilStackHeight()`, refilling the buffer and
        /// retrying whenever the scanner reports `error.BufferUnderrun`.
        pub fn skipUntilStackHeight(self: *@This(), terminal_stack_height: usize) NextError!void {
            while (true) {
                if (self.scanner.skipUntilStackHeight(terminal_stack_height)) |_| {
                    return;
                } else |err| switch (err) {
                    error.BufferUnderrun => try self.refillBuffer(),
                    else => |other_err| return other_err,
                }
            }
        }

        /// Forwards to `Scanner.stackHeight()`.
        pub fn stackHeight(self: *const @This()) usize {
            return self.scanner.stackHeight();
        }

        /// Forwards to `Scanner.ensureTotalStackCapacity()`.
        pub fn ensureTotalStackCapacity(self: *@This(), height: usize) Allocator.Error!void {
            return self.scanner.ensureTotalStackCapacity(height);
        }

        /// Calls `Scanner.next()`, refilling the buffer and retrying whenever
        /// the scanner reports `error.BufferUnderrun`.
        pub fn next(self: *@This()) NextError!Token {
            while (true) {
                if (self.scanner.next()) |token| {
                    return token;
                } else |err| switch (err) {
                    error.BufferUnderrun => try self.refillBuffer(),
                    else => |other_err| return other_err,
                }
            }
        }

        /// Calls `Scanner.peekNextTokenType()`, refilling the buffer and
        /// retrying whenever the scanner reports `error.BufferUnderrun`.
        pub fn peekNextTokenType(self: *@This()) PeekError!TokenType {
            while (true) {
                if (self.scanner.peekNextTokenType()) |token_type| {
                    return token_type;
                } else |err| switch (err) {
                    error.BufferUnderrun => try self.refillBuffer(),
                    else => |other_err| return other_err,
                }
            }
        }

        /// Reads the next chunk from the underlying reader into `buffer` and
        /// feeds it to the scanner. A zero-length read marks the end of input.
        fn refillBuffer(self: *@This()) ReaderType.Error!void {
            const bytes_read = try self.reader.read(self.buffer[0..]);
            if (bytes_read == 0) {
                self.scanner.endInput();
            } else {
                self.scanner.feedInput(self.buffer[0..bytes_read]);
            }
        }
    };
}
Scanner The lowest level parsing API in this package; supports streaming input with a low memory footprint. The memory requirement is |
pub const Scanner = struct { state: State = .value, string_is_object_key: bool = false, stack: BitStack, value_start: usize = undefined, utf16_code_units: [2]u16 = undefined, input: []const u8 = "", cursor: usize = 0, is_end_of_input: bool = false, diagnostics: ?*Diagnostics = null, |
initStreaming() The allocator is only used to track |
/// Creates a scanner with no input yet; supply input with `feedInput()` and
/// finish with `endInput()`.
/// `allocator` is handed to the nesting-depth stack.
pub fn initStreaming(allocator: Allocator) @This() {
    const nesting_stack = BitStack.init(allocator);
    return .{ .stack = nesting_stack };
}
initCompleteInput() Use this if your input is a single slice. This is effectively equivalent to: ``` initStreaming(allocator); feedInput(complete_input); endInput(); ``` |
/// Creates a scanner over `complete_input`, already marked as the end of input.
/// Use this when the whole document is available as a single slice.
pub fn initCompleteInput(allocator: Allocator, complete_input: []const u8) @This() {
    var scanner = initStreaming(allocator);
    scanner.input = complete_input;
    scanner.is_end_of_input = true;
    return scanner;
}
deinit() |
/// Frees the nesting stack, then overwrites the scanner with `undefined`
/// so that any later use is caught in safe builds.
pub fn deinit(self: *@This()) void {
    self.stack.deinit();
    self.* = undefined;
}
enableDiagnostics() |
/// Attaches `diagnostics` to this scanner.
/// The diagnostics object stores a pointer to this scanner's `cursor` field,
/// so the scanner must stay at the same address while diagnostics are in use.
pub fn enableDiagnostics(self: *@This(), diagnostics: *Diagnostics) void {
    self.diagnostics = diagnostics;
    diagnostics.cursor_pointer = &self.cursor;
}
feedInput() Call this whenever you get |
/// Replaces the current input slice with `input` and rewinds the cursor.
/// The previous slice must have been consumed completely (asserted).
/// When diagnostics are attached, their byte and line-start bookkeeping is
/// rebased onto the new slice first.
pub fn feedInput(self: *@This(), input: []const u8) void {
    const previous_len = self.input.len;
    const previous_cursor = self.cursor;
    assert(previous_cursor == previous_len); // Not done with the last input slice.

    if (self.diagnostics) |diag| {
        diag.total_bytes_before_current_input += previous_len;
        // This usually goes "negative" to measure how far before the beginning
        // of the new buffer the current line started.
        diag.line_start_cursor -%= previous_cursor;
    }

    self.input = input;
    self.cursor = 0;
    self.value_start = 0;
}
endInput() Call this when you will no longer call |
/// Marks the input as finished: no further `feedInput()` calls will follow.
pub fn endInput(self: *@This()) void {
    self.is_end_of_input = true;
}

/// Errors from `next()` and `skipUntilStackHeight()`.
pub const NextError = error{BufferUnderrun} || Allocator.Error || Error;
/// Errors from `nextAlloc()` and `nextAllocMax()`.
pub const AllocError = error{ValueTooLong} || Allocator.Error || Error;
/// Errors from `peekNextTokenType()`.
pub const PeekError = error{BufferUnderrun} || Error;
/// Errors from `skipValue()`.
pub const SkipError = Allocator.Error || Error;
/// Errors from `allocNextIntoArrayList()` and `allocNextIntoArrayListMax()`.
pub const AllocIntoArrayListError = error{BufferUnderrun} || AllocError;
nextAlloc() Equivalent to |
/// Same as `nextAllocMax()` with `default_max_value_len` as the limit.
pub fn nextAlloc(self: *@This(), allocator: Allocator, when: AllocWhen) AllocError!Token {
    const limit = default_max_value_len;
    return self.nextAllocMax(allocator, when, limit);
}
nextAllocMax() This function is only available after |
/// Returns the next token as one complete token.
/// Only available once the end of input has been declared (asserted), which is
/// why `error.BufferUnderrun` is treated as unreachable throughout.
/// Numbers and strings come back as `.number` / `.string` when no copy was
/// needed, or as `.allocated_number` / `.allocated_string` (owned by the
/// caller, allocated with `allocator`) otherwise.
pub fn nextAllocMax(self: *@This(), allocator: Allocator, when: AllocWhen, max_value_len: usize) AllocError!Token {
    assert(self.is_end_of_input); // This function is not available in streaming mode.

    const token_type = self.peekNextTokenType() catch |e| switch (e) {
        error.BufferUnderrun => unreachable,
        else => |err| return err,
    };
    const is_number = switch (token_type) {
        .number => true,
        .string => false,
        // Simple tokens never alloc.
        .object_begin,
        .object_end,
        .array_begin,
        .array_end,
        .true,
        .false,
        .null,
        .end_of_document,
        => return self.next() catch |e| switch (e) {
            error.BufferUnderrun => unreachable,
            else => |err| return err,
        },
    };

    var value_list = ArrayList(u8).init(allocator);
    errdefer value_list.deinit();

    const borrowed = self.allocNextIntoArrayListMax(&value_list, when, max_value_len) catch |e| switch (e) {
        error.BufferUnderrun => unreachable,
        else => |err| return err,
    };
    if (borrowed) |slice| {
        if (is_number) return Token{ .number = slice };
        return Token{ .string = slice };
    }

    const owned = try value_list.toOwnedSlice();
    if (is_number) return Token{ .allocated_number = owned };
    return Token{ .allocated_string = owned };
}
allocNextIntoArrayList() Equivalent to |
/// Same as `allocNextIntoArrayListMax()` with `default_max_value_len` as the limit.
pub fn allocNextIntoArrayList(self: *@This(), value_list: *ArrayList(u8), when: AllocWhen) AllocIntoArrayListError!?[]const u8 {
    const limit = default_max_value_len;
    return self.allocNextIntoArrayListMax(value_list, when, limit);
}
allocNextIntoArrayListMax() The next token type must be either |
/// Consumes one whole number or string, appending its pieces to `value_list`.
/// The next token must be a number or string; check `peekNextTokenType()` first.
///
/// Returns the value's slice directly, leaving `value_list` untouched, when
/// `when` is `.alloc_if_needed` and the value arrived in a single piece.
/// Otherwise returns `null`, and the complete value is in `value_list`.
/// Appending goes through `appendSlice` with `max_value_len` as the limit.
pub fn allocNextIntoArrayListMax(self: *@This(), value_list: *ArrayList(u8), when: AllocWhen, max_value_len: usize) AllocIntoArrayListError!?[]const u8 {
    while (true) {
        switch (try self.next()) {
            // Accumulate partial values.
            .partial_number, .partial_string => |chunk| try appendSlice(value_list, chunk, max_value_len),
            .partial_string_escaped_1 => |bytes| try appendSlice(value_list, &bytes, max_value_len),
            .partial_string_escaped_2 => |bytes| try appendSlice(value_list, &bytes, max_value_len),
            .partial_string_escaped_3 => |bytes| try appendSlice(value_list, &bytes, max_value_len),
            .partial_string_escaped_4 => |bytes| try appendSlice(value_list, &bytes, max_value_len),

            // Return complete values.
            .number, .string => |final_piece| {
                const nothing_accumulated = value_list.items.len == 0;
                if (when == .alloc_if_needed and nothing_accumulated) {
                    // No alloc necessary.
                    return final_piece;
                }
                try appendSlice(value_list, final_piece, max_value_len);
                // The token is complete.
                return null;
            },

            // Only .number and .string token types are allowed here.
            // Check peekNextTokenType() before calling this.
            .object_begin,
            .object_end,
            .array_begin,
            .array_end,
            .true,
            .false,
            .null,
            .end_of_document,
            => unreachable,

            .allocated_number, .allocated_string => unreachable,
        }
    }
}
skipValue() This function is only available after |
/// Consumes the next value without returning it.
/// Only available once the end of input has been declared (asserted), which is
/// why `error.BufferUnderrun` is treated as unreachable throughout.
/// Containers are skipped through their matching end token; numbers and strings
/// are consumed piece by piece until the final piece.
/// It is illegal to call this when the next token is not the start of a value.
pub fn skipValue(self: *@This()) SkipError!void {
    assert(self.is_end_of_input); // This function is not available in streaming mode.

    const upcoming = self.peekNextTokenType() catch |e| switch (e) {
        error.BufferUnderrun => unreachable,
        else => |err| return err,
    };
    switch (upcoming) {
        .object_begin, .array_begin => {
            const base_height = self.stackHeight();
            self.skipUntilStackHeight(base_height) catch |e| switch (e) {
                error.BufferUnderrun => unreachable,
                else => |err| return err,
            };
        },
        .number, .string => {
            while (true) {
                const token = self.next() catch |e| switch (e) {
                    error.BufferUnderrun => unreachable,
                    else => |err| return err,
                };
                switch (token) {
                    .partial_number,
                    .partial_string,
                    .partial_string_escaped_1,
                    .partial_string_escaped_2,
                    .partial_string_escaped_3,
                    .partial_string_escaped_4,
                    => {},
                    .number, .string => return,
                    else => unreachable,
                }
            }
        },
        .true, .false, .null => {
            _ = self.next() catch |e| switch (e) {
                error.BufferUnderrun => unreachable,
                else => |err| return err,
            };
        },
        // Attempt to skip a non-value token.
        .object_end, .array_end, .end_of_document => unreachable,
    }
}
skipUntilStackHeight() Skip tokens until an |
/// Discards tokens until an `.object_end` or `.array_end` leaves the nesting
/// stack at exactly `terminal_stack_height`.
/// Reaching `.end_of_document` first is treated as unreachable, so the target
/// height must be attainable by closing currently open (or upcoming) containers.
pub fn skipUntilStackHeight(self: *@This(), terminal_stack_height: usize) NextError!void {
    while (true) {
        switch (try self.next()) {
            .object_end, .array_end => if (self.stackHeight() == terminal_stack_height) return,
            .end_of_document => unreachable,
            else => {},
        }
    }
}
stackHeight() The depth of |
/// Current nesting depth: the number of entries on the object/array stack.
pub fn stackHeight(self: *const @This()) usize {
    const depth = self.stack.bit_len;
    return depth;
}
ensureTotalStackCapacity() Preallocate memory to hold the given number of nesting levels. |
/// Reserves room in the nesting stack for `height` levels of nesting.
pub fn ensureTotalStackCapacity(self: *@This(), height: usize) Allocator.Error!void {
    return self.stack.ensureTotalCapacity(height);
}
next() See |
/// Advances the state machine until one token can be returned.
///
/// Progress is kept in `self.state`, so a call that fails with
/// `error.BufferUnderrun` can be resumed after more input has been fed.
/// Strings are validated as UTF-8 and split around escape sequences: plain runs
/// are returned as `.partial_string` / `.string` slices and each escape as a
/// `.partial_string_escaped_*` token. `\u` escapes must not contain unpaired
/// surrogate halves. Numbers are returned in their source form.
pub fn next(self: *@This()) NextError!Token {
    state_loop: while (true) {
        switch (self.state) {
            .value => {
                const first = try self.skipWhitespaceExpectByte();
                switch (first) {
                    // Object, Array
                    '{' => {
                        try self.stack.push(OBJECT_MODE);
                        self.cursor += 1;
                        self.state = .object_start;
                        return .object_begin;
                    },
                    '[' => {
                        try self.stack.push(ARRAY_MODE);
                        self.cursor += 1;
                        self.state = .array_start;
                        return .array_begin;
                    },

                    // String
                    '"' => {
                        self.cursor += 1;
                        self.value_start = self.cursor;
                        self.state = .string;
                        continue :state_loop;
                    },

                    // Number: the first byte is part of the value.
                    '-', '0'...'9' => {
                        self.value_start = self.cursor;
                        self.cursor += 1;
                        self.state = switch (first) {
                            '-' => .number_minus,
                            '0' => .number_leading_zero,
                            else => .number_int,
                        };
                        continue :state_loop;
                    },

                    // literal values
                    't', 'f', 'n' => {
                        self.cursor += 1;
                        self.state = switch (first) {
                            't' => .literal_t,
                            'f' => .literal_f,
                            else => .literal_n,
                        };
                        continue :state_loop;
                    },

                    else => return error.SyntaxError,
                }
            },

            .post_value => {
                if (try self.skipWhitespaceCheckEnd()) return .end_of_document;
                const c = self.input[self.cursor];

                // An object key must be followed by ':' and then a value.
                if (self.string_is_object_key) {
                    self.string_is_object_key = false;
                    if (c != ':') return error.SyntaxError;
                    self.cursor += 1;
                    self.state = .value;
                    continue :state_loop;
                }

                switch (c) {
                    '}' => {
                        if (self.stack.pop() != OBJECT_MODE) return error.SyntaxError;
                        self.cursor += 1;
                        // stay in .post_value state.
                        return .object_end;
                    },
                    ']' => {
                        if (self.stack.pop() != ARRAY_MODE) return error.SyntaxError;
                        self.cursor += 1;
                        // stay in .post_value state.
                        return .array_end;
                    },
                    ',' => {
                        self.state = switch (self.stack.peek()) {
                            OBJECT_MODE => .object_post_comma,
                            ARRAY_MODE => .value,
                        };
                        self.cursor += 1;
                        continue :state_loop;
                    },
                    else => return error.SyntaxError,
                }
            },

            .object_start => {
                switch (try self.skipWhitespaceExpectByte()) {
                    '"' => {
                        self.cursor += 1;
                        self.value_start = self.cursor;
                        self.string_is_object_key = true;
                        self.state = .string;
                        continue :state_loop;
                    },
                    '}' => {
                        self.cursor += 1;
                        _ = self.stack.pop();
                        self.state = .post_value;
                        return .object_end;
                    },
                    else => return error.SyntaxError,
                }
            },
            .object_post_comma => {
                switch (try self.skipWhitespaceExpectByte()) {
                    '"' => {
                        self.cursor += 1;
                        self.value_start = self.cursor;
                        self.string_is_object_key = true;
                        self.state = .string;
                        continue :state_loop;
                    },
                    else => return error.SyntaxError,
                }
            },
            .array_start => {
                switch (try self.skipWhitespaceExpectByte()) {
                    ']' => {
                        self.cursor += 1;
                        _ = self.stack.pop();
                        self.state = .post_value;
                        return .array_end;
                    },
                    else => {
                        self.state = .value;
                        continue :state_loop;
                    },
                }
            },

            .number_minus => {
                if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
                self.state = switch (self.input[self.cursor]) {
                    '0' => .number_leading_zero,
                    '1'...'9' => .number_int,
                    else => return error.SyntaxError,
                };
                self.cursor += 1;
                continue :state_loop;
            },
            // After a leading zero no further integer digits are allowed; after a
            // non-zero digit any run of digits is. What may follow is the same.
            .number_leading_zero, .number_int => {
                if (self.state == .number_int) self.skipDigitRun();
                if (self.cursor >= self.input.len) return self.endOfBufferInNumber(true);
                switch (self.input[self.cursor]) {
                    '.' => {
                        self.cursor += 1;
                        self.state = .number_post_dot;
                        continue :state_loop;
                    },
                    'e', 'E' => {
                        self.cursor += 1;
                        self.state = .number_post_e;
                        continue :state_loop;
                    },
                    else => return self.completeNumber(),
                }
            },
            .number_post_dot => {
                if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
                switch (try self.expectByte()) {
                    '0'...'9' => {
                        self.cursor += 1;
                        self.state = .number_frac;
                        continue :state_loop;
                    },
                    else => return error.SyntaxError,
                }
            },
            .number_frac => {
                self.skipDigitRun();
                if (self.cursor >= self.input.len) return self.endOfBufferInNumber(true);
                switch (self.input[self.cursor]) {
                    'e', 'E' => {
                        self.cursor += 1;
                        self.state = .number_post_e;
                        continue :state_loop;
                    },
                    else => return self.completeNumber(),
                }
            },
            .number_post_e => {
                if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
                self.state = switch (self.input[self.cursor]) {
                    '0'...'9' => .number_exp,
                    '+', '-' => .number_post_e_sign,
                    else => return error.SyntaxError,
                };
                self.cursor += 1;
                continue :state_loop;
            },
            .number_post_e_sign => {
                if (self.cursor >= self.input.len) return self.endOfBufferInNumber(false);
                switch (self.input[self.cursor]) {
                    '0'...'9' => {
                        self.cursor += 1;
                        self.state = .number_exp;
                        continue :state_loop;
                    },
                    else => return error.SyntaxError,
                }
            },
            .number_exp => {
                self.skipDigitRun();
                if (self.cursor >= self.input.len) return self.endOfBufferInNumber(true);
                return self.completeNumber();
            },

            .string => {
                while (self.cursor < self.input.len) : (self.cursor += 1) {
                    const byte = self.input[self.cursor];
                    switch (byte) {
                        0...0x1f => return error.SyntaxError, // Bare ASCII control code in string.

                        // ASCII plain text: everything up to 0x7F except '"' (0x22) and '\\' (0x5C).
                        0x20...0x21, 0x23...0x5B, 0x5D...0x7F => continue,

                        // Special characters.
                        '"' => {
                            const result = Token{ .string = self.takeValueSlice() };
                            self.cursor += 1;
                            self.state = .post_value;
                            return result;
                        },
                        '\\' => {
                            const slice = self.takeValueSlice();
                            self.cursor += 1;
                            self.state = .string_backslash;
                            if (slice.len > 0) return Token{ .partial_string = slice };
                            continue :state_loop;
                        },

                        // UTF-8 validation: the lead byte selects which
                        // continuation-byte state comes next.
                        // See http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
                        0xC2...0xF4 => {
                            self.cursor += 1;
                            self.state = switch (byte) {
                                0xC2...0xDF => .string_utf8_last_byte,
                                0xE0 => .string_utf8_second_to_last_byte_guard_against_overlong,
                                0xE1...0xEC, 0xEE...0xEF => .string_utf8_second_to_last_byte,
                                0xED => .string_utf8_second_to_last_byte_guard_against_surrogate_half,
                                0xF0 => .string_utf8_third_to_last_byte_guard_against_overlong,
                                0xF1...0xF3 => .string_utf8_third_to_last_byte,
                                0xF4 => .string_utf8_third_to_last_byte_guard_against_too_large,
                                else => unreachable,
                            };
                            continue :state_loop;
                        },
                        0x80...0xC1, 0xF5...0xFF => return error.SyntaxError, // Invalid UTF-8.
                    }
                }
                if (self.is_end_of_input) return error.UnexpectedEndOfInput;
                const slice = self.takeValueSlice();
                if (slice.len > 0) return Token{ .partial_string = slice };
                return error.BufferUnderrun;
            },
            .string_backslash => {
                const escape = try self.expectByte();
                switch (escape) {
                    '"', '\\', '/' => {
                        // Since these characters now represent themselves literally,
                        // we can simply begin the next plaintext slice here.
                        self.value_start = self.cursor;
                        self.cursor += 1;
                        self.state = .string;
                        continue :state_loop;
                    },
                    'b', 'f', 'n', 'r', 't' => {
                        self.cursor += 1;
                        self.value_start = self.cursor;
                        self.state = .string;
                        const decoded: u8 = switch (escape) {
                            'b' => 0x08,
                            'f' => 0x0c,
                            'n' => '\n',
                            'r' => '\r',
                            't' => '\t',
                            else => unreachable,
                        };
                        return Token{ .partial_string_escaped_1 = [_]u8{decoded} };
                    },
                    'u' => {
                        self.cursor += 1;
                        self.state = .string_backslash_u;
                        continue :state_loop;
                    },
                    else => return error.SyntaxError,
                }
            },

            // The four hex digits of a \u escape, most significant first.
            .string_backslash_u => {
                const nibble = hexNibble(try self.expectByte()) orelse return error.SyntaxError;
                self.utf16_code_units[0] = nibble << 12;
                self.cursor += 1;
                self.state = .string_backslash_u_1;
                continue :state_loop;
            },
            .string_backslash_u_1 => {
                const nibble = hexNibble(try self.expectByte()) orelse return error.SyntaxError;
                self.utf16_code_units[0] |= nibble << 8;
                self.cursor += 1;
                self.state = .string_backslash_u_2;
                continue :state_loop;
            },
            .string_backslash_u_2 => {
                const nibble = hexNibble(try self.expectByte()) orelse return error.SyntaxError;
                self.utf16_code_units[0] |= nibble << 4;
                self.cursor += 1;
                self.state = .string_backslash_u_3;
                continue :state_loop;
            },
            .string_backslash_u_3 => {
                const nibble = hexNibble(try self.expectByte()) orelse return error.SyntaxError;
                self.utf16_code_units[0] |= nibble;
                self.cursor += 1;

                const unit = self.utf16_code_units[0];
                if (std.unicode.utf16IsHighSurrogate(unit)) {
                    self.state = .string_surrogate_half;
                    continue :state_loop;
                }
                if (std.unicode.utf16IsLowSurrogate(unit)) {
                    return error.SyntaxError; // Unexpected low surrogate half.
                }
                self.value_start = self.cursor;
                self.state = .string;
                return partialStringCodepoint(unit);
            },

            // A high surrogate must be followed immediately by "\uDC00".."\uDFFF".
            .string_surrogate_half => {
                // Expected low surrogate half.
                try self.consumeByteInRange('\\', '\\', .string_surrogate_half_backslash);
                continue :state_loop;
            },
            .string_surrogate_half_backslash => {
                // Expected low surrogate half.
                try self.consumeByteInRange('u', 'u', .string_surrogate_half_backslash_u);
                continue :state_loop;
            },
            .string_surrogate_half_backslash_u => {
                switch (try self.expectByte()) {
                    'D', 'd' => {},
                    else => return error.SyntaxError, // Expected low surrogate half.
                }
                self.cursor += 1;
                self.utf16_code_units[1] = 0xD << 12;
                self.state = .string_surrogate_half_backslash_u_1;
                continue :state_loop;
            },
            .string_surrogate_half_backslash_u_1 => {
                const c = try self.expectByte();
                const nibble: u16 = switch (c) {
                    'C'...'F' => c - 'A' + 10,
                    'c'...'f' => c - 'a' + 10,
                    else => return error.SyntaxError, // Expected low surrogate half.
                };
                self.cursor += 1;
                self.utf16_code_units[1] |= nibble << 8;
                self.state = .string_surrogate_half_backslash_u_2;
                continue :state_loop;
            },
            .string_surrogate_half_backslash_u_2 => {
                const nibble = hexNibble(try self.expectByte()) orelse return error.SyntaxError;
                self.cursor += 1;
                self.utf16_code_units[1] |= nibble << 4;
                self.state = .string_surrogate_half_backslash_u_3;
                continue :state_loop;
            },
            .string_surrogate_half_backslash_u_3 => {
                const nibble = hexNibble(try self.expectByte()) orelse return error.SyntaxError;
                self.utf16_code_units[1] |= nibble;
                self.cursor += 1;
                self.value_start = self.cursor;
                self.state = .string;
                const code_point = std.unicode.utf16DecodeSurrogatePair(&self.utf16_code_units) catch unreachable;
                return partialStringCodepoint(code_point);
            },

            // UTF-8 continuation bytes; any byte outside the range is invalid UTF-8.
            .string_utf8_last_byte => {
                try self.consumeByteInRange(0x80, 0xBF, .string);
                continue :state_loop;
            },
            .string_utf8_second_to_last_byte => {
                try self.consumeByteInRange(0x80, 0xBF, .string_utf8_last_byte);
                continue :state_loop;
            },
            .string_utf8_second_to_last_byte_guard_against_overlong => {
                try self.consumeByteInRange(0xA0, 0xBF, .string_utf8_last_byte);
                continue :state_loop;
            },
            .string_utf8_second_to_last_byte_guard_against_surrogate_half => {
                try self.consumeByteInRange(0x80, 0x9F, .string_utf8_last_byte);
                continue :state_loop;
            },
            .string_utf8_third_to_last_byte => {
                try self.consumeByteInRange(0x80, 0xBF, .string_utf8_second_to_last_byte);
                continue :state_loop;
            },
            .string_utf8_third_to_last_byte_guard_against_overlong => {
                try self.consumeByteInRange(0x90, 0xBF, .string_utf8_second_to_last_byte);
                continue :state_loop;
            },
            .string_utf8_third_to_last_byte_guard_against_too_large => {
                try self.consumeByteInRange(0x80, 0x8F, .string_utf8_second_to_last_byte);
                continue :state_loop;
            },

            // Remaining letters of `true`.
            .literal_t => {
                try self.consumeByteInRange('r', 'r', .literal_tr);
                continue :state_loop;
            },
            .literal_tr => {
                try self.consumeByteInRange('u', 'u', .literal_tru);
                continue :state_loop;
            },
            .literal_tru => {
                try self.consumeByteInRange('e', 'e', .post_value);
                return .true;
            },

            // Remaining letters of `false`.
            .literal_f => {
                try self.consumeByteInRange('a', 'a', .literal_fa);
                continue :state_loop;
            },
            .literal_fa => {
                try self.consumeByteInRange('l', 'l', .literal_fal);
                continue :state_loop;
            },
            .literal_fal => {
                try self.consumeByteInRange('s', 's', .literal_fals);
                continue :state_loop;
            },
            .literal_fals => {
                try self.consumeByteInRange('e', 'e', .post_value);
                return .false;
            },

            // Remaining letters of `null`.
            .literal_n => {
                try self.consumeByteInRange('u', 'u', .literal_nu);
                continue :state_loop;
            },
            .literal_nu => {
                try self.consumeByteInRange('l', 'l', .literal_nul);
                continue :state_loop;
            },
            .literal_nul => {
                try self.consumeByteInRange('l', 'l', .post_value);
                return .null;
            },
        }
        unreachable;
    }
}

/// Requires the byte at the cursor to lie in `first...last` (inclusive); on
/// success steps past it and switches to `next_state`. Nothing is modified on failure.
fn consumeByteInRange(self: *@This(), first: u8, last: u8, next_state: State) NextError!void {
    const byte = try self.expectByte();
    if (byte < first or byte > last) return error.SyntaxError;
    self.cursor += 1;
    self.state = next_state;
}

/// Value of one ASCII hex digit (either case), or null if `c` is not a hex digit.
fn hexNibble(c: u8) ?u16 {
    return switch (c) {
        '0'...'9' => @as(u16, c - '0'),
        'A'...'F' => @as(u16, c - 'A' + 10),
        'a'...'f' => @as(u16, c - 'a' + 10),
        else => null,
    };
}

/// Moves the cursor past any run of ASCII digits, stopping at the first
/// non-digit or at the end of the current input slice.
fn skipDigitRun(self: *@This()) void {
    while (self.cursor < self.input.len) : (self.cursor += 1) {
        switch (self.input[self.cursor]) {
            '0'...'9' => {},
            else => return,
        }
    }
}

/// Ends the current number: switches to `.post_value` and returns the pending slice.
fn completeNumber(self: *@This()) Token {
    self.state = .post_value;
    return Token{ .number = self.takeValueSlice() };
}
peekNextTokenType() Seeks ahead in the input until the first byte of the next token (or the end of the input) determines which type of token will be returned from the next `next*()` call. |
/// Classifies the upcoming token without producing it.
///
/// Skips inter-token whitespace and steps over a pending `:` or `,` separator, then
/// reports the token type implied by the first byte of the next token. If the scanner
/// is already partway through a number, string, or literal, the type of that
/// in-progress token is reported instead. Returns `.end_of_document` when
/// `skipWhitespaceCheckEnd()` reports that the input is complete.
///
/// The cursor is only ever advanced past whitespace, `:` and `,` here.
///
/// Returns `error.SyntaxError` when the next byte is not acceptable in the current
/// state, plus whatever the whitespace-skipping helpers return when the buffer runs out.
pub fn peekNextTokenType(self: *@This()) PeekError!TokenType {
    state_loop: while (true) {
        switch (self.state) {
            .value => return switch (try self.skipWhitespaceExpectByte()) {
                '{' => .object_begin,
                '[' => .array_begin,
                '"' => .string,
                '-', '0'...'9' => .number,
                't' => .true,
                'f' => .false,
                'n' => .null,
                else => error.SyntaxError,
            },

            .post_value => {
                if (try self.skipWhitespaceCheckEnd()) return .end_of_document;

                const byte = self.input[self.cursor];

                // While the `string_is_object_key` flag is set, the only acceptable byte is ':'.
                // The flag is cleared regardless of the outcome.
                if (self.string_is_object_key) {
                    self.string_is_object_key = false;
                    if (byte != ':') return error.SyntaxError;
                    self.cursor += 1;
                    self.state = .value;
                    continue :state_loop;
                }

                switch (byte) {
                    '}' => return .object_end,
                    ']' => return .array_end,
                    ',' => {
                        // Step over the comma; what may follow depends on the innermost container.
                        self.cursor += 1;
                        self.state = switch (self.stack.peek()) {
                            OBJECT_MODE => .object_post_comma,
                            ARRAY_MODE => .value,
                        };
                        continue :state_loop;
                    },
                    else => return error.SyntaxError,
                }
            },

            .object_start => return switch (try self.skipWhitespaceExpectByte()) {
                '"' => .string,
                '}' => .object_end,
                else => error.SyntaxError,
            },

            .object_post_comma => return switch (try self.skipWhitespaceExpectByte()) {
                '"' => .string,
                else => error.SyntaxError,
            },

            .array_start => {
                const byte = try self.skipWhitespaceExpectByte();
                if (byte == ']') return .array_end;
                // Anything else is classified by the `.value` prong on the next iteration.
                self.state = .value;
                continue :state_loop;
            },

            // Already inside a number.
            .number_minus,
            .number_leading_zero,
            .number_int,
            .number_post_dot,
            .number_frac,
            .number_post_e,
            .number_post_e_sign,
            .number_exp,
            => return .number,

            // Already inside a string (plain content or an escape sequence).
            .string,
            .string_backslash,
            .string_backslash_u,
            .string_backslash_u_1,
            .string_backslash_u_2,
            .string_backslash_u_3,
            .string_surrogate_half,
            .string_surrogate_half_backslash,
            .string_surrogate_half_backslash_u,
            .string_surrogate_half_backslash_u_1,
            .string_surrogate_half_backslash_u_2,
            .string_surrogate_half_backslash_u_3,
            => return .string,

            // Already inside a multi-byte UTF-8 sequence within a string.
            .string_utf8_last_byte,
            .string_utf8_second_to_last_byte,
            .string_utf8_second_to_last_byte_guard_against_overlong,
            .string_utf8_second_to_last_byte_guard_against_surrogate_half,
            .string_utf8_third_to_last_byte,
            .string_utf8_third_to_last_byte_guard_against_overlong,
            .string_utf8_third_to_last_byte_guard_against_too_large,
            => return .string,

            // Already inside one of the keyword literals.
            .literal_t,
            .literal_tr,
            .literal_tru,
            => return .true,

            .literal_f,
            .literal_fa,
            .literal_fal,
            .literal_fals,
            => return .false,

            .literal_n,
            .literal_nu,
            .literal_nul,
            => return .null,
        }
        unreachable;
    }
}

/// Every position the scanner's state machine can be in between bytes.
const State = enum {
    // Between tokens.
    value,
    post_value,
    object_start,
    object_post_comma,
    array_start,

    // Inside a number.
    number_minus,
    number_leading_zero,
    number_int,
    number_post_dot,
    number_frac,
    number_post_e,
    number_post_e_sign,
    number_exp,

    // Inside a string, including `\` and `\uXXXX` escapes.
    string,
    string_backslash,
    string_backslash_u,
    string_backslash_u_1,
    string_backslash_u_2,
    string_backslash_u_3,
    string_surrogate_half,
    string_surrogate_half_backslash,
    string_surrogate_half_backslash_u,
    string_surrogate_half_backslash_u_1,
    string_surrogate_half_backslash_u_2,
    string_surrogate_half_backslash_u_3,

    // Inside a multi-byte UTF-8 sequence. State letters are those used in
    // http://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
    string_utf8_last_byte, // State A
    string_utf8_second_to_last_byte, // State B
    string_utf8_second_to_last_byte_guard_against_overlong, // State C
    string_utf8_second_to_last_byte_guard_against_surrogate_half, // State D
    string_utf8_third_to_last_byte, // State E
    string_utf8_third_to_last_byte_guard_against_overlong, // State F
    string_utf8_third_to_last_byte_guard_against_too_large, // State G

    // Inside `true`, `false`, or `null`.
    literal_t,
    literal_tr,
    literal_tru,
    literal_f,
    literal_fa,
    literal_fal,
    literal_fals,
    literal_n,
    literal_nu,
    literal_nul,
};

/// Returns the byte under the cursor without consuming it.
/// With the buffer exhausted, returns `error.UnexpectedEndOfInput` if
/// `is_end_of_input` is set and `error.BufferUnderrun` otherwise.
fn expectByte(self: *const @This()) !u8 {
    if (self.cursor >= self.input.len) {
        if (self.is_end_of_input) return error.UnexpectedEndOfInput;
        return error.BufferUnderrun;
    }
    return self.input[self.cursor];
}

/// Advances the cursor past any run of ' ', '\t', '\r' and '\n',
/// stopping at the first other byte or at the end of the buffer.
/// Each '\n' is recorded in `diagnostics` when diagnostics are attached.
fn skipWhitespace(self: *@This()) void {
    while (self.cursor < self.input.len) {
        const byte = self.input[self.cursor];
        if (byte == '\n') {
            if (self.diagnostics) |diag| {
                diag.line_number += 1;
                // The recorded position is that of the newline itself, so plain
                // subtraction from a later cursor yields a 1-based column number.
                diag.line_start_cursor = self.cursor;
            }
        } else if (byte != ' ' and byte != '\t' and byte != '\r') {
            return;
        }
        self.cursor += 1;
    }
}

/// Skips whitespace, then returns the next byte without consuming it
/// (same errors as `expectByte`).
fn skipWhitespaceExpectByte(self: *@This()) !u8 {
    self.skipWhitespace();
    const byte = try self.expectByte();
    return byte;
}

/// Skips whitespace and reports whether the document has ended.
///
/// Returns `true` only when the buffer is exhausted, `is_end_of_input` is set, and
/// the stack height is 0. Returns `false` when a byte is available and the stack
/// height is non-zero.
///
/// Errors:
/// * `error.BufferUnderrun`: buffer exhausted but more input may follow.
/// * `error.UnexpectedEndOfInput`: input ended while the stack height is non-zero.
/// * `error.SyntaxError`: a non-whitespace byte is present while the stack height is 0.
fn skipWhitespaceCheckEnd(self: *@This()) !bool {
    self.skipWhitespace();

    if (self.cursor < self.input.len) {
        // A non-whitespace byte is waiting.
        if (self.stackHeight() == 0) return error.SyntaxError;
        return false;
    }

    // The buffer is exhausted.
    if (!self.is_end_of_input) return error.BufferUnderrun;
    if (self.stackHeight() != 0) return error.UnexpectedEndOfInput;
    return true;
}

/// Returns `input[value_start..cursor]` and then moves `value_start` up to `cursor`,
/// so the same bytes are not handed out twice.
fn takeValueSlice(self: *@This()) []const u8 {
    // The deferred update runs after the returned slice has been computed.
    defer self.value_start = self.cursor;
    return self.input[self.value_start..self.cursor];
}

/// Produces the token to emit when the buffer runs out while scanning a number.
///
/// * At end of input: returns a `.number` token and moves to `.post_value`,
///   or `error.UnexpectedEndOfInput` when `allow_end` is false.
/// * Otherwise: returns a `.partial_number` token holding the bytes taken so far,
///   or `error.BufferUnderrun` when no bytes were taken.
fn endOfBufferInNumber(self: *@This(), allow_end: bool) !Token {
    const digits = self.takeValueSlice();

    if (!self.is_end_of_input) {
        if (digits.len == 0) return error.BufferUnderrun;
        return Token{ .partial_number = digits };
    }

    if (!allow_end) return error.UnexpectedEndOfInput;
    self.state = .post_value;
    return Token{ .number = digits };
}

/// UTF-8 encodes `code_point` and wraps the 1 to 4 resulting bytes in the
/// `partial_string_escaped_N` token of matching length.
/// An encoding failure is treated as unreachable.
fn partialStringCodepoint(code_point: u21) Token {
    var encoded: [4]u8 = undefined;
    const encoded_len = std.unicode.utf8Encode(code_point, &encoded) catch unreachable;
    return switch (encoded_len) {
        1 => Token{ .partial_string_escaped_1 = encoded[0..1].* },
        2 => Token{ .partial_string_escaped_2 = encoded[0..2].* },
        3 => Token{ .partial_string_escaped_3 = encoded[0..3].* },
        4 => Token{ .partial_string_escaped_4 = encoded[0..4].* },
        else => unreachable,
    };
}
};

// Values stored on the container stack to distinguish the two container kinds.
const OBJECT_MODE = 0;
const ARRAY_MODE = 1;

/// Appends `buf` to `list` unless the resulting length would exceed `max_value_len`
/// (or overflow `usize`), in which case `error.ValueTooLong` is returned and `list`
/// is left untouched. Errors from `list.appendSlice` are propagated.
fn appendSlice(list: *std.ArrayList(u8), buf: []const u8, max_value_len: usize) !void {
    const total_len = std.math.add(usize, list.items.len, buf.len) catch return error.ValueTooLong;
    if (total_len > max_value_len) return error.ValueTooLong;
    return list.appendSlice(buf);
}
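isNumberFormattedLikeAnInteger() For the slice you get from a `Token.number`, returns true if the number contains no fraction or exponent component and is not `-0`. |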
/// Reports whether `value` is written without any fraction or exponent part.
///
/// Returns false when `value` is exactly `-0`, or when it contains any of the
/// bytes `.`, `e`, or `E`; returns true for everything else (including the empty slice).
/// The check is purely textual: `1.0` yields false even though it denotes a whole number.
pub fn isNumberFormattedLikeAnInteger(value: []const u8) bool {
    const is_negative_zero = value.len == 2 and value[0] == '-' and value[1] == '0';
    if (is_negative_zero) return false;

    for (value) |byte| {
        switch (byte) {
            '.', 'e', 'E' => return false,
            else => {},
        }
    }
    return true;
}

test {
    _ = @import("./scanner_test.zig");
}
Generated by zstd-browse2 on 2023-11-04 14:12:19 -0400. |