zig/lib/std/ascii.zig

The 7-bit [ASCII](https://en.wikipedia.org/wiki/ASCII) character encoding standard.

This is not to be confused with the 8-bit [extended ASCII](https://en.wikipedia.org/wiki/Extended_ASCII) character encoding.

Even though this module concerns itself with 7-bit ASCII, functions use u8 as the type instead of u7 for convenience and compatibility. Characters outside of the 7-bit range are gracefully handled (e.g. by returning false).

See also: https://en.wikipedia.org/wiki/ASCII#Character_set


const std = @import("std");

control_code

The C0 control codes of the ASCII encoding.

See also: https://en.wikipedia.org/wiki/C0_and_C1_control_codes and isControl

/// The C0 control codes of the ASCII encoding.
///
/// See also: https://en.wikipedia.org/wiki/C0_and_C1_control_codes and `isControl`
pub const control_code = struct {
    /// Null.
    pub const nul = 0x00;
    /// Start of Heading.
    pub const soh = 0x01;
    /// Start of Text.
    pub const stx = 0x02;
    /// End of Text.
    pub const etx = 0x03;
    /// End of Transmission.
    pub const eot = 0x04;
    /// Enquiry.
    pub const enq = 0x05;
    /// Acknowledge.
    pub const ack = 0x06;
    /// Bell, Alert.
    pub const bel = 0x07;
    /// Backspace.
    pub const bs = 0x08;
    /// Horizontal Tab, Tab ('\t').
    pub const ht = 0x09;
    /// Line Feed, Newline ('\n').
    pub const lf = 0x0A;
    /// Vertical Tab.
    pub const vt = 0x0B;
    /// Form Feed.
    pub const ff = 0x0C;
    /// Carriage Return ('\r').
    pub const cr = 0x0D;
    /// Shift Out.
    pub const so = 0x0E;
    /// Shift In.
    pub const si = 0x0F;
    /// Data Link Escape.
    pub const dle = 0x10;
    /// Device Control One (XON).
    pub const dc1 = 0x11;
    /// Device Control Two.
    pub const dc2 = 0x12;
    /// Device Control Three (XOFF).
    pub const dc3 = 0x13;
    /// Device Control Four.
    pub const dc4 = 0x14;
    /// Negative Acknowledge.
    pub const nak = 0x15;
    /// Synchronous Idle.
    pub const syn = 0x16;
    /// End of Transmission Block.
    pub const etb = 0x17;
    /// Cancel.
    pub const can = 0x18;
    /// End of Medium.
    pub const em = 0x19;
    /// Substitute.
    pub const sub = 0x1A;
    /// Escape.
    pub const esc = 0x1B;
    /// File Separator.
    pub const fs = 0x1C;
    /// Group Separator.
    pub const gs = 0x1D;
    /// Record Separator.
    pub const rs = 0x1E;
    /// Unit Separator.
    pub const us = 0x1F;

    /// Delete.
    pub const del = 0x7F;

    /// An alias to `dc1`.
    pub const xon = dc1;
    /// An alias to `dc3`.
    pub const xoff = dc3;
};

isAlphanumeric()

Member descriptions of control_code, in declaration order: Null. Start of Heading. Start of Text. End of Text. End of Transmission. Enquiry. Acknowledge. Bell, Alert. Backspace. Horizontal Tab, Tab ('\t'). Line Feed, Newline ('\n'). Vertical Tab. Form Feed. Carriage Return ('\r'). Shift Out. Shift In. Data Link Escape. Device Control One (XON). Device Control Two. Device Control Three (XOFF). Device Control Four. Negative Acknowledge. Synchronous Idle. End of Transmission Block. Cancel. End of Medium. Substitute. Escape. File Separator. Group Separator. Record Separator. Unit Separator. Delete. An alias to dc1. An alias to dc3.

Returns whether the character is alphanumeric: A-Z, a-z, or 0-9.

/// Reports whether `c` is an ASCII letter or decimal digit
/// (`A-Z`, `a-z`, or `0-9`). Every other byte yields `false`.
pub fn isAlphanumeric(c: u8) bool {
    const is_digit = '0' <= c and c <= '9';
    const is_upper = 'A' <= c and c <= 'Z';
    const is_lower = 'a' <= c and c <= 'z';
    return is_digit or is_upper or is_lower;
}

isAlphabetic()

Returns whether the character is alphabetic: A-Z or a-z.

/// Reports whether `c` is an ASCII letter (`A-Z` or `a-z`).
/// Every other byte yields `false`.
pub fn isAlphabetic(c: u8) bool {
    if ('A' <= c and c <= 'Z') return true;
    return 'a' <= c and c <= 'z';
}

isControl()

Returns whether the character is a control character.

See also: control_code

/// Reports whether `c` is a control character: any value from
/// `control_code.nul` through `control_code.us`, or `control_code.del`.
///
/// See also: `control_code`
pub fn isControl(c: u8) bool {
    return switch (c) {
        control_code.nul...control_code.us, control_code.del => true,
        else => false,
    };
}

isDigit()

Returns whether the character is a digit.

/// Reports whether `c` is a decimal digit (`0-9`).
pub fn isDigit(c: u8) bool {
    return c >= '0' and c <= '9';
}

isLower()

Returns whether the character is a lowercase letter.

/// Reports whether `c` is a lowercase ASCII letter (`a-z`).
pub fn isLower(c: u8) bool {
    return c >= 'a' and c <= 'z';
}

isPrint()

Returns whether the character is printable and has some graphical representation, including the space character.

/// Reports whether `c` is printable, i.e. has some graphical representation.
/// The space character counts as printable.
pub fn isPrint(c: u8) bool {
    // The 7-bit range with the control characters (0x00-0x1F and 0x7F)
    // removed is exactly the contiguous run from ' ' (0x20) to '~' (0x7E).
    return switch (c) {
        ' '...'~' => true,
        else => false,
    };
}

isWhitespace()

Returns whether this character is included in whitespace.

/// Reports whether `c` is one of the characters listed in `whitespace`.
pub fn isWhitespace(c: u8) bool {
    for (whitespace) |candidate| {
        if (candidate == c) return true;
    }
    return false;
}

whitespace

Whitespace for general use. This may be used with e.g. std.mem.trim to trim whitespace.

See also: isWhitespace

/// Whitespace for general use.
/// This may be used with e.g. `std.mem.trim` to trim whitespace.
///
/// See also: `isWhitespace`
pub const whitespace = [_]u8{ ' ', '\t', '\n', '\r', control_code.vt, control_code.ff };

Test:

whitespace

test "whitespace" {
    const testing = std.testing;

    // Every listed whitespace character must be recognized as whitespace.
    for (whitespace) |char| {
        try testing.expect(isWhitespace(char));
    }

    // Conversely, every 7-bit value recognized as whitespace must be listed.
    var code: u8 = 0;
    while (code < 128) : (code += 1) {
        if (!isWhitespace(code)) continue;
        try testing.expect(std.mem.indexOfScalar(u8, &whitespace, code) != null);
    }
}

isUpper()

Returns whether the character is an uppercase letter.

/// Reports whether `c` is an uppercase ASCII letter (`A-Z`).
pub fn isUpper(c: u8) bool {
    return c >= 'A' and c <= 'Z';
}

isHex()

Returns whether the character is a hexadecimal digit: A-F, a-f, or 0-9.

/// Reports whether `c` is a hexadecimal digit: `0-9`, `A-F`, or `a-f`.
pub fn isHex(c: u8) bool {
    const is_decimal = '0' <= c and c <= '9';
    const is_upper_hex = 'A' <= c and c <= 'F';
    const is_lower_hex = 'a' <= c and c <= 'f';
    return is_decimal or is_upper_hex or is_lower_hex;
}

isASCII()

Returns whether the character is a 7-bit ASCII character.

/// Reports whether `c` lies in the 7-bit ASCII range (0x00 through 0x7F).
pub fn isASCII(c: u8) bool {
    return c <= 0x7F;
}

toUpper()

Returns the uppercase form of the character. The character is returned unchanged if it is already uppercase or is not a letter.

/// Returns the uppercase form of `c`.
/// Any byte that is not a lowercase ASCII letter (`a-z`) is returned unchanged.
pub fn toUpper(c: u8) u8 {
    // Bit 5 (0x20) is the only difference between an ASCII lowercase
    // letter and its uppercase counterpart; clearing it uppercases.
    const case_bit: u8 = 0b00100000;
    return switch (c) {
        'a'...'z' => c & ~case_bit,
        else => c,
    };
}

toLower()

Returns the lowercase form of the character. The character is returned unchanged if it is already lowercase or is not a letter.

/// Returns the lowercase form of `c`.
/// Any byte that is not an uppercase ASCII letter (`A-Z`) is returned unchanged.
pub fn toLower(c: u8) u8 {
    // Bit 5 (0x20) is the only difference between an ASCII uppercase
    // letter and its lowercase counterpart; setting it lowercases.
    const case_bit: u8 = 0b00100000;
    return switch (c) {
        'A'...'Z' => c | case_bit,
        else => c,
    };
}

Test:

ASCII character classes

test "ASCII character classes" {
    const testing = std.testing;

    // Input/expected-output pairs for the case conversion functions.
    const Conversion = [2]u8;

    // isControl
    for ([_]u8{ control_code.nul, control_code.ff, control_code.us, control_code.del }) |c| {
        try testing.expect(isControl(c));
    }
    for ([_]u8{ 'a', 'z', ' ', 0x80, 0xff }) |c| {
        try testing.expect(!isControl(c));
    }

    // toUpper / isUpper
    for ([_]Conversion{ .{ 'c', 'C' }, .{ ':', ':' }, .{ '\xab', '\xab' } }) |pair| {
        try testing.expect(pair[1] == toUpper(pair[0]));
    }
    for ([_]u8{ 'z', 0x80, 0xff }) |c| {
        try testing.expect(!isUpper(c));
    }

    // toLower / isLower
    for ([_]Conversion{ .{ 'C', 'c' }, .{ ':', ':' }, .{ '\xab', '\xab' } }) |pair| {
        try testing.expect(pair[1] == toLower(pair[0]));
    }
    for ([_]u8{ 'Z', 0x80, 0xff }) |c| {
        try testing.expect(!isLower(c));
    }

    // isAlphanumeric
    for ([_]u8{ 'Z', 'z', '5', 'a' }) |c| {
        try testing.expect(isAlphanumeric(c));
    }
    for ([_]u8{ '!', 0x80, 0xff }) |c| {
        try testing.expect(!isAlphanumeric(c));
    }

    // isAlphabetic
    for ([_]u8{ 'c', 'Z' }) |c| {
        try testing.expect(isAlphabetic(c));
    }
    for ([_]u8{ '5', '@', 0x80, 0xff }) |c| {
        try testing.expect(!isAlphabetic(c));
    }

    // isWhitespace
    for ([_]u8{ ' ', '\t', '\r', '\n', control_code.ff }) |c| {
        try testing.expect(isWhitespace(c));
    }
    for ([_]u8{ '.', control_code.us, 0x80, 0xff }) |c| {
        try testing.expect(!isWhitespace(c));
    }

    // isHex
    for ([_]u8{ 'b', 'F', '9' }) |c| {
        try testing.expect(isHex(c));
    }
    for ([_]u8{ 'g', 0x80, 0xff }) |c| {
        try testing.expect(!isHex(c));
    }

    // isDigit
    for ([_]u8{ '0', '9' }) |c| {
        try testing.expect(isDigit(c));
    }
    for ([_]u8{ '~', 0x80, 0xff }) |c| {
        try testing.expect(!isDigit(c));
    }

    // isPrint
    for ([_]u8{ ' ', '@', '~' }) |c| {
        try testing.expect(isPrint(c));
    }
    for ([_]u8{ control_code.esc, 0x80, 0xff }) |c| {
        try testing.expect(!isPrint(c));
    }
}

lowerString()

Writes a lower case copy of ascii_string to output. Asserts output.len >= ascii_string.len.

/// Writes a lower case copy of `ascii_string` into the front of `output` and
/// returns the written portion, `output[0..ascii_string.len]`.
/// Each byte is converted with `toLower`.
/// Asserts `output.len >= ascii_string.len`.
pub fn lowerString(output: []u8, ascii_string: []const u8) []u8 {
    std.debug.assert(output.len >= ascii_string.len);
    const written = output[0..ascii_string.len];
    for (written, ascii_string) |*dest, src| {
        dest.* = toLower(src);
    }
    return written;
}

Test:

lowerString

test "lowerString" {
    const input = "aBcDeFgHiJkLmNOPqrst0234+💩!";
    const expected = "abcdefghijklmnopqrst0234+💩!";

    // Scratch space; only the first `input.len` bytes are written.
    var storage: [1024]u8 = undefined;
    const lowered = lowerString(&storage, input);
    try std.testing.expectEqualStrings(expected, lowered);
}

allocLowerString()

Allocates a lower case copy of ascii_string. Caller owns returned string and must free with allocator.

/// Allocates a buffer of `ascii_string.len` bytes from `allocator` and fills
/// it with a lower case copy of `ascii_string`.
/// Caller owns the returned string and must free it with `allocator`.
/// Returns an error if the allocation fails.
pub fn allocLowerString(allocator: std.mem.Allocator, ascii_string: []const u8) ![]u8 {
    const buffer = try allocator.alloc(u8, ascii_string.len);
    // The buffer is exactly as long as the input, so the slice returned by
    // `lowerString` is the whole buffer.
    _ = lowerString(buffer, ascii_string);
    return buffer;
}

Test:

allocLowerString

test "allocLowerString" {
    const gpa = std.testing.allocator;
    const input = "aBcDeFgHiJkLmNOPqrst0234+💩!";

    const lowered = try allocLowerString(gpa, input);
    defer gpa.free(lowered);

    try std.testing.expectEqualStrings("abcdefghijklmnopqrst0234+💩!", lowered);
}

upperString()

Writes an upper case copy of ascii_string to output. Asserts output.len >= ascii_string.len.

/// Writes an upper case copy of `ascii_string` into the front of `output` and
/// returns the written portion, `output[0..ascii_string.len]`.
/// Each byte is converted with `toUpper`.
/// Asserts `output.len >= ascii_string.len`.
pub fn upperString(output: []u8, ascii_string: []const u8) []u8 {
    std.debug.assert(output.len >= ascii_string.len);
    const written = output[0..ascii_string.len];
    for (written, ascii_string) |*dest, src| {
        dest.* = toUpper(src);
    }
    return written;
}

Test:

upperString

test "upperString" {
    const input = "aBcDeFgHiJkLmNOPqrst0234+💩!";
    const expected = "ABCDEFGHIJKLMNOPQRST0234+💩!";

    // Scratch space; only the first `input.len` bytes are written.
    var storage: [1024]u8 = undefined;
    const uppered = upperString(&storage, input);
    try std.testing.expectEqualStrings(expected, uppered);
}

allocUpperString()

Allocates an upper case copy of ascii_string. Caller owns returned string and must free with allocator.

/// Allocates a buffer of `ascii_string.len` bytes from `allocator` and fills
/// it with an upper case copy of `ascii_string`.
/// Caller owns the returned string and must free it with `allocator`.
/// Returns an error if the allocation fails.
pub fn allocUpperString(allocator: std.mem.Allocator, ascii_string: []const u8) ![]u8 {
    const buffer = try allocator.alloc(u8, ascii_string.len);
    // The buffer is exactly as long as the input, so the slice returned by
    // `upperString` is the whole buffer.
    _ = upperString(buffer, ascii_string);
    return buffer;
}

Test:

allocUpperString

test "allocUpperString" {
    const gpa = std.testing.allocator;
    const input = "aBcDeFgHiJkLmNOPqrst0234+💩!";

    const uppered = try allocUpperString(gpa, input);
    defer gpa.free(uppered);

    try std.testing.expectEqualStrings("ABCDEFGHIJKLMNOPQRST0234+💩!", uppered);
}

eqlIgnoreCase()

Compares strings a and b case-insensitively and returns whether they are equal.

/// Reports whether `a` and `b` are equal when compared case-insensitively.
/// Slices of different lengths are never equal; otherwise the bytes are
/// compared pairwise after passing each through `toLower`.
pub fn eqlIgnoreCase(a: []const u8, b: []const u8) bool {
    if (a.len != b.len) return false;
    for (a, b) |a_c, b_c| {
        if (toLower(a_c) != toLower(b_c)) return false;
    }
    return true;
}

Test:

eqlIgnoreCase

test "eqlIgnoreCase" {
    const expect = std.testing.expect;

    // Equal apart from case; the non-ASCII bytes match exactly.
    try expect(eqlIgnoreCase("HEl💩Lo!", "hel💩lo!"));
    // Different lengths.
    try expect(!eqlIgnoreCase("hElLo!", "hello! "));
    // Same length, one differing letter.
    try expect(!eqlIgnoreCase("hElLo!", "helro!"));
}

startsWithIgnoreCase()

/// Reports whether `haystack` begins with `needle`, comparing with
/// `eqlIgnoreCase`. A needle longer than the haystack never matches.
pub fn startsWithIgnoreCase(haystack: []const u8, needle: []const u8) bool {
    if (needle.len > haystack.len) return false;
    const prefix = haystack[0..needle.len];
    return eqlIgnoreCase(prefix, needle);
}

Test:

startsWithIgnoreCase

test "startsWithIgnoreCase" {
    const expect = std.testing.expect;

    try expect(startsWithIgnoreCase("boB", "Bo"));
    // The needle occurs in the haystack, but not at the start.
    try expect(!startsWithIgnoreCase("Needle in hAyStAcK", "haystack"));
}

endsWithIgnoreCase()

/// Reports whether `haystack` ends with `needle`, comparing with
/// `eqlIgnoreCase`. A needle longer than the haystack never matches.
pub fn endsWithIgnoreCase(haystack: []const u8, needle: []const u8) bool {
    if (needle.len > haystack.len) return false;
    const suffix = haystack[haystack.len - needle.len ..];
    return eqlIgnoreCase(suffix, needle);
}

Test:

endsWithIgnoreCase

test "endsWithIgnoreCase" {
    const expect = std.testing.expect;

    try expect(endsWithIgnoreCase("Needle in HaYsTaCk", "haystack"));
    // The needle occurs in the haystack, but not at the end.
    try expect(!endsWithIgnoreCase("BoB", "Bo"));
}

indexOfIgnoreCase()

Finds needle in haystack, ignoring case, starting at index 0.

/// Searches `haystack` for `needle`, ignoring case, beginning at index 0.
/// Delegates to `indexOfIgnoreCasePos` and returns its result.
pub fn indexOfIgnoreCase(haystack: []const u8, needle: []const u8) ?usize {
    const search_from: usize = 0;
    return indexOfIgnoreCasePos(haystack, search_from, needle);
}

indexOfIgnoreCasePos()

Finds needle in haystack, ignoring case, starting at start_index. Uses Boyer-Moore-Horspool algorithm on large inputs; indexOfIgnoreCasePosLinear on small inputs.

/// Searches `haystack` for `needle`, ignoring case, beginning at `start_index`.
/// Returns the index of the first match found, or `null` if there is none.
///
/// A needle longer than the haystack yields `null`; otherwise an empty needle
/// yields `start_index` unchanged.
///
/// Uses the Boyer-Moore-Horspool algorithm on large inputs and
/// `indexOfIgnoreCasePosLinear` on small ones (haystack shorter than 52 bytes
/// or needle of at most 4 bytes).
pub fn indexOfIgnoreCasePos(haystack: []const u8, start_index: usize, needle: []const u8) ?usize {
    if (needle.len > haystack.len) return null;
    if (needle.len == 0) return start_index;

    const use_linear_scan = haystack.len < 52 or needle.len <= 4;
    if (use_linear_scan) {
        return indexOfIgnoreCasePosLinear(haystack, start_index, needle);
    }

    var shifts: [256]usize = undefined;
    boyerMooreHorspoolPreprocessIgnoreCase(needle, &shifts);

    // Highest index at which a window of needle.len bytes still fits.
    const last_start = haystack.len - needle.len;
    var pos: usize = start_index;
    while (pos <= last_start) {
        const window = haystack[pos..][0..needle.len];
        if (eqlIgnoreCase(window, needle)) return pos;
        // Advance by the shift recorded for the (lowercased) byte that is
        // aligned with the last position of the needle.
        pos += shifts[toLower(window[needle.len - 1])];
    }

    return null;
}

indexOfIgnoreCasePosLinear()

Finds needle in haystack, ignoring case, starting at start_index, using a simple linear scan. Consider using indexOfIgnoreCasePos instead, which automatically uses a more sophisticated algorithm on larger inputs.

/// Searches `haystack` for `needle`, ignoring case, beginning at `start_index`,
/// by testing every candidate position in turn with `eqlIgnoreCase`.
/// Returns the index of the first match, or `null` if there is none.
///
/// Consider using `indexOfIgnoreCasePos` instead of this, which will
/// automatically use a more sophisticated algorithm on larger inputs.
pub fn indexOfIgnoreCasePosLinear(haystack: []const u8, start_index: usize, needle: []const u8) ?usize {
    // This function is public and may be called directly, so it cannot rely on
    // a caller having checked the lengths. Without this guard the subtraction
    // below would overflow when the needle is longer than the haystack.
    if (needle.len > haystack.len) return null;

    var i: usize = start_index;
    // Last index at which a window of needle.len bytes still fits.
    const end = haystack.len - needle.len;
    while (i <= end) : (i += 1) {
        if (eqlIgnoreCase(haystack[i .. i + needle.len], needle)) return i;
    }
    return null;
}

/// Fills `table` with the Boyer-Moore-Horspool skip distances for `pattern`,
/// indexed by lowercased byte value. Bytes that do not occur in the pattern
/// (ignoring its last byte) get a skip of `pattern.len`.
fn boyerMooreHorspoolPreprocessIgnoreCase(pattern: []const u8, table: *[256]usize) void {
    @memset(table, pattern.len);

    // The last item is intentionally ignored and the skip size will be pattern.len.
    // This is the standard way Boyer-Moore-Horspool is implemented.
    const last = pattern.len - 1;
    for (pattern[0..last], 0..) |byte, idx| {
        table[toLower(byte)] = last - idx;
    }
}

Test:

indexOfIgnoreCase

test "indexOfIgnoreCase" {
    const expectEqual = std.testing.expectEqual;
    const not_found: ?usize = null;

    // Short inputs.
    try expectEqual(@as(?usize, 14), indexOfIgnoreCase("one Two Three Four", "foUr"));
    try expectEqual(not_found, indexOfIgnoreCase("one two three FouR", "gOur"));
    try expectEqual(@as(?usize, 0), indexOfIgnoreCase("foO", "Foo"));
    try expectEqual(not_found, indexOfIgnoreCase("foo", "fool"));
    try expectEqual(@as(?usize, 0), indexOfIgnoreCase("FOO foo", "fOo"));

    // Longer haystack and needle.
    const long_haystack = "one two three four five six seven eight nine ten eleven";
    try expectEqual(@as(?usize, 8), indexOfIgnoreCase(long_haystack, "ThReE fOUr"));
    try expectEqual(not_found, indexOfIgnoreCase(long_haystack, "Two tWo"));
}

orderIgnoreCase()

Returns the lexicographical order of two slices, ignoring case. O(n).

/// Returns the lexicographical order of `lhs` relative to `rhs`, comparing
/// bytes after passing each through `toLower`. When one slice is a
/// case-insensitive prefix of the other, the shorter slice orders first.
/// O(n).
pub fn orderIgnoreCase(lhs: []const u8, rhs: []const u8) std.math.Order {
    const shared_len = @min(lhs.len, rhs.len);
    for (lhs[0..shared_len], rhs[0..shared_len]) |l, r| {
        const ord = std.math.order(toLower(l), toLower(r));
        if (ord != .eq) return ord;
    }
    // The common prefix is equal, so the lengths decide.
    return std.math.order(lhs.len, rhs.len);
}

lessThanIgnoreCase()

Returns whether lhs is lexicographically lower than rhs, ignoring case.

/// Reports whether `lhs` orders strictly before `rhs` according to
/// `orderIgnoreCase`.
pub fn lessThanIgnoreCase(lhs: []const u8, rhs: []const u8) bool {
    return switch (orderIgnoreCase(lhs, rhs)) {
        .lt => true,
        .eq, .gt => false,
    };
}