From c4e1ef3acca807d30fb28356565b09c4d9bd2d96 Mon Sep 17 00:00:00 2001 From: Runxi Yu Date: Wed, 17 Sep 2025 03:48:56 +0800 Subject: [PATCH] Add bloom filters --- ds/set/README | 4 ++++ ds/set/bloom/README | 1 + ds/set/bloom/add.ha | 14 ++++++++++++++ ds/set/bloom/contains.ha | 18 ++++++++++++++++++ ds/set/bloom/finish.ha | 8 ++++++++ ds/set/bloom/internal.ha | 4 ++++ ds/set/bloom/new.ha | 41 +++++++++++++++++++++++++++++++++++++++++ ds/set/bloom/set.ha | 26 ++++++++++++++++++++++++++ ds/set/bloom_fnv/README | 1 + ds/set/bloom_fnv/add.ha | 9 +++++++++ ds/set/bloom_fnv/contains.ha | 9 +++++++++ ds/set/bloom_fnv/finish.ha | 10 ++++++++++ ds/set/bloom_fnv/internal.ha | 11 +++++++++++ ds/set/bloom_fnv/new.ha | 32 ++++++++++++++++++++++++++++++++ ds/set/bloom_fnv/set.ha | 22 ++++++++++++++++++++++ ds/set/bloom_fnv/test.ha | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++ ds/set/bloom_siphash/README | 1 + ds/set/bloom_siphash/add.ha | 9 +++++++++ ds/set/bloom_siphash/contains.ha | 9 +++++++++ ds/set/bloom_siphash/finish.ha | 11 +++++++++++ ds/set/bloom_siphash/internal.ha | 19 +++++++++++++++++++ ds/set/bloom_siphash/new.ha | 42 ++++++++++++++++++++++++++++++++++++++++++ ds/set/bloom_siphash/set.ha | 23 +++++++++++++++++++++++ ds/set/bloom_siphash/test.ha | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++ ds/set/set.ha | 39 +++++++++++++++++++++++++++++++++++++++ diff --git a/ds/set/README b/ds/set/README new file mode 100644 index 0000000000000000000000000000000000000000..da7771c3f7772a306ea6609800c107a72c4febe2 --- /dev/null +++ b/ds/set/README @@ -0,0 +1,4 @@ +set: general-purpose set data structures + +You should create a set with the `new` function in each submodule. +Then, you may use functions defined in this module to manipulate the set. 
diff --git a/ds/set/bloom/README b/ds/set/bloom/README
new file mode 100644
index 0000000000000000000000000000000000000000..fbfa76d663a89e1737fb5f3c60bc6157287f2cff
--- /dev/null
+++ b/ds/set/bloom/README
@@ -0,0 +1 @@
+bloom: Bloom filter set
diff --git a/ds/set/bloom/add.ha b/ds/set/bloom/add.ha
new file mode 100644
index 0000000000000000000000000000000000000000..1959a9f5e2da379382545b1ca1587848d38cf41f
--- /dev/null
+++ b/ds/set/bloom/add.ha
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+// Adds an item to a [[set]] by setting its k probe bits. Nothing is allocated
+// here; the result type simply mirrors the adder interface of ds::set.
+//
+// Probe i targets bit (h + i * step) % m, where step is derived from the
+// key's own hash (double hashing). A stride shared by every key would make
+// all k probes a function of the first, so extra probes would add no
+// information. [[contains]] must walk the identical sequence.
+export fn add(s: *set, key: []u8) (void | nomem) = {
+	let acc = s.hash64(s.hash_params, key): u64;
+	// Scramble the hash and fold its high half into the low half so keys
+	// that share acc % m still get different strides; the low bit is set
+	// so the stride is never zero.
+	const mixed = acc * STEP;
+	const step = (mixed ^ (mixed >> 32u64)) | 1u64;
+	for (let i = 0z; i < s.k; i += 1) {
+		let pos = (acc % (s.m: u64)): size;
+		let byte = pos / 8;
+		let bit = pos % 8;
+		s.bits[byte] |= (1u8 << (bit: u8));
+		acc += step;
+	};
+};
diff --git a/ds/set/bloom/contains.ha b/ds/set/bloom/contains.ha
new file mode 100644
index 0000000000000000000000000000000000000000..b1bf66208abbaab41cb7a2084e4c2425a78cc855
--- /dev/null
+++ b/ds/set/bloom/contains.ha
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+// Tests whether an item may be present in a [[set]]. A false result is
+// definitive. A true result can be a false positive, because other keys may
+// have set the same bits.
+//
+// The probe sequence is the one used by [[add]]: bit (h + i * step) % m for
+// each of the k probes, with step derived from the key's own hash.
+export fn contains(s: *set, key: []u8) bool = {
+	let acc = s.hash64(s.hash_params, key): u64;
+	// Per-key stride; this derivation has to match add exactly.
+	const mixed = acc * STEP;
+	const step = (mixed ^ (mixed >> 32u64)) | 1u64;
+	for (let i = 0z; i < s.k; i += 1) {
+		let pos = (acc % (s.m: u64)): size;
+		let byte = pos / 8;
+		let bit = pos % 8;
+		let mask = 1u8 << (bit: u8);
+		if ((s.bits[byte] & mask) == 0) {
+			// One clear bit proves the key was never added.
+			return false;
+		};
+		acc += step;
+	};
+	return true;
+};
diff --git a/ds/set/bloom/finish.ha b/ds/set/bloom/finish.ha
new file mode 100644
index 0000000000000000000000000000000000000000..ddd2f1b9fbcbecf5cc547fe1ec1973c863212a6d
--- /dev/null
+++ b/ds/set/bloom/finish.ha
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+// Frees resources associated with a [[set]].
+export fn finish(s: *set) void = {
+	free(s.bits);
+	free(s); // the header itself was heap-allocated by new
+};
diff --git a/ds/set/bloom/internal.ha b/ds/set/bloom/internal.ha
new file mode 100644
index 0000000000000000000000000000000000000000..65f5cb17c67d826972635b829936180cd1507c8c
--- /dev/null
+++ b/ds/set/bloom/internal.ha
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+// floor(2^64 / golden ratio), an odd 64-bit constant. Used to spread out the
+// probe positions that are derived from a key's hash.
+const STEP: u64 = 0x9E3779B97F4A7C15u64;
diff --git a/ds/set/bloom/new.ha b/ds/set/bloom/new.ha
new file mode 100644
index 0000000000000000000000000000000000000000..f163902520740aa7ab18d06a3e3bc408c656ed6a
--- /dev/null
+++ b/ds/set/bloom/new.ha
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use errors;
+use ds::set;
+
+// Creates a new [[set]] with the given number of bits and hash functions.
+//
+// m controls how many bits are available in the filter. k controls how many
+// hash probes are used per element. Both must be greater than zero, otherwise
+// errors::invalid is returned.
+//
+// hash64 is called with hash_params and the key by [[add]] and [[contains]].
+// hash_params is stored as given and is never freed by this module.
+//
+// The returned set is heap-allocated; release it with [[finish]].
+export fn new(
+	m: size,
+	k: size,
+	hash64: *fn(hash_params: nullable *opaque, key: []u8) size,
+	hash_params: nullable *opaque,
+) (*set | errors::invalid | nomem) = {
+	if (m == 0 || k == 0) {
+		return errors::invalid;
+	};
+
+	// Round the bit count up to whole bytes. This avoids computing m + 7,
+	// which wraps around for m near the maximum size and would produce a
+	// buffer far smaller than the m bits that add and contains index.
+	let nbytes = m / 8;
+	if (m % 8 != 0) {
+		nbytes += 1;
+	};
+	let bits = match (alloc([0u8...], nbytes)) {
+	case let b: []u8 => yield b;
+	case nomem => return nomem;
+	};
+
+	let s = match (alloc(set {
+		vt = &_vt,
+		bits = bits,
+		m = m,
+		k = k,
+		hash64 = hash64,
+		hash_params = hash_params,
+	})) {
+	case let sp: *set => yield sp;
+	case nomem =>
+		// The header allocation failed; do not leak the bit array.
+		free(bits);
+		return nomem;
+	};
+	return s;
+};
diff --git a/ds/set/bloom/set.ha b/ds/set/bloom/set.ha
new file mode 100644
index 0000000000000000000000000000000000000000..9ce65962707a950f4f2239610c6465dbdda00e58
--- /dev/null
+++ b/ds/set/bloom/set.ha
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use ds::set;
+
+// A Bloom filter set from byte strings to membership bits.
+//
+// You are advised to create these with [[new]].
+export type set = struct {
+	vt: set::set, // must stay first: the vt_* shims cast *set::set to *set
+	bits: []u8, // bit array of ceil(m / 8) bytes, allocated by new
+	m: size, // number of usable bits; probe positions are taken modulo m
+	k: size, // number of probes per key
+	hash64: *fn(hash_params: nullable *opaque, key: []u8) size, // key hash
+	hash_params: nullable *opaque, // handed to hash64 on every call
+};
+
+const _vt: set::vtable = set::vtable {
+	adder = &vt_add,
+	tester = &vt_contains,
+	finisher = &vt_finish,
+};
+// Shims that adapt the concrete functions to the ds::set vtable signatures.
+fn vt_add(s: *set::set, key: []u8) (void | nomem) = add(s: *set, key);
+fn vt_contains(s: *set::set, key: []u8) bool = contains(s: *set, key);
+fn vt_finish(s: *set::set) void = finish(s: *set);
diff --git a/ds/set/bloom_fnv/README b/ds/set/bloom_fnv/README
new file mode 100644
index 0000000000000000000000000000000000000000..c0385bcd966ceaf47d76d4c590b5431e368df0da
--- /dev/null
+++ b/ds/set/bloom_fnv/README
@@ -0,0 +1 @@
+bloom_fnv: FNV Bloom filter set
diff --git a/ds/set/bloom_fnv/add.ha b/ds/set/bloom_fnv/add.ha
new file mode 100644
index 0000000000000000000000000000000000000000..6bd7d02b8ce529e8873be87b1bb1c0b4543024b9
--- /dev/null
+++ b/ds/set/bloom_fnv/add.ha
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use ds::set;
+
+// Adds an item to a [[set]] by forwarding to the wrapped Bloom filter.
+export fn add(s: *set, key: []u8) (void | nomem) = {
+	return set::add(s.inner, key);
+};
diff --git a/ds/set/bloom_fnv/contains.ha b/ds/set/bloom_fnv/contains.ha
new file mode 100644
index 0000000000000000000000000000000000000000..0ef82a5fae069e1c000ccfeafcd117c48aa0d1dd
--- /dev/null
+++ b/ds/set/bloom_fnv/contains.ha
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use ds::set;
+
+// Tests whether an item may be present in a [[set]].
+export fn contains(s: *set, key: []u8) bool = {
+	return set::contains(s.inner, key); // dispatches via inner's vtable
+};
diff --git a/ds/set/bloom_fnv/finish.ha b/ds/set/bloom_fnv/finish.ha
new file mode 100644
index 0000000000000000000000000000000000000000..2f39786b5d747e896058b3971faf49946937d184
--- /dev/null
+++ b/ds/set/bloom_fnv/finish.ha
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use ds::set;
+
+// Frees the wrapped Bloom filter and then the [[set]] itself.
+export fn finish(s: *set) void = {
+	set::finish(s.inner);
+	free(s);
+};
diff --git a/ds/set/bloom_fnv/internal.ha b/ds/set/bloom_fnv/internal.ha
new file mode 100644
index 0000000000000000000000000000000000000000..6c6a1ce430ff9ba479427e2f3f0da13ceae22241
--- /dev/null
+++ b/ds/set/bloom_fnv/internal.ha
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use hash;
+use hash::fnv;
+
+fn hash64(_params: nullable *opaque, key: []u8) size = { // _params is unused
+	let h = fnv::fnv64a(); // 64-bit FNV-1a state
+	hash::write(&h, key);
+	return fnv::sum64(&h): size; // cast to the type bloom::new expects
+};
diff --git a/ds/set/bloom_fnv/new.ha b/ds/set/bloom_fnv/new.ha
new file mode 100644
index 0000000000000000000000000000000000000000..edf900aeb419cb2cf9d49d99199d72690806b6b1
--- /dev/null
+++ b/ds/set/bloom_fnv/new.ha
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use errors;
+use ds::set;
+use ds::set::bloom;
+
+// Creates a new FNV-hashed [[set]] with m bits and k probes per key.
+export fn new(
+	m: size,
+	k: size,
+) (*set | errors::invalid | nomem) = {
+	let inner = match (bloom::new(m, k, &hash64, null)) { // no hash params
+	case let bs: *bloom::set =>
+		yield (bs: *set::set);
+	case errors::invalid =>
+		return errors::invalid;
+	case nomem =>
+		return nomem;
+	};
+
+	let s = match (alloc(set {
+		vt = &_vt,
+		inner = inner,
+	})) {
+	case let sp: *set => yield sp;
+	case nomem =>
+		set::finish(inner); // do not leak the inner filter
+		return nomem;
+	};
+	return s;
+};
diff --git a/ds/set/bloom_fnv/set.ha b/ds/set/bloom_fnv/set.ha
new file mode 100644
index 0000000000000000000000000000000000000000..cb5ba0234dafe6636fb6e5006c9318f2b0e7fffa
--- /dev/null
+++ b/ds/set/bloom_fnv/set.ha
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use ds::set;
+
+// A Bloom filter set using FNV for hashing.
+//
+// You are advised to create these with [[new]].
+export type set = struct {
+	vt: set::set, // must stay first: the vt_* shims cast *set::set to *set
+	inner: *set::set, // wrapped bloom filter; released by finish
+};
+
+const _vt: set::vtable = set::vtable {
+	adder = &vt_add,
+	tester = &vt_contains,
+	finisher = &vt_finish,
+};
+// Shims that adapt the concrete functions to the ds::set vtable signatures.
+fn vt_add(s: *set::set, key: []u8) (void | nomem) = add(s: *set, key);
+fn vt_contains(s: *set::set, key: []u8) bool = contains(s: *set, key);
+fn vt_finish(s: *set::set) void = finish(s: *set);
diff --git a/ds/set/bloom_fnv/test.ha b/ds/set/bloom_fnv/test.ha
new file mode 100644
index 0000000000000000000000000000000000000000..d0420a68d2f9622a7e0aeadeb01f3712e13ae6f9
--- /dev/null
+++ b/ds/set/bloom_fnv/test.ha
@@ -0,0 +1,56 @@
+use ds::set;
+use errors;
+
+fn put_le64(dst: *[8]u8, v: u64) []u8 = { // writes v into dst, low byte first
+	for (let i = 0z; i < 8z; i += 1) {
+		dst[i] = ((v >> (8u64 * (i: u64))) & 0xFFu64): u8;
+	};
+	return dst[..]; // a slice borrowing dst, not a copy
+};
+
+@test fn invalid() void = {
+	match (new(0, 1)) { // zero bits must be rejected
+	case errors::invalid => void;
+	case *set => abort("bloom_fnv: accepted m=0");
+	case nomem => abort("bloom_fnv: nomem for m=0");
+	};
+	match (new(64, 0)) { // zero probes must be rejected
+	case errors::invalid => void;
+	case *set => abort("bloom_fnv: accepted k=0");
+	case nomem => abort("bloom_fnv: nomem for k=0");
+	};
+};
+
+@test fn test() void = {
+	const ms: [2]size = [256z, 512z];
+	const ks: [2]size = [2z, 3z];
+	let buf: [8]u8 = [0...]; // scratch space reused for every encoded key
+	const inserted: [4]u64 = [1u64, 5u64, 21u64, 45u64]; // keys to add
+	const missing: [3]u64 = [2u64, 7u64, 88u64]; // keys never added
+
+	for (let mi = 0z; mi < len(ms); mi += 1) {
+		for (let ki = 0z; ki < len(ks); ki += 1) {
+			let s = match (new(ms[mi], ks[ki])) {
+			case let sp: *set => yield sp;
+			case errors::invalid => abort("bloom_fnv: invalid parameters");
+			case nomem => abort("bloom_fnv: nomem");
+			};
+			defer finish(s);
+			let iface: *set::set = (s: *set::set); // drive it via the vtable
+
+			for (let i = 0z; i < len(inserted); i += 1) {
+				let key = put_le64(&buf, inserted[i]);
+				match (set::add(iface, key)) {
+				case void => void;
+				case nomem => abort("bloom_fnv: add nomem");
+				};
+				assert(set::contains(iface, key), "bloom_fnv: contains after add");
+			};
+			// Never-added keys; FNV is unkeyed, so this check is repeatable.
+			for (let i = 0z; i < len(missing); i += 1) {
+				let key = put_le64(&buf, missing[i]);
+				assert(!set::contains(iface, key), "bloom_fnv: false positive");
+			};
+		};
+	};
+};
diff --git a/ds/set/bloom_siphash/README b/ds/set/bloom_siphash/README
new file mode 100644
index 0000000000000000000000000000000000000000..ef3fcbc867c8ab041b299955ac0a75afcad84bdd
--- /dev/null
+++ b/ds/set/bloom_siphash/README
@@ -0,0 +1 @@
+bloom_siphash: SipHash Bloom filter set
diff --git a/ds/set/bloom_siphash/add.ha b/ds/set/bloom_siphash/add.ha
new file mode 100644
index 0000000000000000000000000000000000000000..6bd7d02b8ce529e8873be87b1bb1c0b4543024b9
--- /dev/null
+++ b/ds/set/bloom_siphash/add.ha
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use ds::set;
+
+// Adds an item to a [[set]].
+export fn add(s: *set, key: []u8) (void | nomem) = {
+	return set::add(s.inner, key); // dispatches via inner's vtable
+};
diff --git a/ds/set/bloom_siphash/contains.ha b/ds/set/bloom_siphash/contains.ha
new file mode 100644
index 0000000000000000000000000000000000000000..0ef82a5fae069e1c000ccfeafcd117c48aa0d1dd
--- /dev/null
+++ b/ds/set/bloom_siphash/contains.ha
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use ds::set;
+
+// Tests whether an item may be present in a [[set]].
+export fn contains(s: *set, key: []u8) bool = {
+	return set::contains(s.inner, key); // dispatches via inner's vtable
+};
diff --git a/ds/set/bloom_siphash/finish.ha b/ds/set/bloom_siphash/finish.ha
new file mode 100644
index 0000000000000000000000000000000000000000..c3187d2de640f50a5d6e62dc7ffe1e649bcbf9c0
--- /dev/null
+++ b/ds/set/bloom_siphash/finish.ha
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use ds::set;
+
+// Frees the wrapped Bloom filter, the boxed key, and the [[set]] itself.
+export fn finish(s: *set) void = {
+	set::finish(s.inner);
+	free(s.key); // freed after inner, whose hash_params points at this key
+	free(s);
+};
diff --git a/ds/set/bloom_siphash/internal.ha b/ds/set/bloom_siphash/internal.ha
new file mode 100644
index 0000000000000000000000000000000000000000..c7d1ce593d71d02743a2fa99455d5a1739f4d676
--- /dev/null
+++ b/ds/set/bloom_siphash/internal.ha
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use hash;
+use hash::siphash;
+
+fn hash64(params: nullable *opaque, key: []u8) size = {
+	let keyptr = match (params) {
+	case null =>
+		abort("ds::set::bloom_siphash: missing key");
+	case let p: *opaque =>
+		yield (p: *[16]u8); // params is the 16-byte key boxed by new
+	};
+
+	let h = siphash::siphash(2, 4, keyptr); // SipHash-2-4
+	defer hash::close(&h);
+	hash::write(&h, key);
+	return siphash::sum(&h): size; // cast to the type bloom::new expects
+};
diff --git a/ds/set/bloom_siphash/new.ha b/ds/set/bloom_siphash/new.ha
new file mode 100644
index 0000000000000000000000000000000000000000..988408268fc5226cf060bba22cc1126b21914837
--- /dev/null
+++ b/ds/set/bloom_siphash/new.ha
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use errors;
+use ds::set;
+use ds::set::bloom;
+
+// Creates a new [[set]] with m bits, k probes and its own copy of siphash_key.
+export fn new(
+	m: size,
+	k: size,
+	siphash_key: [16]u8,
+) (*set | errors::invalid | nomem) = {
+	let keybox = match (alloc(siphash_key)) { // heap copy of the key
+	case let kp: *[16]u8 => yield kp;
+	case nomem => return nomem;
+	};
+
+	let inner = match (bloom::new(m, k, &hash64, (keybox: *opaque))) {
+	case let bs: *bloom::set =>
+		yield (bs: *set::set);
+	case errors::invalid =>
+		free(keybox); // nothing else owns the key yet
+		return errors::invalid;
+	case nomem =>
+		free(keybox);
+		return nomem;
+	};
+
+	let s = match (alloc(set {
+		vt = &_vt,
+		inner = inner,
+		key = keybox,
+	})) {
+	case let sp: *set => yield sp;
+	case nomem =>
+		set::finish(inner); // unwind both earlier allocations
+		free(keybox);
+		return nomem;
+	};
+	return s;
+};
diff --git a/ds/set/bloom_siphash/set.ha b/ds/set/bloom_siphash/set.ha
new file mode 100644
index 0000000000000000000000000000000000000000..59874fdb1e06f1d6be67986726b182e87d80b29f
--- /dev/null
+++ b/ds/set/bloom_siphash/set.ha
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+use ds::set;
+
+// A Bloom filter set using SipHash for hashing.
+//
+// You are advised to create these with [[new]].
+export type set = struct {
+	vt: set::set, // must stay first: the vt_* shims cast *set::set to *set
+	inner: *set::set, // wrapped bloom filter; released by finish
+	key: *[16]u8, // heap copy of the SipHash key, also inner's hash_params
+};
+
+const _vt: set::vtable = set::vtable {
+	adder = &vt_add,
+	tester = &vt_contains,
+	finisher = &vt_finish,
+};
+
+fn vt_add(s: *set::set, key: []u8) (void | nomem) = add(s: *set, key);
+fn vt_contains(s: *set::set, key: []u8) bool = contains(s: *set, key);
+fn vt_finish(s: *set::set) void = finish(s: *set);
diff --git a/ds/set/bloom_siphash/test.ha b/ds/set/bloom_siphash/test.ha
new file mode 100644
index 0000000000000000000000000000000000000000..00d5787bccdf3f54627d1945fdca4ec1c60c2238
--- /dev/null
+++ b/ds/set/bloom_siphash/test.ha
@@ -0,0 +1,77 @@
+use ds::set;
+use errors;
+
+fn put_le64(dst: *[8]u8, v: u64) []u8 = {
+	for (let i = 0z; i < 8z; i += 1) {
+		dst[i] = ((v >> (8u64 * (i: u64))) & 0xFFu64): u8;
+	};
+	return dst[..];
+};
+
+@test fn invalid() void = {
+	let key: [16]u8 = [0...];
+	match (new(0, 1, key)) {
+	case errors::invalid => void;
+	case *set => abort("bloom_siphash: accepted m=0");
+	case nomem => abort("bloom_siphash: nomem for m=0");
+	};
+	match (new(64, 0, key)) {
+	case errors::invalid => void;
+	case *set => abort("bloom_siphash: accepted k=0");
+	case nomem => abort("bloom_siphash: nomem for k=0");
+	};
+};
+
+@test fn test() void = {
+	// Fixed keys keep every run identical. With random keys, whether a
+	// never-added key lands on already-set bits would change per run.
+	let key1: [16]u8 = [
+		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+	];
+	let key2: [16]u8 = [0xa5...];
+	const keys: [2]*[16]u8 = [&key1, &key2];
+	const ms: [2]size = [256z, 512z];
+	const ks: [2]size = [2z, 3z];
+
+	let buf: [8]u8 = [0...];
+	const inserted: [4]u64 = [4u64, 12u64, 30u64, 102u64];
+	const missing: [3]u64 = [3u64, 7u64, 55u64];
+
+	for (let mi = 0z; mi < len(ms); mi += 1) {
+		for (let ki = 0z; ki < len(ks); ki += 1) {
+			for (let keyi = 0z; keyi < len(keys); keyi += 1) {
+				let s = match (new(ms[mi], ks[ki], *keys[keyi])) {
+				case let sp: *set => yield sp;
+				case errors::invalid => abort("bloom_siphash: invalid parameters");
+				case nomem => abort("bloom_siphash: nomem");
+				};
+				defer finish(s);
+				let iface: *set::set = (s: *set::set);
+
+				// Added keys must always be found: a Bloom filter
+				// has no false negatives.
+				for (let i = 0z; i < len(inserted); i += 1) {
+					let key = put_le64(&buf, inserted[i]);
+					match (set::add(iface, key)) {
+					case void => void;
+					case nomem => abort("bloom_siphash: add nomem");
+					};
+					assert(set::contains(iface, key), "bloom_siphash: contains after add");
+				};
+
+				// Absent keys may legitimately be reported present,
+				// so one hit is not a failure. Only reject a filter
+				// that claims to contain every absent key.
+				let hits = 0z;
+				for (let i = 0z; i < len(missing); i += 1) {
+					let key = put_le64(&buf, missing[i]);
+					if (set::contains(iface, key)) {
+						hits += 1;
+					};
+				};
+				assert(hits < len(missing), "bloom_siphash: every absent key reported present");
+			};
+		};
+	};
+};
diff --git a/ds/set/set.ha b/ds/set/set.ha
new file mode 100644
index 0000000000000000000000000000000000000000..ba95c80421a79f200809dc96085886dc44ca338b
--- /dev/null
+++ b/ds/set/set.ha
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MPL-2.0
+// SPDX-FileCopyrightText: 2025 Runxi Yu
+
+// A set is a pointer to a [[vtable]] which allows for set types to implement
+// common operations.
+export type set = *vtable;
+
+// The vtable type defines a set of virtual functions for a [[set]].
+export type vtable = struct {
+	adder: *adder,
+	tester: *tester,
+	finisher: *finisher,
+};
+
+// The interface for a set which could be used to add values. Returns void on
+// success or nomem if memory allocation failed.
+export type adder = fn(s: *set, key: []u8) (void | nomem);
+
+// Adds an item to a [[set]].
+export fn add(s: *set, key: []u8) (void | nomem) = {
+	return s.adder(s, key);
+};
+
+// The interface for a set which could be used to test membership. Returns true
+// if the item may be present, false otherwise.
+export type tester = fn(s: *set, key: []u8) bool;
+
+// Tests whether an item is present in a [[set]].
+export fn contains(s: *set, key: []u8) bool = {
+	return s.tester(s, key);
+};
+
+// The interface for a set which requires a finisher function to free it.
+export type finisher = fn(s: *set) void;
+
+// Frees the set and all of its resources.
+export fn finish(s: *set) void = {
+	s.finisher(s);
+};
-- 
2.48.1