Make Utf8CStr a first class citizen in C++ codebase

Utf8CStr is in many cases a better string view class than
std::string_view, because it provides "view" access to a string buffer
that is guaranteed to be null-terminated. It also has the additional
benefit of being UTF-8 verified, and it can seamlessly cross FFI boundaries.

We want to start using Utf8CStr more widely in our existing C++ codebase.
This commit is contained in:
topjohnwu
2025-08-25 14:53:49 -07:00
committed by John Wu
parent e2abb648ac
commit 2e4fa6864c
25 changed files with 105 additions and 124 deletions

View File

@@ -273,7 +273,7 @@ static int fmt_and_log_with_rs(LogLevel level, const char *fmt, va_list ap) {
buf[0] = '\0';
// Fortify logs when a fatal error occurs. Do not run through fortify again
int len = std::min(__call_bypassing_fortify(vsnprintf)(buf, sz, fmt, ap), sz - 1);
log_with_rs(level, rust::Utf8CStr(buf, len + 1));
log_with_rs(level, Utf8CStr(buf, len + 1));
return len;
}
@@ -415,18 +415,18 @@ string resolve_preinit_dir(const char *base_dir) {
// FFI for Utf8CStr
extern "C" void cxx$utf8str$new(rust::Utf8CStr *self, const void *s, size_t len);
extern "C" const char *cxx$utf8str$ptr(const rust::Utf8CStr *self);
extern "C" size_t cxx$utf8str$len(const rust::Utf8CStr *self);
extern "C" void cxx$utf8str$new(Utf8CStr *self, const void *s, size_t len);
extern "C" const char *cxx$utf8str$ptr(const Utf8CStr *self);
extern "C" size_t cxx$utf8str$len(const Utf8CStr *self);
rust::Utf8CStr::Utf8CStr(const char *s, size_t len) {
Utf8CStr::Utf8CStr(const char *s, size_t len) {
cxx$utf8str$new(this, s, len);
}
const char *rust::Utf8CStr::data() const {
const char *Utf8CStr::data() const {
return cxx$utf8str$ptr(this);
}
size_t rust::Utf8CStr::length() const {
size_t Utf8CStr::length() const {
return cxx$utf8str$len(this);
}

View File

@@ -369,7 +369,7 @@ impl AsRef<Utf8CStr> for Utf8CStr {
// Notice that we only implement ExternType on Utf8CStr *reference*
unsafe impl ExternType for &Utf8CStr {
type Id = type_id!("rust::Utf8CStr");
type Id = type_id!("Utf8CStr");
type Kind = cxx::kind::Trivial;
}

View File

@@ -9,7 +9,7 @@ use std::os::fd::{BorrowedFd, FromRawFd, OwnedFd, RawFd};
use cfg_if::cfg_if;
use libc::{O_RDONLY, c_char, mode_t};
use crate::ffi::{FnBoolStrStr, FnBoolString};
use crate::ffi::{FnBoolStr, FnBoolStrStr};
use crate::files::map_file_at;
pub(crate) use crate::xwrap::*;
use crate::{
@@ -183,7 +183,7 @@ pub(crate) fn parse_prop_file_rs(name: &Utf8CStr, f: &FnBoolStrStr) {
}
}
pub(crate) fn file_readline_rs(fd: RawFd, f: &FnBoolString) {
pub(crate) fn file_readline_for_cxx(fd: RawFd, f: &FnBoolStr) {
let mut fd = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) });
BufReader::new(fd.deref_mut()).for_each_line(|line| f.call(line));
BufReader::new(fd.deref_mut()).for_each_line(|line| f.call(Utf8CStr::from_string(line)));
}

View File

@@ -38,22 +38,18 @@ std::string full_read(const char *filename);
void write_zero(int fd, size_t size);
std::string resolve_preinit_dir(const char *base_dir);
// Functor = function<bool(string_view)>
template <typename Functor>
void file_readline(int fd, Functor &&fn) {
file_readline_rs(fd, [&](rust::String &line) -> bool {
return fn(std::string_view(line.c_str(), line.size()));
});
}
// Functor = function<bool(string_view, string_view)>
// Functor = function<bool(Utf8CStr, Utf8CStr)>
template <typename Functor>
void parse_prop_file(const char *file, Functor &&fn) {
parse_prop_file_rs(file, [&](rust::Str key, rust::Str val) -> bool {
// Null terminate all strings
// We perform the null termination here in C++ because it's very difficult to do it
// right in Rust due to pointer provenance. Trying to dereference a pointer without
// the correct provenance in Rust, even in unsafe code, is undefined behavior.
// However on the C++ side, there are fewer restrictions on pointers, so the const_cast here
// will not trigger UB in the compiler.
*(const_cast<char *>(key.data()) + key.size()) = '\0';
*(const_cast<char *>(val.data()) + val.size()) = '\0';
return fn(std::string_view(key.data(), key.size()), std::string_view(val.data(), val.size()));
return fn(Utf8CStr(key.data(), key.size() + 1), Utf8CStr(val.data(), val.size() + 1));
});
}

View File

@@ -41,7 +41,6 @@ pub mod ffi {
unsafe extern "C++" {
include!("misc.hpp");
#[namespace = "rust"]
#[cxx_name = "Utf8CStr"]
type Utf8CStrRef<'a> = &'a crate::cstr::Utf8CStr;
@@ -51,8 +50,8 @@ pub mod ffi {
type FnBoolStrStr;
fn call(self: &FnBoolStrStr, key: &str, value: &str) -> bool;
type FnBoolString;
fn call(self: &FnBoolString, key: &mut String) -> bool;
type FnBoolStr;
fn call(self: &FnBoolStr, key: Utf8CStrRef) -> bool;
}
extern "Rust" {
@@ -63,7 +62,8 @@ pub mod ffi {
fn exit_on_error(b: bool);
fn cmdline_logging();
fn parse_prop_file_rs(name: Utf8CStrRef, f: &FnBoolStrStr);
fn file_readline_rs(fd: i32, f: &FnBoolString);
#[cxx_name = "file_readline"]
fn file_readline_for_cxx(fd: i32, f: &FnBoolStr);
}
#[namespace = "rust"]

View File

@@ -136,21 +136,6 @@ int parse_int(std::string_view s);
using thread_entry = void *(*)(void *);
extern "C" int new_daemon_thread(thread_entry entry, void *arg = nullptr);
static inline bool str_contains(std::string_view s, std::string_view ss) {
return s.find(ss) != std::string::npos;
}
static inline bool str_starts(std::string_view s, std::string_view ss) {
return s.size() >= ss.size() && s.compare(0, ss.size(), ss) == 0;
}
static inline bool str_ends(std::string_view s, std::string_view ss) {
return s.size() >= ss.size() && s.compare(s.size() - ss.size(), std::string::npos, ss) == 0;
}
static inline std::string ltrim(std::string &&s) {
s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
return !std::isspace(ch);
}));
return std::move(s);
}
static inline std::string rtrim(std::string &&s) {
s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
return !std::isspace(ch) && ch != '\0';
@@ -222,8 +207,6 @@ constexpr auto operator+(T e) noexcept ->
return static_cast<std::underlying_type_t<T>>(e);
}
namespace rust {
struct Utf8CStr {
const char *data() const;
size_t length() const;
@@ -236,8 +219,9 @@ struct Utf8CStr {
const char *c_str() const { return this->data(); }
size_t size() const { return this->length(); }
bool empty() const { return this->length() == 0 ; }
operator std::string_view() const { return {data(), length()}; }
bool operator==(std::string_view rhs) const { return std::string_view{data(), length()} == rhs; }
std::string_view sv() const { return {data(), length()}; }
operator std::string_view() const { return sv(); }
bool operator==(std::string_view rhs) const { return sv() == rhs; }
private:
#pragma clang diagnostic push
@@ -246,8 +230,6 @@ private:
#pragma clang diagnostic pop
};
} // namespace rust
// Bindings for std::function to be callable from Rust
using CxxFnBoolStrStr = std::function<bool(rust::Str, rust::Str)>;
@@ -257,10 +239,10 @@ struct FnBoolStrStr : public CxxFnBoolStrStr {
return operator()(a, b);
}
};
using CxxFnBoolString = std::function<bool(rust::String&)>;
struct FnBoolString : public CxxFnBoolString {
using CxxFnBoolString::function;
bool call(rust::String &s) const {
using CxxFnBoolStr = std::function<bool(Utf8CStr)>;
struct FnBoolStr : public CxxFnBoolStr {
using CxxFnBoolStr::function;
bool call(Utf8CStr s) const {
return operator()(s);
}
};