Skip to content

Commit 4639dcb

Browse files
mertcanaltinaddaleax
authored andcommitted
src: skip duplicate UTF-8 validation in TextDecoder fatal path
Signed-off-by: Mert Can Altin <[email protected]> PR-URL: #63231 Reviewed-By: Yagiz Nizipli <[email protected]> Reviewed-By: Anna Henningsen <[email protected]> Reviewed-By: Gürgün Dayıoğlu <[email protected]> Reviewed-By: Matteo Collina <[email protected]>
1 parent 770385a commit 4639dcb

4 files changed

Lines changed: 64 additions & 8 deletions

File tree

benchmark/util/text-decoder.js

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,26 +6,42 @@ const bench = common.createBenchmark(main, {
66
encoding: ['utf-8', 'windows-1252', 'iso-8859-3'],
77
ignoreBOM: [0, 1],
88
fatal: [0, 1],
9+
type: ['SharedArrayBuffer', 'ArrayBuffer', 'Buffer'],
10+
content: ['ascii', 'one-byte-string', 'two-byte-string'],
911
len: [256, 1024 * 16, 1024 * 128],
1012
n: [1e3],
11-
type: ['SharedArrayBuffer', 'ArrayBuffer', 'Buffer'],
1213
});
1314

14-
function main({ encoding, len, n, ignoreBOM, type, fatal }) {
15+
function buildContent(content, len) {
16+
let base;
17+
switch (content) {
18+
case 'ascii': base = 'a'; break;
19+
case 'one-byte-string': base = '\xff'; break;
20+
case 'two-byte-string': base = 'ğ'; break;
21+
}
22+
const unitBytes = Buffer.byteLength(base, 'utf8');
23+
const copies = Math.max(1, Math.floor(len / unitBytes));
24+
return Buffer.from(base.repeat(copies));
25+
}
26+
27+
function main({ encoding, len, n, ignoreBOM, type, fatal, content }) {
1528
const decoder = new TextDecoder(encoding, { ignoreBOM, fatal });
29+
const seed = buildContent(content, len);
1630
let buf;
1731

1832
switch (type) {
1933
case 'SharedArrayBuffer': {
20-
buf = new SharedArrayBuffer(len);
34+
buf = new SharedArrayBuffer(seed.length);
35+
new Uint8Array(buf).set(seed);
2136
break;
2237
}
2338
case 'ArrayBuffer': {
24-
buf = new ArrayBuffer(len);
39+
buf = new ArrayBuffer(seed.length);
40+
new Uint8Array(buf).set(seed);
2541
break;
2642
}
2743
case 'Buffer': {
28-
buf = Buffer.allocUnsafe(len);
44+
buf = seed;
2945
break;
3046
}
3147
}

src/encoding_binding.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -459,14 +459,15 @@ void BindingData::DecodeUTF8(const FunctionCallbackInfo<Value>& args) {
459459
return node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
460460
env->isolate(), "The encoded data was not valid for encoding utf-8");
461461
}
462-
463-
// TODO(chalker): save on utf8 validity recheck in StringBytes::Encode()
464462
}
465463

466464
if (length == 0) return args.GetReturnValue().SetEmptyString();
467465

468466
Local<Value> ret;
469-
if (StringBytes::Encode(env->isolate(), data, length, UTF8).ToLocal(&ret)) {
467+
v8::MaybeLocal<Value> encoded =
468+
has_fatal ? StringBytes::EncodeValidUtf8(env->isolate(), data, length)
469+
: StringBytes::Encode(env->isolate(), data, length, UTF8);
470+
if (encoded.ToLocal(&ret)) {
470471
args.GetReturnValue().Set(ret);
471472
}
472473
}

src/string_bytes.cc

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,40 @@ MaybeLocal<Value> StringBytes::Encode(Isolate* isolate,
671671
}
672672
}
673673

674+
MaybeLocal<Value> StringBytes::EncodeValidUtf8(Isolate* isolate,
675+
const char* buf,
676+
size_t buflen) {
677+
CHECK_BUFLEN_IN_RANGE(buflen);
678+
if (!buflen) return String::Empty(isolate);
679+
buflen = keep_buflen_in_range(buflen);
680+
681+
// ASCII fast path
682+
if (!simdutf::validate_ascii_with_errors(buf, buflen).error) {
683+
return ExternOneByteString::NewFromCopy(isolate, buf, buflen);
684+
}
685+
686+
if (buflen >= 32) {
687+
size_t u16size = simdutf::utf16_length_from_utf8(buf, buflen);
688+
if (u16size > static_cast<size_t>(v8::String::kMaxLength)) {
689+
isolate->ThrowException(ERR_STRING_TOO_LONG(isolate));
690+
return MaybeLocal<Value>();
691+
}
692+
return EncodeTwoByteString(
693+
isolate, u16size, [buf, buflen, u16size](uint16_t* dst) {
694+
size_t written = simdutf::convert_valid_utf8_to_utf16(
695+
buf, buflen, reinterpret_cast<char16_t*>(dst));
696+
CHECK_EQ(written, u16size);
697+
});
698+
}
699+
700+
Local<String> str;
701+
if (!String::NewFromUtf8(isolate, buf, v8::NewStringType::kNormal, buflen)
702+
.ToLocal(&str)) {
703+
isolate->ThrowException(node::ERR_STRING_TOO_LONG(isolate));
704+
}
705+
return str;
706+
}
707+
674708
MaybeLocal<Value> StringBytes::Encode(Isolate* isolate,
675709
const uint16_t* buf,
676710
size_t buflen) {

src/string_bytes.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,11 @@ class StringBytes {
8383
size_t buflen,
8484
enum encoding encoding);
8585

86+
// Like Encode(..., UTF8) but does not re-validate. Input must be valid UTF-8.
87+
static v8::MaybeLocal<v8::Value> EncodeValidUtf8(v8::Isolate* isolate,
88+
const char* buf,
89+
size_t buflen);
90+
8691
// Warning: This reverses endianness on BE platforms, even though the
8792
// signature using uint16_t implies that it should not.
8893
// However, the brokenness is already public API and can't therefore

0 commit comments

Comments
 (0)