Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
url: enforce valid UTF-8 in WHATWG parser
This commit implements the Web IDL USVString conversion, which mandates
all unpaired Unicode surrogates be turned into U+FFFD REPLACEMENT
CHARACTER. It also disallows Symbols to be used as USVString per spec.

Certain functions call into C++ methods in the binding that use the
Utf8Value class to access string arguments. Utf8Value already does the
normalization using V8's String::Write, so in those cases, instead of
doing the full USVString normalization, only a symbol check is done
(`'' + val`, which uses ES's ToString, versus `String()` which has
special provisions for symbols).
  • Loading branch information
TimothyGu committed Feb 28, 2017
commit aa4b2bea2da31981fa69c31082dcd34bc8b02f3d
98 changes: 65 additions & 33 deletions lib/internal/url.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,18 @@ const IteratorPrototype = Object.getPrototypeOf(
Object.getPrototypeOf([][Symbol.iterator]())
);

const unpairedSurrogateRe =
/([^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])/;
function toUSVString(val) {
const str = '' + val;
// As of V8 5.5, `str.search()` (and `unpairedSurrogateRe[@@search]()`) are
// slower than `unpairedSurrogateRe.exec()`.
const match = unpairedSurrogateRe.exec(str);
if (!match)
return str;
return binding.toUSVString(str, match.index);
}

class OpaqueOrigin {
toString() {
return 'null';
Expand Down Expand Up @@ -104,7 +116,6 @@ function onParseComplete(flags, protocol, username, password,

// Reused by URL constructor and URL#href setter.
function parse(url, input, base) {
input = String(input);
const base_context = base ? base[context] : undefined;
url[context] = new StorageObject();
binding.parse(input.trim(), -1,
Expand Down Expand Up @@ -206,8 +217,10 @@ function onParseHashComplete(flags, protocol, username, password,

class URL {
constructor(input, base) {
// toUSVString is not needed.
input = '' + input;
if (base !== undefined && !(base instanceof URL))
base = new URL(String(base));
base = new URL(base);
parse(this, input, base);
}

Expand Down Expand Up @@ -315,6 +328,8 @@ Object.defineProperties(URL.prototype, {
return this[kFormat]({});
},
set(input) {
// toUSVString is not needed.
input = '' + input;
parse(this, input);
}
},
Expand All @@ -332,7 +347,8 @@ Object.defineProperties(URL.prototype, {
return this[context].scheme;
},
set(scheme) {
scheme = String(scheme);
// toUSVString is not needed.
scheme = '' + scheme;
if (scheme.length === 0)
return;
binding.parse(scheme, binding.kSchemeStart, null, this[context],
Expand All @@ -346,7 +362,8 @@ Object.defineProperties(URL.prototype, {
return this[context].username || '';
},
set(username) {
username = String(username);
// toUSVString is not needed.
username = '' + username;
if (!this.hostname)
return;
const ctx = this[context];
Expand All @@ -366,7 +383,8 @@ Object.defineProperties(URL.prototype, {
return this[context].password || '';
},
set(password) {
password = String(password);
// toUSVString is not needed.
password = '' + password;
if (!this.hostname)
return;
const ctx = this[context];
Expand All @@ -391,7 +409,8 @@ Object.defineProperties(URL.prototype, {
},
set(host) {
const ctx = this[context];
host = String(host);
// toUSVString is not needed.
host = '' + host;
if (this[cannotBeBase] ||
(this[special] && host.length === 0)) {
// Cannot set the host if cannot-be-base is set or
Expand All @@ -415,7 +434,8 @@ Object.defineProperties(URL.prototype, {
},
set(host) {
const ctx = this[context];
host = String(host);
// toUSVString is not needed.
host = '' + host;
if (this[cannotBeBase] ||
(this[special] && host.length === 0)) {
// Cannot set the host if cannot-be-base is set or
Expand All @@ -439,11 +459,12 @@ Object.defineProperties(URL.prototype, {
return port === undefined ? '' : String(port);
},
set(port) {
// toUSVString is not needed.
port = '' + port;
const ctx = this[context];
if (!ctx.host || this[cannotBeBase] ||
this.protocol === 'file:')
return;
port = String(port);
if (port === '') {
ctx.port = undefined;
return;
Expand All @@ -462,9 +483,11 @@ Object.defineProperties(URL.prototype, {
return ctx.path !== undefined ? `/${ctx.path.join('/')}` : '';
},
set(path) {
// toUSVString is not needed.
path = '' + path;
if (this[cannotBeBase])
return;
binding.parse(String(path), binding.kPathStart, null, this[context],
binding.parse(path, binding.kPathStart, null, this[context],
onParsePathComplete.bind(this));
}
},
Expand All @@ -477,7 +500,7 @@ Object.defineProperties(URL.prototype, {
},
set(search) {
const ctx = this[context];
search = String(search);
search = toUSVString(search);
if (!search) {
ctx.query = null;
ctx.flags &= ~binding.URL_FLAGS_HAS_QUERY;
Expand Down Expand Up @@ -509,7 +532,8 @@ Object.defineProperties(URL.prototype, {
},
set(hash) {
const ctx = this[context];
hash = String(hash);
// toUSVString is not needed.
hash = '' + hash;
if (this.protocol === 'javascript:')
return;
if (!hash) {
Expand Down Expand Up @@ -652,19 +676,22 @@ class URLSearchParams {
if (pair.length !== 2) {
throw new TypeError('Each query pair must be a name/value tuple');
}
this[searchParams].push(String(pair[0]), String(pair[1]));
const key = toUSVString(pair[0]);
const value = toUSVString(pair[1]);
this[searchParams].push(key, value);
}
} else {
// record<USVString, USVString>
this[searchParams] = [];
for (const key of Object.keys(init)) {
const value = String(init[key]);
for (var key of Object.keys(init)) {
key = toUSVString(key);
const value = toUSVString(init[key]);
this[searchParams].push(key, value);
}
}
} else {
// USVString
init = String(init);
init = toUSVString(init);
if (init[0] === '?') init = init.slice(1);
initSearchParams(this, init);
}
Expand Down Expand Up @@ -743,8 +770,8 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
throw new TypeError('"name" and "value" arguments must be specified');
}

name = String(name);
value = String(value);
name = toUSVString(name);
value = toUSVString(value);
this[searchParams].push(name, value);
update(this[context], this);
},
Expand All @@ -758,7 +785,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
}

const list = this[searchParams];
name = String(name);
name = toUSVString(name);
for (var i = 0; i < list.length;) {
const cur = list[i];
if (cur === name) {
Expand All @@ -779,7 +806,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
}

const list = this[searchParams];
name = String(name);
name = toUSVString(name);
for (var i = 0; i < list.length; i += 2) {
if (list[i] === name) {
return list[i + 1];
Expand All @@ -798,7 +825,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {

const list = this[searchParams];
const values = [];
name = String(name);
name = toUSVString(name);
for (var i = 0; i < list.length; i += 2) {
if (list[i] === name) {
values.push(list[i + 1]);
Expand All @@ -816,7 +843,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
}

const list = this[searchParams];
name = String(name);
name = toUSVString(name);
for (var i = 0; i < list.length; i += 2) {
if (list[i] === name) {
return true;
Expand All @@ -834,8 +861,8 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
}

const list = this[searchParams];
name = String(name);
value = String(value);
name = toUSVString(name);
value = toUSVString(value);

// If there are any name-value pairs whose name is `name`, in `list`, set
// the value of the first such name-value pair to `value` and remove the
Expand Down Expand Up @@ -1098,11 +1125,13 @@ function originFor(url, base) {
}

function domainToASCII(domain) {
return binding.domainToASCII(String(domain));
// toUSVString is not needed.
return binding.domainToASCII('' + domain);
}

function domainToUnicode(domain) {
return binding.domainToUnicode(String(domain));
// toUSVString is not needed.
return binding.domainToUnicode('' + domain);
}

// Utility function that converts a URL object into an ordinary
Expand Down Expand Up @@ -1188,11 +1217,14 @@ function getPathFromURL(path) {
return isWindows ? getPathFromURLWin32(path) : getPathFromURLPosix(path);
}

exports.getPathFromURL = getPathFromURL;
exports.URL = URL;
exports.URLSearchParams = URLSearchParams;
exports.domainToASCII = domainToASCII;
exports.domainToUnicode = domainToUnicode;
exports.urlToOptions = urlToOptions;
exports.formatSymbol = kFormat;
exports.searchParamsSymbol = searchParams;
module.exports = {
toUSVString,
getPathFromURL,
URL,
URLSearchParams,
domainToASCII,
domainToUnicode,
urlToOptions,
formatSymbol: kFormat,
searchParamsSymbol: searchParams
};
53 changes: 53 additions & 0 deletions src/node_url.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include <unicode/utf.h>
#endif

#define UNICODE_REPLACEMENT_CHARACTER 0xFFFD

namespace node {

using v8::Array;
Expand Down Expand Up @@ -143,6 +145,21 @@ namespace url {
}
#endif

// If a UTF-16 character is a low/trailing surrogate.
static inline bool IsUnicodeTrail(uint16_t c) {
return (c & 0xFC00) == 0xDC00;
}

// If a UTF-16 character is a surrogate.
static inline bool IsUnicodeSurrogate(uint16_t c) {
return (c & 0xF800) == 0xD800;
}

// If a UTF-16 surrogate is a low/trailing one.
static inline bool IsUnicodeSurrogateTrail(uint16_t c) {
return (c & 0x400) != 0;
}

static url_host_type ParseIPv6Host(url_host* host,
const char* input,
size_t length) {
Expand Down Expand Up @@ -1351,6 +1368,41 @@ namespace url {
v8::NewStringType::kNormal).ToLocalChecked());
}

static void ToUSVString(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 2);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CHECK_EQ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All the existing functions use CHECK_GE, and I don't see a reason to be stricter than what this function actually uses.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah there are functions in other files doing EQ...not sure if we have a convention or not, just think GE is implying there could be more args, which doesn't seem to be the case for this one, though I don't feel very strongly about this.

CHECK(args[0]->IsString());
CHECK(args[1]->IsNumber());

TwoByteValue value(env->isolate(), args[0]);
const size_t n = value.length();

const int64_t start = args[1]->IntegerValue(env->context()).FromJust();
CHECK_GE(start, 0);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe another CHECK_LT(start, n)? Doesn't do any harm even if start is larger though.

Copy link
Member Author

@TimothyGu TimothyGu Feb 25, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm only checking start >= 0 because I'm converting start to a size_t, which is an unsigned type. In C++, signed-to-unsigned conversion, though a defined operation, acts oddly when the signed value is negative. On the other hand, n >= start is a more benign case, and I don't think requires the full effects of a runtime assertion (i.e. crashing).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yep, not worth a full on abort ;)


for (size_t i = start; i < n; i++) {
uint16_t c = value[i];
if (!IsUnicodeSurrogate(c)) {
continue;
} else if (IsUnicodeSurrogateTrail(c) || i == n - 1) {
value[i] = UNICODE_REPLACEMENT_CHARACTER;
} else {
uint16_t d = value[i + 1];
if (IsUnicodeTrail(d)) {
i++;
} else {
value[i] = UNICODE_REPLACEMENT_CHARACTER;
}
}
}

args.GetReturnValue().Set(
String::NewFromTwoByte(env->isolate(),
*value,
v8::NewStringType::kNormal,
n).ToLocalChecked());
}

static void DomainToASCII(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
Expand Down Expand Up @@ -1398,6 +1450,7 @@ namespace url {
Environment* env = Environment::GetCurrent(context);
env->SetMethod(target, "parse", Parse);
env->SetMethod(target, "encodeAuth", EncodeAuthSet);
env->SetMethod(target, "toUSVString", ToUSVString);
env->SetMethod(target, "domainToASCII", DomainToASCII);
env->SetMethod(target, "domainToUnicode", DomainToUnicode);

Expand Down
Loading