chore: use custom tokenizer (#9837)

This commit is contained in:
jrandolf 2023-03-14 10:13:23 +01:00 committed by GitHub
parent fd2f90008d
commit dd16ecae03
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 370 additions and 31 deletions

15
package-lock.json generated
View File

@ -7004,12 +7004,6 @@
"url": "https://github.com/sponsors/sindresorhus" "url": "https://github.com/sponsors/sindresorhus"
} }
}, },
"node_modules/parsel-js": {
"version": "1.0.3",
"resolved": "git+ssh://git@github.com/jrandolf/parsel.git#a52d21af14c0a8db4e17290d62500f555d5e183e",
"dev": true,
"license": "MIT"
},
"node_modules/path-exists": { "node_modules/path-exists": {
"version": "4.0.0", "version": "4.0.0",
"license": "MIT", "license": "MIT",
@ -9242,8 +9236,7 @@
"ws": "8.12.1" "ws": "8.12.1"
}, },
"devDependencies": { "devDependencies": {
"mitt": "3.0.0", "mitt": "3.0.0"
"parsel-js": "github:jrandolf/parsel"
}, },
"engines": { "engines": {
"node": ">=14.14.0" "node": ">=14.14.0"
@ -13822,11 +13815,6 @@
"lines-and-columns": "^1.1.6" "lines-and-columns": "^1.1.6"
} }
}, },
"parsel-js": {
"version": "git+ssh://git@github.com/jrandolf/parsel.git#a52d21af14c0a8db4e17290d62500f555d5e183e",
"dev": true,
"from": "parsel-js@github:jrandolf/parsel"
},
"path-exists": { "path-exists": {
"version": "4.0.0" "version": "4.0.0"
}, },
@ -13992,7 +13980,6 @@
"extract-zip": "2.0.1", "extract-zip": "2.0.1",
"https-proxy-agent": "5.0.1", "https-proxy-agent": "5.0.1",
"mitt": "3.0.0", "mitt": "3.0.0",
"parsel-js": "github:jrandolf/parsel",
"proxy-from-env": "1.1.0", "proxy-from-env": "1.1.0",
"rimraf": "4.4.0", "rimraf": "4.4.0",
"tar-fs": "2.1.1", "tar-fs": "2.1.1",

View File

@ -152,7 +152,6 @@
} }
}, },
"devDependencies": { "devDependencies": {
"mitt": "3.0.0", "mitt": "3.0.0"
"parsel-js": "github:jrandolf/parsel"
} }
} }

View File

@ -14,7 +14,7 @@
* limitations under the License. * limitations under the License.
*/ */
import {tokenize, Tokens, TOKENS} from 'parsel-js'; import {Token, tokenize, TokenType} from './PSelectorTokenizer.js';
export type CSSSelector = string; export type CSSSelector = string;
export type PPseudoSelector = { export type PPseudoSelector = {
@ -29,13 +29,8 @@ export type CompoundPSelector = Array<CSSSelector | PPseudoSelector>;
export type ComplexPSelector = Array<CompoundPSelector | PCombinator>; export type ComplexPSelector = Array<CompoundPSelector | PCombinator>;
export type ComplexPSelectorList = ComplexPSelector[]; export type ComplexPSelectorList = ComplexPSelector[];
TOKENS['combinator'] = new RegExp(
`${/\s*(?:>{3,4})\s*|/.source}${TOKENS['combinator']!.source}`,
'g'
);
class TokenSpan { class TokenSpan {
#tokens: Tokens[] = []; #tokens: Token[] = [];
#selector: string; #selector: string;
constructor(selector: string) { constructor(selector: string) {
@ -46,13 +41,13 @@ class TokenSpan {
return this.#tokens.length; return this.#tokens.length;
} }
add(token: Tokens) { add(token: Token) {
this.#tokens.push(token); this.#tokens.push(token);
} }
toStringAndClear() { toStringAndClear() {
const startToken = this.#tokens[0] as Tokens; const startToken = this.#tokens[0] as Token;
const endToken = this.#tokens[this.#tokens.length - 1] as Tokens; const endToken = this.#tokens[this.#tokens.length - 1] as Token;
this.#tokens.splice(0); this.#tokens.splice(0);
return this.#selector.slice(startToken.pos[0], endToken.pos[1]); return this.#selector.slice(startToken.pos[0], endToken.pos[1]);
} }
@ -89,9 +84,9 @@ export function parsePSelectors(
const storage = new TokenSpan(selector); const storage = new TokenSpan(selector);
for (const token of tokens) { for (const token of tokens) {
switch (token.type) { switch (token.type) {
case 'combinator': case TokenType.Combinator:
switch (token.content) { switch (token.content) {
case '>>>': case PCombinator.Descendent:
isPureCSS = false; isPureCSS = false;
if (storage.length) { if (storage.length) {
compoundSelector.push(storage.toStringAndClear()); compoundSelector.push(storage.toStringAndClear());
@ -100,7 +95,7 @@ export function parsePSelectors(
complexSelector.push(PCombinator.Descendent); complexSelector.push(PCombinator.Descendent);
complexSelector.push(compoundSelector); complexSelector.push(compoundSelector);
continue; continue;
case '>>>>': case PCombinator.Child:
isPureCSS = false; isPureCSS = false;
if (storage.length) { if (storage.length) {
compoundSelector.push(storage.toStringAndClear()); compoundSelector.push(storage.toStringAndClear());
@ -111,7 +106,7 @@ export function parsePSelectors(
continue; continue;
} }
break; break;
case 'pseudo-element': case TokenType.PseudoElement:
if (!token.name.startsWith('-p-')) { if (!token.name.startsWith('-p-')) {
break; break;
} }
@ -124,7 +119,7 @@ export function parsePSelectors(
value: unquote(token.argument ?? ''), value: unquote(token.argument ?? ''),
}); });
continue; continue;
case 'comma': case TokenType.Comma:
if (storage.length) { if (storage.length) {
compoundSelector.push(storage.toStringAndClear()); compoundSelector.push(storage.toStringAndClear());
} }

View File

@ -0,0 +1,272 @@
/**
* Copyright (c) 2020 Lea Verou
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
import {assert} from '../util/assert.js';
/**
 * Kinds of token the tokenizer can produce.
 *
 * Each member's string value is what ends up in a token's `type` field and is
 * also the key under which the matching pattern is registered in `TOKENS`.
 */
export const enum TokenType {
Class = 'class',
Attribute = 'attribute',
Id = 'id',
Type = 'type',
Universal = 'universal',
PseudoElement = 'pseudo-element',
PseudoClass = 'pseudo-class',
Comma = 'comma',
Combinator = 'combinator',
}
/**
 * A single token returned by `tokenize`.
 *
 * Apart from `type`, `content` and `pos`, the fields are copied from the named
 * capture groups of the pattern that produced the token.
 */
export interface Token {
// Key of the grammar entry whose pattern matched.
type: string;
// Text of the match. For comma and combinator tokens this is trimmed, with a
// whitespace-only match collapsed to a single space.
content: string;
// `name` capture group.
// NOTE(review): declared as required, but the comma, combinator and universal
// patterns have no `name` group, so the field is absent on those tokens.
name: string;
// `namespace` capture group (attribute, universal and type patterns).
namespace?: string;
// `value` capture group (attribute pattern).
value?: string;
// [start, end) offsets of the untrimmed match within the trimmed selector.
pos: [number, number];
// `operator` capture group (attribute pattern).
operator?: string;
// `argument` capture group (pseudo-element and pseudo-class patterns).
argument?: string;
// `caseSensitive` capture group (attribute pattern).
// NOTE(review): the pattern accepts any of `i`, `I`, `s`, `S`, which is wider
// than the declared `'i'` type - confirm which is intended.
caseSensitive?: 'i';
/**
* @internal
*/
// Scratch flag used inside `tokenize` while placeholders are being restored.
__changed?: boolean;
}
/**
 * Built-in grammar: one pattern per token type.
 *
 * Insertion order matters. `tokenize` tries the patterns in this order, so
 * more specific patterns (attribute, id, class, ...) must come before the
 * catch-all `type` pattern.
 *
 * The pseudo-element and pseudo-class patterns match their argument as a run
 * of `¶` characters: `tokenize` masks every balanced parenthesised group with
 * that placeholder before matching, and `getArgumentPatternByType` swaps the
 * placeholder for `.+` when a token is re-matched against the real text.
 */
const TOKENS: Record<string, RegExp> = {
  [TokenType.Attribute]:
    /\[\s*(?:(?<namespace>(?:\\.|[-\w\P{ASCII}])+|\*)?\|)?(?<name>(?:\\.|[-\w\P{ASCII}])+)\s*(?:(?<operator>\W?=)\s*(?<value>.+?)\s*(\s(?<caseSensitive>[iIsS]))?\s*)?\]/gu,
  [TokenType.Id]: /#(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
  [TokenType.Class]: /\.(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
  [TokenType.Comma]: /\s*,\s*/g,
  // `>{3,4}` covers the `>>>` and `>>>>` combinators in addition to the
  // standard CSS ones.
  [TokenType.Combinator]: /\s*(?:>{3,4}|[\s>+~])\s*/g,
  [TokenType.PseudoElement]:
    /::(?<name>(?:\\.|[-\w\P{ASCII}])+)(?:\((?<argument>¶+)\))?/gu,
  [TokenType.PseudoClass]:
    /:(?<name>(?:\\.|[-\w\P{ASCII}])+)(?:\((?<argument>¶+)\))?/gu,
  [TokenType.Universal]: /(?:(?<namespace>\*|(?:\\.|[-\w\P{ASCII}])*)\|)?\*/gu,
  [TokenType.Type]:
    /(?:(?<namespace>\*|(?:\\.|[-\w\P{ASCII}])*)\|)?(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
};
/**
 * Looks up the pattern used to re-match a token once its original text has
 * been restored.
 *
 * For pseudo-elements and pseudo-classes a fresh pattern is built in which the
 * placeholder-only argument group is replaced by one that accepts any text.
 * Every other type gets its `TOKENS` entry back unchanged, which is
 * `undefined` when the type is not registered.
 */
const getArgumentPatternByType = (type: string) => {
  const registered = TOKENS[type];
  const takesArgument =
    type === TokenType.PseudoElement || type === TokenType.PseudoClass;
  if (!takesArgument) {
    return registered;
  }
  const relaxedSource = registered!.source.replace(
    '(?<argument>¶+)',
    '(?<argument>.+)'
  );
  return new RegExp(relaxedSource, 'gu');
};
/**
 * Asserts that every entry has been turned into a token and finalizes the
 * tokens in place.
 *
 * Walking the list in order, each token receives its `pos` range, computed
 * from the accumulated lengths of the untrimmed contents seen so far. Comma
 * and combinator contents are then trimmed, with a whitespace-only match
 * collapsed to a single space.
 *
 * @throws Error when a plain string (text that matched no pattern) is found;
 * the message carries the string and the offset at which it starts.
 */
function assertTokenArray(
  tokens: Array<Token | string>
): asserts tokens is Token[] {
  let cursor = 0;
  for (const entry of tokens) {
    if (typeof entry === 'string') {
      throw new Error(`Unexpected sequence ${entry} found at index ${cursor}`);
    }
    if (typeof entry !== 'object') {
      continue;
    }
    const start = cursor;
    cursor = start + entry.content.length;
    entry.pos = [start, cursor];
    const isSeparator =
      entry.type === TokenType.Combinator || entry.type === TokenType.Comma;
    if (isSeparator) {
      entry.content = entry.content.trim() || ' ';
    }
  }
}
/**
 * Splits a selector string into a flat, ordered list of tokens.
 *
 * The selector is trimmed, then quoted strings and balanced parenthesised
 * groups are masked with same-length placeholders (`§` and `¶`) so that the
 * grammar patterns never match inside them. Each pattern is applied in the
 * grammar's insertion order. Once everything is matched, the original text is
 * restored and every token whose content changed is re-matched so that its
 * captured groups (for example `value` or `argument`) hold the real text.
 *
 * @param selector - Selector to tokenize. An empty string yields `[]`.
 * @param grammar - Patterns keyed by token type, tried in insertion order.
 * Defaults to the built-in `TOKENS`. The final re-match step looks patterns up
 * through `getArgumentPatternByType`, not through this argument.
 * @returns The tokens in source order. `pos` offsets refer to the trimmed
 * selector.
 * @throws Error if some part of the selector matches no pattern.
 */
export function tokenize(selector: string, grammar = TOKENS): Token[] {
  if (!selector) {
    return [];
  }
  selector = selector.trim();

  type Replacement = {value: string; offset: number};
  const replacements: Replacement[] = [];

  // Replace strings with placeholder
  {
    interface State {
      escaped: boolean;
      quoted?: string;
      offset: number;
    }
    const state: State = {escaped: false, offset: 0};
    for (let i = 0; i < selector.length; ++i) {
      if (state.escaped) {
        // A backslash escapes exactly one character: skip it and resume.
        state.escaped = false;
        continue;
      }
      switch (selector[i]) {
        case '\\':
          state.escaped = true;
          break;
        case '"':
        case "'": {
          if (!state.quoted) {
            // Opening quote: remember which quote it is and where it starts.
            state.quoted = selector[i];
            state.offset = i;
            continue;
          }
          const quote = state.quoted;
          if (quote !== selector[i]) {
            // The other kind of quote inside a string is ordinary text.
            continue;
          }
          state.quoted = undefined;
          const offset = state.offset;
          const value = selector.slice(offset, i + 1);
          replacements.push({value, offset});
          // Same length as the original, so later offsets stay valid.
          const replacement = `${quote}${'§'.repeat(value.length - 2)}${quote}`;
          selector =
            selector.slice(0, offset) +
            replacement +
            selector.slice(offset + value.length);
          break;
        }
      }
    }
  }

  // Replace parentheses with placeholder
  {
    interface State {
      escaped: boolean;
      nesting: number;
      offset: number;
    }
    const state: State = {escaped: false, nesting: 0, offset: 0};
    for (let i = 0; i < selector.length; ++i) {
      if (state.escaped) {
        // A backslash escapes exactly one character: skip it and resume.
        state.escaped = false;
        continue;
      }
      switch (selector[i]) {
        case '\\':
          state.escaped = true;
          break;
        case '(':
          // Only the outermost opening parenthesis starts a masked group.
          if (++state.nesting !== 1) {
            continue;
          }
          state.offset = i;
          break;
        case ')': {
          if (--state.nesting !== 0) {
            continue;
          }
          const {offset} = state;
          const value = selector.slice(offset, i + 1);
          replacements.push({value, offset});
          const replacement = `(${'¶'.repeat(value.length - 2)})`;
          selector =
            selector.slice(0, offset) +
            replacement +
            selector.slice(offset + value.length);
          break;
        }
      }
    }
  }

  // Our goal here is basically try each token type on the selector, keeping
  // track of order. Hopefully by the end, we have an array of tokens.
  const tokens: Array<Token | string> = [selector];
  for (const [type, pattern] of Object.entries(grammar)) {
    for (let i = 0; i < tokens.length; i++) {
      const token = tokens[i];
      if (typeof token !== 'string') {
        continue;
      }

      pattern.lastIndex = 0;
      const match = pattern.exec(token);
      if (!match) {
        continue;
      }

      const start = match.index;
      const content = match[0];
      const args: Array<Token | string> = [];

      const before = token.slice(0, start);
      if (before) {
        args.push(before);
      }

      // `pos` is filled in later by `assertTokenArray`. The capture groups are
      // spread first so that `type` and `content` always win.
      args.push({
        ...(match.groups as unknown as Token),
        type,
        content,
      });

      const after = token.slice(start + content.length);
      if (after) {
        args.push(after);
      }

      // `i` is left untouched so the pieces just inserted are visited next.
      tokens.splice(i, 1, ...args);
    }
  }
  assertTokenArray(tokens);

  // Replace placeholders in reverse order.
  for (const replacement of replacements.reverse()) {
    for (const token of tokens) {
      const {offset, value} = replacement;
      if (!(token.pos[0] <= offset && offset + value.length <= token.pos[1])) {
        continue;
      }

      const {content} = token;
      const tokenOffset = offset - token.pos[0];
      token.content =
        content.slice(0, tokenOffset) +
        value +
        content.slice(tokenOffset + value.length);
      // Only ever raise the flag: a later replacement that happens to leave
      // the content unchanged (e.g. an empty `""`) must not cancel an earlier
      // change, otherwise the token would keep its placeholder groups.
      if (token.content !== content) {
        token.__changed = true;
      }
    }
  }

  // Rematch tokens with changed content.
  for (const token of tokens) {
    if (!token.__changed) {
      continue;
    }
    delete token.__changed;

    const pattern = getArgumentPatternByType(token.type);
    assert(pattern);
    pattern.lastIndex = 0;
    const match = pattern.exec(token.content);
    assert(match);
    Object.assign(token, match.groups);
  }

  return tokens;
}

View File

@ -0,0 +1,86 @@
/**
* Copyright 2023 Google Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import expect from 'expect';
import {tokenize} from 'puppeteer-core/internal/injected/PSelectorTokenizer.js';
// Tests for the selector tokenizer. Results are compared through
// JSON.stringify, so the expected strings also pin the property order of each
// token and omit properties whose value is undefined.
describe('PSelectorTokenizer', () => {
it('should work', () => {
expect(JSON.stringify(tokenize('#foo'))).toStrictEqual(
'[{"name":"foo","type":"id","content":"#foo","pos":[0,4]}]'
);
});
it('should work with empty selectors', () => {
expect(JSON.stringify(tokenize(''))).toStrictEqual('[]');
});
// Two quoted strings, one of them nested inside a parenthesised argument.
it('should work with multiple strings', () => {
expect(
JSON.stringify(
tokenize('[data-test-id^="test-"]:not([data-test-id^="test-foo"])')
)
).toStrictEqual(
'[{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[0,23]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[23,55]}]'
);
});
// Same compound selector twice, joined by a descendant (space) combinator.
it('should work with multiple parentheses', () => {
expect(
JSON.stringify(
tokenize(
'[data-test-id^="test-"]:not([data-test-id^="test-foo"]) [data-test-id^="test-"]:not([data-test-id^="test-foo"])'
)
)
).toStrictEqual(
'[{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[0,23]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[23,55]},{"type":"combinator","content":" ","pos":[55,56]},{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[56,79]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[79,111]}]'
);
});
// Backslash-escaped brackets and parentheses are expected to stay part of the
// class and type names rather than opening an attribute or an argument.
it('should work with CSS escapes', () => {
expect(
JSON.stringify(tokenize('.mb-\\[max\\(-70\\%\\2c -23rem\\)\\]'))
).toStrictEqual(
'[{"name":"mb-\\\\[max\\\\(-70\\\\%\\\\2c","type":"class","content":".mb-\\\\[max\\\\(-70\\\\%\\\\2c","pos":[0,19]},{"type":"combinator","content":" ","pos":[19,20]},{"name":"-23rem\\\\)\\\\]","type":"type","content":"-23rem\\\\)\\\\]","pos":[20,30]}]'
);
});
// Combinator and comma contents are trimmed, while `pos` still spans the
// surrounding whitespace.
it('should work with complex selectors', () => {
expect(
JSON.stringify(tokenize('a > b, c ~ d, a+b, e ::before ::after(a)'))
).toStrictEqual(
'[{"name":"a","type":"type","content":"a","pos":[0,1]},{"type":"combinator","content":">","pos":[1,4]},{"name":"b","type":"type","content":"b","pos":[4,5]},{"type":"comma","content":",","pos":[5,7]},{"name":"c","type":"type","content":"c","pos":[7,8]},{"type":"combinator","content":"~","pos":[8,11]},{"name":"d","type":"type","content":"d","pos":[11,12]},{"type":"comma","content":",","pos":[12,14]},{"name":"a","type":"type","content":"a","pos":[14,15]},{"type":"combinator","content":"+","pos":[15,16]},{"name":"b","type":"type","content":"b","pos":[16,17]},{"type":"comma","content":",","pos":[17,19]},{"name":"e","type":"type","content":"e","pos":[19,20]},{"type":"combinator","content":" ","pos":[20,21]},{"name":"before","type":"pseudo-element","content":"::before","pos":[21,29]},{"type":"combinator","content":" ","pos":[29,30]},{"name":"after","argument":"a","type":"pseudo-element","content":"::after(a)","pos":[30,40]}]'
);
});
// Unterminated brackets or parentheses leave text that matches no pattern.
it('should throw with invalid selectors', () => {
expect(() => {
tokenize('a[b');
}).toThrow();
expect(() => {
tokenize('a(b');
}).toThrow();
expect(() => {
tokenize('[');
}).toThrow();
});
it('should work with universal selectors', () => {
expect(JSON.stringify(tokenize('* > *'))).toStrictEqual(
'[{"type":"universal","content":"*","pos":[0,1]},{"type":"combinator","content":">","pos":[1,4]},{"type":"universal","content":"*","pos":[4,5]}]'
);
});
});