chore: use custom tokenizer (#9837)
This commit is contained in:
parent
fd2f90008d
commit
dd16ecae03
15
package-lock.json
generated
15
package-lock.json
generated
@ -7004,12 +7004,6 @@
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/parsel-js": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "git+ssh://git@github.com/jrandolf/parsel.git#a52d21af14c0a8db4e17290d62500f555d5e183e",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/path-exists": {
|
||||
"version": "4.0.0",
|
||||
"license": "MIT",
|
||||
@ -9242,8 +9236,7 @@
|
||||
"ws": "8.12.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"mitt": "3.0.0",
|
||||
"parsel-js": "github:jrandolf/parsel"
|
||||
"mitt": "3.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14.14.0"
|
||||
@ -13822,11 +13815,6 @@
|
||||
"lines-and-columns": "^1.1.6"
|
||||
}
|
||||
},
|
||||
"parsel-js": {
|
||||
"version": "git+ssh://git@github.com/jrandolf/parsel.git#a52d21af14c0a8db4e17290d62500f555d5e183e",
|
||||
"dev": true,
|
||||
"from": "parsel-js@github:jrandolf/parsel"
|
||||
},
|
||||
"path-exists": {
|
||||
"version": "4.0.0"
|
||||
},
|
||||
@ -13992,7 +13980,6 @@
|
||||
"extract-zip": "2.0.1",
|
||||
"https-proxy-agent": "5.0.1",
|
||||
"mitt": "3.0.0",
|
||||
"parsel-js": "github:jrandolf/parsel",
|
||||
"proxy-from-env": "1.1.0",
|
||||
"rimraf": "4.4.0",
|
||||
"tar-fs": "2.1.1",
|
||||
|
@ -152,7 +152,6 @@
|
||||
}
|
||||
},
|
||||
"devDependencies": {
|
||||
"mitt": "3.0.0",
|
||||
"parsel-js": "github:jrandolf/parsel"
|
||||
"mitt": "3.0.0"
|
||||
}
|
||||
}
|
||||
|
@ -14,7 +14,7 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import {tokenize, Tokens, TOKENS} from 'parsel-js';
|
||||
import {Token, tokenize, TokenType} from './PSelectorTokenizer.js';
|
||||
|
||||
export type CSSSelector = string;
|
||||
export type PPseudoSelector = {
|
||||
@ -29,13 +29,8 @@ export type CompoundPSelector = Array<CSSSelector | PPseudoSelector>;
|
||||
export type ComplexPSelector = Array<CompoundPSelector | PCombinator>;
|
||||
export type ComplexPSelectorList = ComplexPSelector[];
|
||||
|
||||
TOKENS['combinator'] = new RegExp(
|
||||
`${/\s*(?:>{3,4})\s*|/.source}${TOKENS['combinator']!.source}`,
|
||||
'g'
|
||||
);
|
||||
|
||||
class TokenSpan {
|
||||
#tokens: Tokens[] = [];
|
||||
#tokens: Token[] = [];
|
||||
#selector: string;
|
||||
|
||||
constructor(selector: string) {
|
||||
@ -46,13 +41,13 @@ class TokenSpan {
|
||||
return this.#tokens.length;
|
||||
}
|
||||
|
||||
add(token: Tokens) {
|
||||
add(token: Token) {
|
||||
this.#tokens.push(token);
|
||||
}
|
||||
|
||||
toStringAndClear() {
|
||||
const startToken = this.#tokens[0] as Tokens;
|
||||
const endToken = this.#tokens[this.#tokens.length - 1] as Tokens;
|
||||
const startToken = this.#tokens[0] as Token;
|
||||
const endToken = this.#tokens[this.#tokens.length - 1] as Token;
|
||||
this.#tokens.splice(0);
|
||||
return this.#selector.slice(startToken.pos[0], endToken.pos[1]);
|
||||
}
|
||||
@ -89,9 +84,9 @@ export function parsePSelectors(
|
||||
const storage = new TokenSpan(selector);
|
||||
for (const token of tokens) {
|
||||
switch (token.type) {
|
||||
case 'combinator':
|
||||
case TokenType.Combinator:
|
||||
switch (token.content) {
|
||||
case '>>>':
|
||||
case PCombinator.Descendent:
|
||||
isPureCSS = false;
|
||||
if (storage.length) {
|
||||
compoundSelector.push(storage.toStringAndClear());
|
||||
@ -100,7 +95,7 @@ export function parsePSelectors(
|
||||
complexSelector.push(PCombinator.Descendent);
|
||||
complexSelector.push(compoundSelector);
|
||||
continue;
|
||||
case '>>>>':
|
||||
case PCombinator.Child:
|
||||
isPureCSS = false;
|
||||
if (storage.length) {
|
||||
compoundSelector.push(storage.toStringAndClear());
|
||||
@ -111,7 +106,7 @@ export function parsePSelectors(
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case 'pseudo-element':
|
||||
case TokenType.PseudoElement:
|
||||
if (!token.name.startsWith('-p-')) {
|
||||
break;
|
||||
}
|
||||
@ -124,7 +119,7 @@ export function parsePSelectors(
|
||||
value: unquote(token.argument ?? ''),
|
||||
});
|
||||
continue;
|
||||
case 'comma':
|
||||
case TokenType.Comma:
|
||||
if (storage.length) {
|
||||
compoundSelector.push(storage.toStringAndClear());
|
||||
}
|
||||
|
272
packages/puppeteer-core/src/injected/PSelectorTokenizer.ts
Normal file
272
packages/puppeteer-core/src/injected/PSelectorTokenizer.ts
Normal file
@ -0,0 +1,272 @@
|
||||
/**
|
||||
* Copyright (c) 2020 Lea Verou
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
import {assert} from '../util/assert.js';
|
||||
|
||||
/**
 * Kinds of tokens recognised by the default grammar. The string values are
 * the keys of the grammar record and end up in {@link Token."type"}.
 */
export const enum TokenType {
  Class = 'class',
  Attribute = 'attribute',
  Id = 'id',
  Type = 'type',
  Universal = 'universal',
  PseudoElement = 'pseudo-element',
  PseudoClass = 'pseudo-class',
  Comma = 'comma',
  Combinator = 'combinator',
}

/**
 * A single lexical unit produced by {@link tokenize}.
 */
export interface Token {
  /** Grammar key whose pattern matched this token. */
  type: string;
  /**
   * Source text of the token. Combinator and comma tokens are trimmed; a
   * whitespace-only combinator is normalised to a single space.
   */
  content: string;
  /** Value of the `name` capture group, when the pattern defines one. */
  name: string;
  /** Value of the `namespace` capture group, when present. */
  namespace?: string;
  /** Value of the `value` capture group (attribute selectors). */
  value?: string;
  /** `[start, end)` offsets of the untrimmed match in the trimmed selector. */
  pos: [number, number];
  /** Value of the `operator` capture group (attribute selectors). */
  operator?: string;
  /** Text between the parentheses of a pseudo-class / pseudo-element. */
  argument?: string;
  /** Value of the `caseSensitive` capture group (attribute selectors). */
  caseSensitive?: 'i';
  /**
   * Set while placeholders are being restored to mark tokens whose capture
   * groups must be recomputed.
   *
   * @internal
   */
  __changed?: boolean;
}

/**
 * Default grammar. Patterns are tried in insertion order, so earlier entries
 * take precedence over later ones. `¶` is the placeholder that `tokenize`
 * substitutes for parenthesised arguments before matching.
 */
const TOKENS: Record<string, RegExp> = {
  [TokenType.Attribute]:
    /\[\s*(?:(?<namespace>(?:\\.|[-\w\P{ASCII}])+|\*)?\|)?(?<name>(?:\\.|[-\w\P{ASCII}])+)\s*(?:(?<operator>\W?=)\s*(?<value>.+?)\s*(\s(?<caseSensitive>[iIsS]))?\s*)?\]/gu,
  [TokenType.Id]: /#(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
  [TokenType.Class]: /\.(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
  [TokenType.Comma]: /\s*,\s*/g,
  [TokenType.Combinator]: /\s*(?:>{3,4}|[\s>+~])\s*/g,
  [TokenType.PseudoElement]:
    /::(?<name>(?:\\.|[-\w\P{ASCII}])+)(?:\((?<argument>¶+)\))?/gu,
  [TokenType.PseudoClass]:
    /:(?<name>(?:\\.|[-\w\P{ASCII}])+)(?:\((?<argument>¶+)\))?/gu,
  [TokenType.Universal]: /(?:(?<namespace>\*|(?:\\.|[-\w\P{ASCII}])*)\|)?\*/gu,
  [TokenType.Type]:
    /(?:(?<namespace>\*|(?:\\.|[-\w\P{ASCII}])*)\|)?(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
};

/**
 * Returns the pattern used to re-match a token after its placeholders have
 * been restored. For pseudo-classes and pseudo-elements the `¶+` argument
 * placeholder is swapped for `.+` so that the real argument text matches;
 * every other type uses its pattern from `TOKENS` unchanged.
 */
const getArgumentPatternByType = (type: string) => {
  switch (type) {
    case TokenType.PseudoElement:
    case TokenType.PseudoClass:
      return new RegExp(
        TOKENS[type]!.source.replace('(?<argument>¶+)', '(?<argument>.+)'),
        'gu'
      );
    default:
      return TOKENS[type];
  }
};

/**
 * Asserts that every entry has been tokenized, and finalises each token by
 * assigning `pos` and normalising combinator / comma content.
 *
 * @throws Error if a raw string (text no pattern matched) is still present.
 */
function assertTokenArray(
  tokens: Array<Token | string>
): asserts tokens is Token[] {
  let offset = 0;
  for (const token of tokens) {
    switch (typeof token) {
      case 'string':
        throw new Error(
          `Unexpected sequence ${token} found at index ${offset}`
        );
      case 'object':
        // `pos` is derived from the untrimmed content so that offsets keep
        // lining up with the (masked) selector string.
        offset += token.content.length;
        token.pos = [offset - token.content.length, offset];
        switch (token.type) {
          case TokenType.Combinator:
          case TokenType.Comma:
            token.content = token.content.trim() || ' ';
            break;
        }
        break;
    }
  }
}

/**
 * Splits a selector into a flat list of tokens.
 *
 * Quoted strings and parenthesised groups are first masked with same-length
 * placeholders (`§` and `¶`) so that their contents cannot confuse the
 * grammar patterns. After matching, the original text is restored and the
 * affected tokens are re-matched to refresh their capture groups.
 *
 * @param selector - The selector text; it is trimmed before tokenizing.
 * @param grammar - Patterns keyed by token type, tried in insertion order.
 * @returns The tokens in source order; an empty array for an empty selector.
 * @throws Error if part of the selector is not matched by any pattern.
 */
export function tokenize(selector: string, grammar = TOKENS): Token[] {
  if (!selector) {
    return [];
  }
  selector = selector.trim();

  type Replacement = {value: string; offset: number};
  const replacements: Replacement[] = [];

  // Replace strings with placeholder
  {
    interface State {
      escaped: boolean;
      quoted?: string;
      offset: number;
    }
    const state: State = {escaped: false, offset: 0};
    for (let i = 0; i < selector.length; ++i) {
      if (state.escaped) {
        // A backslash escapes exactly one character. Clear the flag so that
        // scanning resumes with the character after the escaped one.
        state.escaped = false;
        continue;
      }
      switch (selector[i]) {
        case '\\':
          state.escaped = true;
          break;
        case '"':
        case "'": {
          if (!state.quoted) {
            state.quoted = selector[i];
            state.offset = i;
            continue;
          }
          const quote = state.quoted;
          if (quote !== selector[i]) {
            // The other quote character inside a string is plain content.
            continue;
          }
          delete state.quoted;
          const offset = state.offset;
          const value = selector.slice(state.offset, i + 1);
          replacements.push({value, offset});
          // Same length as `value`, so `i` and later offsets stay valid.
          const replacement = `${quote}${'§'.repeat(value.length - 2)}${quote}`;
          selector =
            selector.slice(0, offset) +
            replacement +
            selector.slice(offset + value.length);
          break;
        }
      }
    }
  }

  // Replace parentheses with placeholder
  {
    interface State {
      escaped: boolean;
      nesting: number;
      offset: number;
    }
    const state: State = {escaped: false, nesting: 0, offset: 0};
    for (let i = 0; i < selector.length; ++i) {
      if (state.escaped) {
        // See the string pass above: an escape covers one character only.
        state.escaped = false;
        continue;
      }
      switch (selector[i]) {
        case '\\':
          state.escaped = true;
          break;
        case '(':
          // Only the outermost opening parenthesis starts a group.
          if (++state.nesting !== 1) {
            continue;
          }
          state.offset = i;
          break;
        case ')': {
          if (--state.nesting !== 0) {
            continue;
          }
          const {offset} = state;
          const value = selector.slice(offset, i + 1);
          replacements.push({value, offset});
          const replacement = `(${'¶'.repeat(value.length - 2)})`;
          selector =
            selector.slice(0, offset) +
            replacement +
            selector.slice(offset + value.length);
          break;
        }
      }
    }
  }

  // Our goal here is basically try each token type on the selector, keeping
  // track of order. Hopefully by the end, we have an array of tokens.
  const tokens: Array<Token | string> = [selector];
  for (const [type, pattern] of Object.entries(grammar)) {
    for (let i = 0; i < tokens.length; i++) {
      const token = tokens[i];
      if (typeof token !== 'string') {
        continue;
      }

      // The patterns are global, so reset the cursor before each use.
      pattern.lastIndex = 0;
      const match = pattern.exec(token);
      if (!match) {
        continue;
      }

      const from = match.index - 1;
      const args: Array<Token | string> = [];
      const content = match[0];

      const before = token.slice(0, from + 1);
      if (before) {
        args.push(before);
      }

      args.push({
        ...(match.groups as unknown as Token),
        type,
        content,
      });

      // Any remaining text is revisited by later iterations of this loop.
      const after = token.slice(from + content.length + 1);
      if (after) {
        args.push(after);
      }

      tokens.splice(i, 1, ...args);
    }
  }
  assertTokenArray(tokens);

  // Replace placeholders in reverse order.
  for (const replacement of replacements.reverse()) {
    for (const token of tokens) {
      const {offset, value} = replacement;
      if (!(token.pos[0] <= offset && offset + value.length <= token.pos[1])) {
        continue;
      }

      const {content} = token;
      const tokenOffset = offset - token.pos[0];
      token.content =
        content.slice(0, tokenOffset) +
        value +
        content.slice(tokenOffset + value.length);
      // Only ever raise the flag: a later no-op replacement (e.g. an empty
      // string `""`) must not hide an earlier change to the same token.
      if (token.content !== content) {
        token.__changed = true;
      }
    }
  }

  // Rematch tokens with changed content.
  for (const token of tokens) {
    if (!token.__changed) {
      continue;
    }
    delete token.__changed;

    const pattern = getArgumentPatternByType(token.type);
    assert(pattern);
    pattern.lastIndex = 0;
    const match = pattern.exec(token.content);
    assert(match);
    Object.assign(token, match.groups);
  }

  return tokens;
}
|
86
test/src/tokenizer.spec.ts
Normal file
86
test/src/tokenizer.spec.ts
Normal file
@ -0,0 +1,86 @@
|
||||
/**
|
||||
* Copyright 2023 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import expect from 'expect';
|
||||
import {tokenize} from 'puppeteer-core/internal/injected/PSelectorTokenizer.js';
|
||||
|
||||
// Specification for `tokenize`: each case serialises the resulting tokens to
// JSON and compares against the exact expected string, so key order matters.
describe('PSelectorTokenizer', () => {
  it('should work', () => {
    const actual = JSON.stringify(tokenize('#foo'));
    const expected =
      '[{"name":"foo","type":"id","content":"#foo","pos":[0,4]}]';
    expect(actual).toStrictEqual(expected);
  });

  it('should work with empty selectors', () => {
    const actual = JSON.stringify(tokenize(''));
    expect(actual).toStrictEqual('[]');
  });

  it('should work with multiple strings', () => {
    // Two quoted strings, one of them nested inside `:not(...)`.
    const selector = '[data-test-id^="test-"]:not([data-test-id^="test-foo"])';
    const expected =
      '[{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[0,23]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[23,55]}]';
    expect(JSON.stringify(tokenize(selector))).toStrictEqual(expected);
  });

  it('should work with multiple parentheses', () => {
    // The same compound selector twice, joined by a descendant combinator.
    const selector =
      '[data-test-id^="test-"]:not([data-test-id^="test-foo"]) [data-test-id^="test-"]:not([data-test-id^="test-foo"])';
    const expected =
      '[{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[0,23]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[23,55]},{"type":"combinator","content":" ","pos":[55,56]},{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[56,79]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[79,111]}]';
    expect(JSON.stringify(tokenize(selector))).toStrictEqual(expected);
  });

  it('should work with CSS escapes', () => {
    const selector = '.mb-\\[max\\(-70\\%\\2c -23rem\\)\\]';
    const expected =
      '[{"name":"mb-\\\\[max\\\\(-70\\\\%\\\\2c","type":"class","content":".mb-\\\\[max\\\\(-70\\\\%\\\\2c","pos":[0,19]},{"type":"combinator","content":" ","pos":[19,20]},{"name":"-23rem\\\\)\\\\]","type":"type","content":"-23rem\\\\)\\\\]","pos":[20,30]}]';
    expect(JSON.stringify(tokenize(selector))).toStrictEqual(expected);
  });

  it('should work with complex selectors', () => {
    const selector = 'a > b, c ~ d, a+b, e ::before ::after(a)';
    const expected =
      '[{"name":"a","type":"type","content":"a","pos":[0,1]},{"type":"combinator","content":">","pos":[1,4]},{"name":"b","type":"type","content":"b","pos":[4,5]},{"type":"comma","content":",","pos":[5,7]},{"name":"c","type":"type","content":"c","pos":[7,8]},{"type":"combinator","content":"~","pos":[8,11]},{"name":"d","type":"type","content":"d","pos":[11,12]},{"type":"comma","content":",","pos":[12,14]},{"name":"a","type":"type","content":"a","pos":[14,15]},{"type":"combinator","content":"+","pos":[15,16]},{"name":"b","type":"type","content":"b","pos":[16,17]},{"type":"comma","content":",","pos":[17,19]},{"name":"e","type":"type","content":"e","pos":[19,20]},{"type":"combinator","content":" ","pos":[20,21]},{"name":"before","type":"pseudo-element","content":"::before","pos":[21,29]},{"type":"combinator","content":" ","pos":[29,30]},{"name":"after","argument":"a","type":"pseudo-element","content":"::after(a)","pos":[30,40]}]';
    expect(JSON.stringify(tokenize(selector))).toStrictEqual(expected);
  });

  it('should throw with invalid selectors', () => {
    // Checked in the same order as before; the first failure aborts the case.
    for (const invalid of ['a[b', 'a(b', '[']) {
      expect(() => {
        tokenize(invalid);
      }).toThrow();
    }
  });

  it('should work with universal selectors', () => {
    const actual = JSON.stringify(tokenize('* > *'));
    const expected =
      '[{"type":"universal","content":"*","pos":[0,1]},{"type":"combinator","content":">","pos":[1,4]},{"type":"universal","content":"*","pos":[4,5]}]';
    expect(actual).toStrictEqual(expected);
  });
});
|
Loading…
Reference in New Issue
Block a user