From dd16ecae03153b619ac7d84d848169579eb5710f Mon Sep 17 00:00:00 2001 From: jrandolf <101637635+jrandolf@users.noreply.github.com> Date: Tue, 14 Mar 2023 10:13:23 +0100 Subject: [PATCH] chore: use custom tokenizer (#9837) --- package-lock.json | 15 +- packages/puppeteer-core/package.json | 3 +- .../src/injected/PSelectorParser.ts | 25 +- .../src/injected/PSelectorTokenizer.ts | 272 ++++++++++++++++++ test/src/tokenizer.spec.ts | 86 ++++++ 5 files changed, 370 insertions(+), 31 deletions(-) create mode 100644 packages/puppeteer-core/src/injected/PSelectorTokenizer.ts create mode 100644 test/src/tokenizer.spec.ts diff --git a/package-lock.json b/package-lock.json index 1a9d4817..710345e7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -7004,12 +7004,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/parsel-js": { - "version": "1.0.3", - "resolved": "git+ssh://git@github.com/jrandolf/parsel.git#a52d21af14c0a8db4e17290d62500f555d5e183e", - "dev": true, - "license": "MIT" - }, "node_modules/path-exists": { "version": "4.0.0", "license": "MIT", @@ -9242,8 +9236,7 @@ "ws": "8.12.1" }, "devDependencies": { - "mitt": "3.0.0", - "parsel-js": "github:jrandolf/parsel" + "mitt": "3.0.0" }, "engines": { "node": ">=14.14.0" @@ -13822,11 +13815,6 @@ "lines-and-columns": "^1.1.6" } }, - "parsel-js": { - "version": "git+ssh://git@github.com/jrandolf/parsel.git#a52d21af14c0a8db4e17290d62500f555d5e183e", - "dev": true, - "from": "parsel-js@github:jrandolf/parsel" - }, "path-exists": { "version": "4.0.0" }, @@ -13992,7 +13980,6 @@ "extract-zip": "2.0.1", "https-proxy-agent": "5.0.1", "mitt": "3.0.0", - "parsel-js": "github:jrandolf/parsel", "proxy-from-env": "1.1.0", "rimraf": "4.4.0", "tar-fs": "2.1.1", diff --git a/packages/puppeteer-core/package.json b/packages/puppeteer-core/package.json index 7c6e5d3a..b71e7538 100644 --- a/packages/puppeteer-core/package.json +++ b/packages/puppeteer-core/package.json @@ -152,7 +152,6 @@ } }, "devDependencies": 
{ - "mitt": "3.0.0", - "parsel-js": "github:jrandolf/parsel" + "mitt": "3.0.0" } } diff --git a/packages/puppeteer-core/src/injected/PSelectorParser.ts b/packages/puppeteer-core/src/injected/PSelectorParser.ts index 7efe50a2..c575a816 100644 --- a/packages/puppeteer-core/src/injected/PSelectorParser.ts +++ b/packages/puppeteer-core/src/injected/PSelectorParser.ts @@ -14,7 +14,7 @@ * limitations under the License. */ -import {tokenize, Tokens, TOKENS} from 'parsel-js'; +import {Token, tokenize, TokenType} from './PSelectorTokenizer.js'; export type CSSSelector = string; export type PPseudoSelector = { @@ -29,13 +29,8 @@ export type CompoundPSelector = Array; export type ComplexPSelector = Array; export type ComplexPSelectorList = ComplexPSelector[]; -TOKENS['combinator'] = new RegExp( - `${/\s*(?:>{3,4})\s*|/.source}${TOKENS['combinator']!.source}`, - 'g' -); - class TokenSpan { - #tokens: Tokens[] = []; + #tokens: Token[] = []; #selector: string; constructor(selector: string) { @@ -46,13 +41,13 @@ class TokenSpan { return this.#tokens.length; } - add(token: Tokens) { + add(token: Token) { this.#tokens.push(token); } toStringAndClear() { - const startToken = this.#tokens[0] as Tokens; - const endToken = this.#tokens[this.#tokens.length - 1] as Tokens; + const startToken = this.#tokens[0] as Token; + const endToken = this.#tokens[this.#tokens.length - 1] as Token; this.#tokens.splice(0); return this.#selector.slice(startToken.pos[0], endToken.pos[1]); } @@ -89,9 +84,9 @@ export function parsePSelectors( const storage = new TokenSpan(selector); for (const token of tokens) { switch (token.type) { - case 'combinator': + case TokenType.Combinator: switch (token.content) { - case '>>>': + case PCombinator.Descendent: isPureCSS = false; if (storage.length) { compoundSelector.push(storage.toStringAndClear()); @@ -100,7 +95,7 @@ export function parsePSelectors( complexSelector.push(PCombinator.Descendent); complexSelector.push(compoundSelector); continue; - case '>>>>': + 
case PCombinator.Child: isPureCSS = false; if (storage.length) { compoundSelector.push(storage.toStringAndClear()); @@ -111,7 +106,7 @@ export function parsePSelectors( continue; } break; - case 'pseudo-element': + case TokenType.PseudoElement: if (!token.name.startsWith('-p-')) { break; } @@ -124,7 +119,7 @@ export function parsePSelectors( value: unquote(token.argument ?? ''), }); continue; - case 'comma': + case TokenType.Comma: if (storage.length) { compoundSelector.push(storage.toStringAndClear()); } diff --git a/packages/puppeteer-core/src/injected/PSelectorTokenizer.ts b/packages/puppeteer-core/src/injected/PSelectorTokenizer.ts new file mode 100644 index 00000000..38502e13 --- /dev/null +++ b/packages/puppeteer-core/src/injected/PSelectorTokenizer.ts @@ -0,0 +1,272 @@ +/** + * Copyright (c) 2020 Lea Verou + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
/**
 * Copyright (c) 2020 Lea Verou
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/**
 * Kinds of token produced by {@link tokenize}. The string values double as the
 * keys of the grammar table.
 */
export const enum TokenType {
  Class = 'class',
  Attribute = 'attribute',
  Id = 'id',
  Type = 'type',
  Universal = 'universal',
  PseudoElement = 'pseudo-element',
  PseudoClass = 'pseudo-class',
  Comma = 'comma',
  Combinator = 'combinator',
}

/**
 * A single selector token. Besides `type`, `content` and `pos`, a token carries
 * whichever named capture groups its grammar pattern defines.
 */
export interface Token {
  type: string;
  /** Matched source text; trimmed for comma and combinator tokens. */
  content: string;
  name: string;
  namespace?: string;
  value?: string;
  /** `[start, end)` offsets of the untrimmed match in the trimmed selector. */
  pos: [number, number];
  operator?: string;
  argument?: string;
  caseSensitive?: 'i';
  /**
   * @internal
   */
  __changed?: boolean;
}

/** A quoted string or parenthesised group that was masked out of the selector. */
interface Replacement {
  value: string;
  offset: number;
}

/**
 * Default grammar. Patterns are tried in insertion order, so the order of the
 * keys is significant. `¶` stands for masked parenthesis content.
 */
const TOKENS: Record<string, RegExp> = {
  [TokenType.Attribute]:
    /\[\s*(?:(?<namespace>(?:\\.|[-\w\P{ASCII}])+|\*)?\|)?(?<name>(?:\\.|[-\w\P{ASCII}])+)\s*(?:(?<operator>\W?=)\s*(?<value>.+?)\s*(\s(?<caseSensitive>[iIsS]))?\s*)?\]/gu,
  [TokenType.Id]: /#(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
  [TokenType.Class]: /\.(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
  [TokenType.Comma]: /\s*,\s*/g,
  [TokenType.Combinator]: /\s*(?:>{3,4}|[\s>+~])\s*/g,
  [TokenType.PseudoElement]:
    /::(?<name>(?:\\.|[-\w\P{ASCII}])+)(?:\((?<argument>¶+)\))?/gu,
  [TokenType.PseudoClass]:
    /:(?<name>(?:\\.|[-\w\P{ASCII}])+)(?:\((?<argument>¶+)\))?/gu,
  [TokenType.Universal]: /(?:(?<namespace>\*|(?:\\.|[-\w\P{ASCII}])*)\|)?\*/gu,
  [TokenType.Type]:
    /(?:(?<namespace>\*|(?:\\.|[-\w\P{ASCII}])*)\|)?(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
};

/**
 * Returns the pattern used to re-match a token once its placeholders have been
 * restored. Pseudo-element and pseudo-class patterns are rewritten so that
 * their argument accepts arbitrary text instead of the `¶` placeholder.
 *
 * @param type - Token type to look up.
 * @param grammar - Grammar the token was produced with.
 * @returns The pattern, or `undefined` when the grammar has no such type.
 */
const getArgumentPatternByType = (
  type: string,
  grammar: Record<string, RegExp> = TOKENS
): RegExp | undefined => {
  const pattern = grammar[type];
  if (!pattern) {
    return undefined;
  }
  switch (type) {
    case TokenType.PseudoElement:
    case TokenType.PseudoClass:
      return new RegExp(
        pattern.source.replace('(?<argument>¶+)', '(?<argument>.+)'),
        'gu'
      );
    default:
      return pattern;
  }
};

/**
 * Asserts that every entry is a token, assigning `pos` to each one and
 * trimming comma/combinator content along the way.
 *
 * @throws Error when a string, i.e. text no pattern matched, is still present.
 */
function assertTokenArray(
  tokens: Array<Token | string>
): asserts tokens is Token[] {
  let offset = 0;
  for (const token of tokens) {
    switch (typeof token) {
      case 'string':
        throw new Error(
          `Unexpected sequence ${token} found at index ${offset}`
        );
      case 'object':
        // Positions are computed from the untrimmed content.
        offset += token.content.length;
        token.pos = [offset - token.content.length, offset];
        switch (token.type) {
          case TokenType.Combinator:
          case TokenType.Comma:
            // A descendant combinator is pure whitespace; keep one space.
            token.content = token.content.trim() || ' ';
            break;
        }
        break;
    }
  }
}

/**
 * Masks the content of every quoted string with `§`, keeping the quotes and
 * the overall length, and records the original text in `replacements`.
 */
function maskStrings(selector: string, replacements: Replacement[]): string {
  let masked = selector;
  let escaped = false;
  let quote: string | undefined;
  let start = 0;
  for (let i = 0; i < masked.length; ++i) {
    if (escaped) {
      // A backslash escapes exactly one character.
      escaped = false;
      continue;
    }
    const char = masked[i];
    if (char === '\\') {
      escaped = true;
      continue;
    }
    if (char !== '"' && char !== "'") {
      continue;
    }
    if (quote === undefined) {
      quote = char;
      start = i;
      continue;
    }
    if (quote !== char) {
      // The other kind of quote inside a string is ordinary content.
      continue;
    }
    const value = masked.slice(start, i + 1);
    replacements.push({value, offset: start});
    masked =
      masked.slice(0, start) +
      `${quote}${'§'.repeat(value.length - 2)}${quote}` +
      masked.slice(i + 1);
    quote = undefined;
  }
  return masked;
}

/**
 * Masks the content of every outermost parenthesised group with `¶`, keeping
 * the parentheses and the overall length, and records the original text in
 * `replacements`.
 */
function maskParentheses(
  selector: string,
  replacements: Replacement[]
): string {
  let masked = selector;
  let escaped = false;
  let nesting = 0;
  let start = 0;
  for (let i = 0; i < masked.length; ++i) {
    if (escaped) {
      // A backslash escapes exactly one character.
      escaped = false;
      continue;
    }
    switch (masked[i]) {
      case '\\':
        escaped = true;
        break;
      case '(':
        if (++nesting === 1) {
          start = i;
        }
        break;
      case ')': {
        if (--nesting !== 0) {
          break;
        }
        const value = masked.slice(start, i + 1);
        replacements.push({value, offset: start});
        masked =
          masked.slice(0, start) +
          `(${'¶'.repeat(value.length - 2)})` +
          masked.slice(i + 1);
        break;
      }
    }
  }
  return masked;
}

/**
 * Tries each grammar pattern in turn on the text that is still unmatched,
 * splitting it into tokens while preserving source order. Text no pattern
 * matches is left in the result as a plain string.
 */
function splitByGrammar(
  selector: string,
  grammar: Record<string, RegExp>
): Array<Token | string> {
  const tokens: Array<Token | string> = [selector];
  for (const [type, pattern] of Object.entries(grammar)) {
    // `tokens` grows while it is scanned, so text after a match is revisited
    // with the same pattern on a later iteration.
    for (let i = 0; i < tokens.length; i++) {
      const chunk = tokens[i];
      if (typeof chunk !== 'string') {
        continue;
      }

      // The patterns are global, so reset their state before each use.
      pattern.lastIndex = 0;
      const match = pattern.exec(chunk);
      if (!match) {
        continue;
      }

      const content = match[0];
      const pieces: Array<Token | string> = [];

      const before = chunk.slice(0, match.index);
      if (before) {
        pieces.push(before);
      }

      // `pos` is filled in later by `assertTokenArray`.
      pieces.push({
        ...(match.groups as unknown as Token),
        type,
        content,
      });

      const after = chunk.slice(match.index + content.length);
      if (after) {
        pieces.push(after);
      }

      tokens.splice(i, 1, ...pieces);
    }
  }
  return tokens;
}

/**
 * Splits a selector into tokens.
 *
 * Quoted strings and parenthesised groups are masked first so that their
 * content cannot be mistaken for selector syntax. The masked text is then
 * split using `grammar`, and finally the original text is restored and the
 * affected tokens are re-matched so their capture groups hold the real text.
 *
 * @param selector - Selector text; leading and trailing whitespace is ignored.
 * @param grammar - Token patterns keyed by token type, tried in key order.
 * @returns The tokens in source order, or an empty array for an empty
 * selector.
 * @throws Error when part of the selector matches no pattern, or when a
 * restored token can no longer be matched by its pattern.
 */
export function tokenize(selector: string, grammar = TOKENS): Token[] {
  if (!selector) {
    return [];
  }

  const replacements: Replacement[] = [];
  // Strings go first so that parentheses inside them are already masked.
  const masked = maskParentheses(
    maskStrings(selector.trim(), replacements),
    replacements
  );

  const tokens = splitByGrammar(masked, grammar);
  assertTokenArray(tokens);

  // Restore in reverse order: a parenthesised group is put back with its
  // strings still masked, then those strings are put back.
  const changed = new Set<Token>();
  for (let i = replacements.length - 1; i >= 0; --i) {
    const replacement = replacements[i];
    if (!replacement) {
      continue;
    }
    const {offset, value} = replacement;
    for (const token of tokens) {
      const [start, end] = token.pos;
      if (offset < start || offset + value.length > end) {
        continue;
      }

      const {content} = token;
      const tokenOffset = offset - start;
      token.content =
        content.slice(0, tokenOffset) +
        value +
        content.slice(tokenOffset + value.length);
      // Only ever mark a token: a later no-op replacement must not clear it.
      if (token.content !== content) {
        changed.add(token);
      }
    }
  }

  // Re-match tokens with changed content so the groups hold the real text.
  for (const token of changed) {
    const pattern = getArgumentPatternByType(token.type, grammar);
    if (!pattern) {
      throw new Error(`No pattern found for token type ${token.type}`);
    }
    pattern.lastIndex = 0;
    const match = pattern.exec(token.content);
    if (!match) {
      throw new Error(`Unable to re-match token ${token.content}`);
    }
    Object.assign(token, match.groups);
  }

  return tokens;
}
/**
 * Copyright 2023 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import expect from 'expect';
import {tokenize} from 'puppeteer-core/internal/injected/PSelectorTokenizer.js';

describe('PSelectorTokenizer', () => {
  /**
   * Registers a test asserting that `selector` tokenizes to exactly the
   * serialized token list `expected`.
   */
  const itTokenizes = (
    title: string,
    selector: string,
    expected: string
  ): void => {
    it(title, () => {
      expect(JSON.stringify(tokenize(selector))).toStrictEqual(expected);
    });
  };

  itTokenizes(
    'should work',
    '#foo',
    '[{"name":"foo","type":"id","content":"#foo","pos":[0,4]}]'
  );

  itTokenizes('should work with empty selectors', '', '[]');

  itTokenizes(
    'should work with multiple strings',
    '[data-test-id^="test-"]:not([data-test-id^="test-foo"])',
    '[{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[0,23]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[23,55]}]'
  );

  itTokenizes(
    'should work with multiple parentheses',
    '[data-test-id^="test-"]:not([data-test-id^="test-foo"]) [data-test-id^="test-"]:not([data-test-id^="test-foo"])',
    '[{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[0,23]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[23,55]},{"type":"combinator","content":" ","pos":[55,56]},{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[56,79]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[79,111]}]'
  );

  itTokenizes(
    'should work with CSS escapes',
    '.mb-\\[max\\(-70\\%\\2c -23rem\\)\\]',
    '[{"name":"mb-\\\\[max\\\\(-70\\\\%\\\\2c","type":"class","content":".mb-\\\\[max\\\\(-70\\\\%\\\\2c","pos":[0,19]},{"type":"combinator","content":" ","pos":[19,20]},{"name":"-23rem\\\\)\\\\]","type":"type","content":"-23rem\\\\)\\\\]","pos":[20,30]}]'
  );

  itTokenizes(
    'should work with complex selectors',
    'a > b, c ~ d, a+b, e ::before ::after(a)',
    '[{"name":"a","type":"type","content":"a","pos":[0,1]},{"type":"combinator","content":">","pos":[1,4]},{"name":"b","type":"type","content":"b","pos":[4,5]},{"type":"comma","content":",","pos":[5,7]},{"name":"c","type":"type","content":"c","pos":[7,8]},{"type":"combinator","content":"~","pos":[8,11]},{"name":"d","type":"type","content":"d","pos":[11,12]},{"type":"comma","content":",","pos":[12,14]},{"name":"a","type":"type","content":"a","pos":[14,15]},{"type":"combinator","content":"+","pos":[15,16]},{"name":"b","type":"type","content":"b","pos":[16,17]},{"type":"comma","content":",","pos":[17,19]},{"name":"e","type":"type","content":"e","pos":[19,20]},{"type":"combinator","content":" ","pos":[20,21]},{"name":"before","type":"pseudo-element","content":"::before","pos":[21,29]},{"type":"combinator","content":" ","pos":[29,30]},{"name":"after","argument":"a","type":"pseudo-element","content":"::after(a)","pos":[30,40]}]'
  );

  it('should throw with invalid selectors', () => {
    // Unterminated attribute, unterminated parenthesis, lone bracket.
    for (const invalidSelector of ['a[b', 'a(b', '[']) {
      expect(() => {
        tokenize(invalidSelector);
      }).toThrow();
    }
  });

  itTokenizes(
    'should work with universal selectors',
    '* > *',
    '[{"type":"universal","content":"*","pos":[0,1]},{"type":"combinator","content":">","pos":[1,4]},{"type":"universal","content":"*","pos":[4,5]}]'
  );
});