diff --git a/package-lock.json b/package-lock.json index d406bb663d1..475b83e90f1 100644 --- a/package-lock.json +++ b/package-lock.json @@ -7005,6 +7005,12 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/parsel-js": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/parsel-js/-/parsel-js-1.1.0.tgz", + "integrity": "sha512-+CAY5A3p8b6he3OzlY/naXpeeiLMjEqFUyMwiPrwnemG5yh0/sgygYMKRhtn6/YSriyy4KZwQLnpBfs36GnXUg==", + "dev": true + }, "node_modules/path-exists": { "version": "4.0.0", "license": "MIT", @@ -9238,7 +9244,8 @@ "ws": "8.12.1" }, "devDependencies": { - "mitt": "3.0.0" + "mitt": "3.0.0", + "parsel-js": "1.1.0" }, "engines": { "node": ">=14.14.0" @@ -13820,6 +13827,12 @@ "lines-and-columns": "^1.1.6" } }, + "parsel-js": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/parsel-js/-/parsel-js-1.1.0.tgz", + "integrity": "sha512-+CAY5A3p8b6he3OzlY/naXpeeiLMjEqFUyMwiPrwnemG5yh0/sgygYMKRhtn6/YSriyy4KZwQLnpBfs36GnXUg==", + "dev": true + }, "path-exists": { "version": "4.0.0" }, @@ -13985,6 +13998,7 @@ "extract-zip": "2.0.1", "https-proxy-agent": "5.0.1", "mitt": "3.0.0", + "parsel-js": "1.1.0", "proxy-from-env": "1.1.0", "rimraf": "4.4.0", "tar-fs": "2.1.1", diff --git a/packages/puppeteer-core/package.json b/packages/puppeteer-core/package.json index 7d6483fea85..93823fb7736 100644 --- a/packages/puppeteer-core/package.json +++ b/packages/puppeteer-core/package.json @@ -152,6 +152,7 @@ } }, "devDependencies": { - "mitt": "3.0.0" + "mitt": "3.0.0", + "parsel-js": "1.1.0" } } diff --git a/packages/puppeteer-core/src/injected/PQuerySelector.ts b/packages/puppeteer-core/src/injected/PQuerySelector.ts index 8355c327a9f..976f16d21db 100644 --- a/packages/puppeteer-core/src/injected/PQuerySelector.ts +++ b/packages/puppeteer-core/src/injected/PQuerySelector.ts @@ -33,6 +33,8 @@ import {textQuerySelectorAll} from './TextQuerySelector.js'; import {deepChildren, deepDescendents} from './util.js'; import {xpathQuerySelectorAll} from './XPathQuerySelector.js'; +const IDENT_TOKEN_START = /[-\w\P{ASCII}*]/; + class SelectorError extends Error { constructor(selector: string, message: string) { super(`${selector} is not a valid selector: ${message}`); @@ -67,13 +69,6 @@ class PQueryEngine { // are used right after so we treat this selector specially. this.#next(); break; - default: - /** - * We add the space since `.foo` will interpolate incorrectly (see - * {@link PQueryAllEngine.query}). This is always equivalent. - */ - this.#selector = ` ${this.#selector}`; - break; } } @@ -84,7 +79,14 @@ class PQueryEngine { this.elements = AsyncIterableUtil.flatMap( this.elements, async function* (element) { - if (!element.parentElement) { + if (!selector[0]) { + return; + } + // The regular expression tests if the selector is a type/universal + // selector. Any other case means we want to apply the selector onto + // the element itself (e.g. `element.class`, `element>div`, + // `element:hover`, etc.). + if (IDENT_TOKEN_START.test(selector[0]) || !element.parentElement) { yield* (element as Element).querySelectorAll(selector); return; } @@ -97,7 +99,7 @@ class PQueryEngine { } } yield* element.parentElement.querySelectorAll( - `:scope > :nth-child(${index})${selector}` + `:scope>:nth-child(${index})${selector}` ); } ); diff --git a/packages/puppeteer-core/src/injected/PSelectorParser.ts b/packages/puppeteer-core/src/injected/PSelectorParser.ts index c575a81654b..19bb9e30007 100644 --- a/packages/puppeteer-core/src/injected/PSelectorParser.ts +++ b/packages/puppeteer-core/src/injected/PSelectorParser.ts @@ -14,7 +14,7 @@ * limitations under the License. */ -import {Token, tokenize, TokenType} from './PSelectorTokenizer.js'; +import {Token, tokenize, TOKENS, stringify} from 'parsel-js'; export type CSSSelector = string; export type PPseudoSelector = { @@ -29,29 +29,7 @@ export type CompoundPSelector = Array; export type ComplexPSelector = Array; export type ComplexPSelectorList = ComplexPSelector[]; -class TokenSpan { - #tokens: Token[] = []; - #selector: string; - - constructor(selector: string) { - this.#selector = selector; - } - - get length(): number { - return this.#tokens.length; - } - - add(token: Token) { - this.#tokens.push(token); - } - - toStringAndClear() { - const startToken = this.#tokens[0] as Token; - const endToken = this.#tokens[this.#tokens.length - 1] as Token; - this.#tokens.splice(0); - return this.#selector.slice(startToken.pos[0], endToken.pos[1]); - } -} +TOKENS['combinator'] = /\s*(>>>>?|[\s>+~])\s*/g; const ESCAPE_REGEXP = /\\[\s\S]/g; const unquote = (text: string): string => { @@ -81,15 +59,16 @@ export function parsePSelectors( let compoundSelector: CompoundPSelector = []; let complexSelector: ComplexPSelector = [compoundSelector]; const selectors: ComplexPSelectorList = [complexSelector]; - const storage = new TokenSpan(selector); + const storage: Token[] = []; for (const token of tokens) { switch (token.type) { - case TokenType.Combinator: + case 'combinator': switch (token.content) { case PCombinator.Descendent: isPureCSS = false; if (storage.length) { - compoundSelector.push(storage.toStringAndClear()); + compoundSelector.push(stringify(storage)); + storage.splice(0); } compoundSelector = []; complexSelector.push(PCombinator.Descendent); @@ -98,7 +77,8 @@ export function parsePSelectors( case PCombinator.Child: isPureCSS = false; if (storage.length) { - compoundSelector.push(storage.toStringAndClear()); + compoundSelector.push(stringify(storage)); + storage.splice(0); } compoundSelector = []; complexSelector.push(PCombinator.Child); @@ -106,32 +86,34 @@ export function parsePSelectors( continue; } break; - case TokenType.PseudoElement: + case 'pseudo-element': if (!token.name.startsWith('-p-')) { break; } isPureCSS = false; if (storage.length) { - compoundSelector.push(storage.toStringAndClear()); + compoundSelector.push(stringify(storage)); + storage.splice(0); } compoundSelector.push({ name: token.name.slice(3), value: unquote(token.argument ?? ''), }); continue; - case TokenType.Comma: + case 'comma': if (storage.length) { - compoundSelector.push(storage.toStringAndClear()); + compoundSelector.push(stringify(storage)); + storage.splice(0); } compoundSelector = []; complexSelector = [compoundSelector]; selectors.push(complexSelector); continue; } - storage.add(token); + storage.push(token); } if (storage.length) { - compoundSelector.push(storage.toStringAndClear()); + compoundSelector.push(stringify(storage)); } return [selectors, isPureCSS]; } diff --git a/packages/puppeteer-core/src/injected/PSelectorTokenizer.ts b/packages/puppeteer-core/src/injected/PSelectorTokenizer.ts deleted file mode 100644 index 38502e13873..00000000000 --- a/packages/puppeteer-core/src/injected/PSelectorTokenizer.ts +++ /dev/null @@ -1,272 +0,0 @@ -/** - * Copyright (c) 2020 Lea Verou - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -import {assert} from '../util/assert.js'; - -export const enum TokenType { - Class = 'class', - Attribute = 'attribute', - Id = 'id', - Type = 'type', - Universal = 'universal', - PseudoElement = 'pseudo-element', - PseudoClass = 'pseudo-class', - Comma = 'comma', - Combinator = 'combinator', -} - -export interface Token { - type: string; - content: string; - name: string; - namespace?: string; - value?: string; - pos: [number, number]; - operator?: string; - argument?: string; - caseSensitive?: 'i'; - /** - * @internal - */ - __changed?: boolean; -} - -const TOKENS: Record = { - [TokenType.Attribute]: - /\[\s*(?:(?(?:\\.|[-\w\P{ASCII}])+|\*)?\|)?(?(?:\\.|[-\w\P{ASCII}])+)\s*(?:(?\W?=)\s*(?.+?)\s*(\s(?[iIsS]))?\s*)?\]/gu, - [TokenType.Id]: /#(?(?:\\.|[-\w\P{ASCII}])+)/gu, - [TokenType.Class]: /\.(?(?:\\.|[-\w\P{ASCII}])+)/gu, - [TokenType.Comma]: /\s*,\s*/g, - [TokenType.Combinator]: /\s*(?:>{3,4}|[\s>+~])\s*/g, - [TokenType.PseudoElement]: - /::(?(?:\\.|[-\w\P{ASCII}])+)(?:\((?¶+)\))?/gu, - [TokenType.PseudoClass]: - /:(?(?:\\.|[-\w\P{ASCII}])+)(?:\((?¶+)\))?/gu, - [TokenType.Universal]: /(?:(?\*|(?:\\.|[-\w\P{ASCII}])*)\|)?\*/gu, - [TokenType.Type]: - /(?:(?\*|(?:\\.|[-\w\P{ASCII}])*)\|)?(?(?:\\.|[-\w\P{ASCII}])+)/gu, -}; - -const getArgumentPatternByType = (type: string) => { - switch (type) { - case TokenType.PseudoElement: - case TokenType.PseudoClass: - return new RegExp( - TOKENS[type]!.source.replace('(?¶+)', '(?.+)'), - 'gu' - ); - default: - return TOKENS[type]; - } -}; - -function assertTokenArray( - tokens: Array -): asserts tokens is Token[] { - let offset = 0; - for (const token of tokens) { - switch (typeof token) { - case 'string': - throw new Error( - `Unexpected sequence ${token} found at index ${offset}` - ); - case 'object': - offset += token.content.length; - token.pos = [offset - token.content.length, offset]; - switch (token.type) { - case TokenType.Combinator: - case TokenType.Comma: - token.content = token.content.trim() || ' '; - break; - } - break; - } - } -} - -export function tokenize(selector: string, grammar = TOKENS): Token[] { - if (!selector) { - return []; - } - selector = selector.trim(); - - type Replacement = {value: string; offset: number}; - const replacements: Replacement[] = []; - - // Replace strings with placeholder - { - interface State { - escaped: boolean; - quoted?: string; - offset: number; - } - const state: State = {escaped: false, offset: 0}; - for (let i = 0; i < selector.length; ++i) { - if (state.escaped) { - continue; - } - switch (selector[i]) { - case '\\': - state.escaped = true; - break; - case '"': - case "'": { - if (!state.quoted) { - state.quoted = selector[i]; - state.offset = i; - continue; - } - const quote = state.quoted; - if (quote !== selector[i]) { - continue; - } - delete state.quoted; - const offset = state.offset; - const value = selector.slice(state.offset, i + 1); - replacements.push({value, offset}); - const replacement = `${quote}${'§'.repeat(value.length - 2)}${quote}`; - selector = - selector.slice(0, offset) + - replacement + - selector.slice(offset + value.length); - break; - } - } - } - } - - // Replace parentheses with placeholder - { - interface State { - escaped: boolean; - nesting: number; - offset: number; - } - const state: State = {escaped: false, nesting: 0, offset: 0}; - for (let i = 0; i < selector.length; ++i) { - if (state.escaped) { - continue; - } - switch (selector[i]) { - case '\\': - state.escaped = true; - break; - case '(': - if (++state.nesting !== 1) { - continue; - } - state.offset = i; - break; - case ')': { - if (--state.nesting !== 0) { - continue; - } - const {offset} = state; - const value = selector.slice(offset, i + 1); - replacements.push({value, offset}); - const replacement = `(${'¶'.repeat(value.length - 2)})`; - selector = - selector.slice(0, offset) + - replacement + - selector.slice(offset + value.length); - break; - } - } - } - } - - // Our goal here is basically try each token type on the selector, keeping - // track of order. Hopefully by the end, we have an array of tokens. - const tokens: Array = [selector]; - for (const [type, pattern] of Object.entries(grammar)) { - for (let i = 0; i < tokens.length; i++) { - const token = tokens[i]; - if (typeof token !== 'string') { - continue; - } - - pattern.lastIndex = 0; - const match = pattern.exec(token); - if (!match) { - continue; - } - - const from = match.index - 1; - const args: Array = []; - const content = match[0]; - - const before = token.slice(0, from + 1); - if (before) { - args.push(before); - } - - args.push({ - ...(match.groups as unknown as Token), - type, - content, - }); - - const after = token.slice(from + content.length + 1); - if (after) { - args.push(after); - } - - tokens.splice(i, 1, ...args); - } - } - assertTokenArray(tokens); - - // Replace placeholders in reverse order. - for (const replacement of replacements.reverse()) { - for (const token of tokens) { - const {offset, value} = replacement; - if (!(token.pos[0] <= offset && offset + value.length <= token.pos[1])) { - continue; - } - - const {content} = token; - const tokenOffset = offset - token.pos[0]; - token.content = - content.slice(0, tokenOffset) + - value + - content.slice(tokenOffset + value.length); - token.__changed = token.content !== content; - } - } - - // Rematch tokens with changed content. - for (const token of tokens) { - if (!token.__changed) { - continue; - } - delete token.__changed; - - const pattern = getArgumentPatternByType(token.type); - assert(pattern); - pattern.lastIndex = 0; - const match = pattern.exec(token.content); - assert(match); - Object.assign(token, match.groups); - } - - return tokens; -} diff --git a/test/src/tokenizer.spec.ts b/test/src/tokenizer.spec.ts deleted file mode 100644 index 106633d4d84..00000000000 --- a/test/src/tokenizer.spec.ts +++ /dev/null @@ -1,86 +0,0 @@ -/** - * Copyright 2023 Google Inc. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import expect from 'expect'; -import {tokenize} from 'puppeteer-core/internal/injected/PSelectorTokenizer.js'; - -describe('PSelectorTokenizer', () => { - it('should work', () => { - expect(JSON.stringify(tokenize('#foo'))).toStrictEqual( - '[{"name":"foo","type":"id","content":"#foo","pos":[0,4]}]' - ); - }); - - it('should work with empty selectors', () => { - expect(JSON.stringify(tokenize(''))).toStrictEqual('[]'); - }); - - it('should work with multiple strings', () => { - expect( - JSON.stringify( - tokenize('[data-test-id^="test-"]:not([data-test-id^="test-foo"])') - ) - ).toStrictEqual( - '[{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[0,23]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[23,55]}]' - ); - }); - - it('should work with multiple parentheses', () => { - expect( - JSON.stringify( - tokenize( - '[data-test-id^="test-"]:not([data-test-id^="test-foo"]) [data-test-id^="test-"]:not([data-test-id^="test-foo"])' - ) - ) - ).toStrictEqual( - '[{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[0,23]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[23,55]},{"type":"combinator","content":" ","pos":[55,56]},{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[56,79]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[79,111]}]' - ); - }); - - it('should work with CSS escapes', () => { - expect( - JSON.stringify(tokenize('.mb-\\[max\\(-70\\%\\2c -23rem\\)\\]')) - ).toStrictEqual( - '[{"name":"mb-\\\\[max\\\\(-70\\\\%\\\\2c","type":"class","content":".mb-\\\\[max\\\\(-70\\\\%\\\\2c","pos":[0,19]},{"type":"combinator","content":" ","pos":[19,20]},{"name":"-23rem\\\\)\\\\]","type":"type","content":"-23rem\\\\)\\\\]","pos":[20,30]}]' - ); - }); - - it('should work with complex selectors', () => { - expect( - JSON.stringify(tokenize('a > b, c ~ d, a+b, e ::before ::after(a)')) - ).toStrictEqual( - '[{"name":"a","type":"type","content":"a","pos":[0,1]},{"type":"combinator","content":">","pos":[1,4]},{"name":"b","type":"type","content":"b","pos":[4,5]},{"type":"comma","content":",","pos":[5,7]},{"name":"c","type":"type","content":"c","pos":[7,8]},{"type":"combinator","content":"~","pos":[8,11]},{"name":"d","type":"type","content":"d","pos":[11,12]},{"type":"comma","content":",","pos":[12,14]},{"name":"a","type":"type","content":"a","pos":[14,15]},{"type":"combinator","content":"+","pos":[15,16]},{"name":"b","type":"type","content":"b","pos":[16,17]},{"type":"comma","content":",","pos":[17,19]},{"name":"e","type":"type","content":"e","pos":[19,20]},{"type":"combinator","content":" ","pos":[20,21]},{"name":"before","type":"pseudo-element","content":"::before","pos":[21,29]},{"type":"combinator","content":" ","pos":[29,30]},{"name":"after","argument":"a","type":"pseudo-element","content":"::after(a)","pos":[30,40]}]' - ); - }); - - it('should throw with invalid selectors', () => { - expect(() => { - tokenize('a[b'); - }).toThrow(); - expect(() => { - tokenize('a(b'); - }).toThrow(); - expect(() => { - tokenize('['); - }).toThrow(); - }); - - it('should work with universal selectors', () => { - expect(JSON.stringify(tokenize('* > *'))).toStrictEqual( - '[{"type":"universal","content":"*","pos":[0,1]},{"type":"combinator","content":">","pos":[1,4]},{"type":"universal","content":"*","pos":[4,5]}]' - ); - }); -});