chore: use parsel (#9865)

2023-03-16 14:41:38 +01:00 · 2023-03-16 14:41:38 +01:00 · 9f9394ca42
commit 9f9394ca42
parent b8d38cb05f
6 changed files with 44 additions and 403 deletions
--- a/package-lock.json
+++ b/package-lock.json
@ -7005,6 +7005,12 @@
        "url": "https://github.com/sponsors/sindresorhus"
      }
    },
    "node_modules/parsel-js": {
      "version": "1.1.0",
      "resolved": "https://registry.npmjs.org/parsel-js/-/parsel-js-1.1.0.tgz",
      "integrity": "sha512-+CAY5A3p8b6he3OzlY/naXpeeiLMjEqFUyMwiPrwnemG5yh0/sgygYMKRhtn6/YSriyy4KZwQLnpBfs36GnXUg==",
      "dev": true
    },
    "node_modules/path-exists": {
      "version": "4.0.0",
      "license": "MIT",
@ -9238,7 +9244,8 @@
        "ws": "8.12.1"
      },
      "devDependencies": {
-        "mitt": "3.0.0"
+        "mitt": "3.0.0",
        "parsel-js": "1.1.0"
      },
      "engines": {
        "node": ">=14.14.0"
@ -13820,6 +13827,12 @@
        "lines-and-columns": "^1.1.6"
      }
    },
    "parsel-js": {
      "version": "1.1.0",
      "resolved": "https://registry.npmjs.org/parsel-js/-/parsel-js-1.1.0.tgz",
      "integrity": "sha512-+CAY5A3p8b6he3OzlY/naXpeeiLMjEqFUyMwiPrwnemG5yh0/sgygYMKRhtn6/YSriyy4KZwQLnpBfs36GnXUg==",
      "dev": true
    },
    "path-exists": {
      "version": "4.0.0"
    },
@ -13985,6 +13998,7 @@
        "extract-zip": "2.0.1",
        "https-proxy-agent": "5.0.1",
        "mitt": "3.0.0",
        "parsel-js": "1.1.0",
        "proxy-from-env": "1.1.0",
        "rimraf": "4.4.0",
        "tar-fs": "2.1.1",
--- a/packages/puppeteer-core/package.json
+++ b/packages/puppeteer-core/package.json
@ -152,6 +152,7 @@
    }
  },
  "devDependencies": {
-    "mitt": "3.0.0"
+    "mitt": "3.0.0",
    "parsel-js": "1.1.0"
  }
 }
--- a/packages/puppeteer-core/src/injected/PQuerySelector.ts
+++ b/packages/puppeteer-core/src/injected/PQuerySelector.ts
@ -33,6 +33,8 @@ import {textQuerySelectorAll} from './TextQuerySelector.js';
 import {deepChildren, deepDescendents} from './util.js';
 import {xpathQuerySelectorAll} from './XPathQuerySelector.js';
 const IDENT_TOKEN_START = /[-\w\P{ASCII}*]/;
 class SelectorError extends Error {
  constructor(selector: string, message: string) {
    super(`${selector} is not a valid selector: ${message}`);
@ -67,13 +69,6 @@ class PQueryEngine {
          // are used right after so we treat this selector specially.
          this.#next();
          break;
        default:
          /**
           * We add the space since `.foo` will interpolate incorrectly (see
           * {@link PQueryAllEngine.query}). This is always equivalent.
           */
          this.#selector = ` ${this.#selector}`;
          break;
      }
    }
@ -84,7 +79,14 @@ class PQueryEngine {
        this.elements = AsyncIterableUtil.flatMap(
          this.elements,
          async function* (element) {
-            if (!element.parentElement) {
+            if (!selector[0]) {
              return;
            }
            // The regular expression tests if the selector is a type/universal
            // selector. Any other case means we want to apply the selector onto
            // the element itself (e.g. `element.class`, `element>div`,
            // `element:hover`, etc.).
            if (IDENT_TOKEN_START.test(selector[0]) || !element.parentElement) {
              yield* (element as Element).querySelectorAll(selector);
              return;
            }
--- a/packages/puppeteer-core/src/injected/PSelectorParser.ts
+++ b/packages/puppeteer-core/src/injected/PSelectorParser.ts
@ -14,7 +14,7 @@
 * limitations under the License.
 */
-import {Token, tokenize, TokenType} from './PSelectorTokenizer.js';
+import {Token, tokenize, TOKENS, stringify} from 'parsel-js';
 export type CSSSelector = string;
 export type PPseudoSelector = {
@ -29,29 +29,7 @@ export type CompoundPSelector = Array<CSSSelector | PPseudoSelector>;
 export type ComplexPSelector = Array<CompoundPSelector | PCombinator>;
 export type ComplexPSelectorList = ComplexPSelector[];
-class TokenSpan {
+TOKENS['combinator'] = /\s*(>>>>?|[\s>+~])\s*/g;
  #tokens: Token[] = [];
  #selector: string;
  constructor(selector: string) {
    this.#selector = selector;
  }
  get length(): number {
    return this.#tokens.length;
  }
  add(token: Token) {
    this.#tokens.push(token);
  }
  toStringAndClear() {
    const startToken = this.#tokens[0] as Token;
    const endToken = this.#tokens[this.#tokens.length - 1] as Token;
    this.#tokens.splice(0);
    return this.#selector.slice(startToken.pos[0], endToken.pos[1]);
  }
 }
 const ESCAPE_REGEXP = /\\[\s\S]/g;
 const unquote = (text: string): string => {
@ -81,15 +59,16 @@ export function parsePSelectors(
  let compoundSelector: CompoundPSelector = [];
  let complexSelector: ComplexPSelector = [compoundSelector];
  const selectors: ComplexPSelectorList = [complexSelector];
-  const storage = new TokenSpan(selector);
+  const storage: Token[] = [];
  for (const token of tokens) {
    switch (token.type) {
-      case TokenType.Combinator:
+      case 'combinator':
        switch (token.content) {
          case PCombinator.Descendent:
            isPureCSS = false;
            if (storage.length) {
-              compoundSelector.push(storage.toStringAndClear());
+              compoundSelector.push(stringify(storage));
              storage.splice(0);
            }
            compoundSelector = [];
            complexSelector.push(PCombinator.Descendent);
@ -98,7 +77,8 @@ export function parsePSelectors(
          case PCombinator.Child:
            isPureCSS = false;
            if (storage.length) {
-              compoundSelector.push(storage.toStringAndClear());
+              compoundSelector.push(stringify(storage));
              storage.splice(0);
            }
            compoundSelector = [];
            complexSelector.push(PCombinator.Child);
@ -106,32 +86,34 @@ export function parsePSelectors(
            continue;
        }
        break;
-      case TokenType.PseudoElement:
+      case 'pseudo-element':
        if (!token.name.startsWith('-p-')) {
          break;
        }
        isPureCSS = false;
        if (storage.length) {
-          compoundSelector.push(storage.toStringAndClear());
+          compoundSelector.push(stringify(storage));
          storage.splice(0);
        }
        compoundSelector.push({
          name: token.name.slice(3),
          value: unquote(token.argument ?? ''),
        });
        continue;
-      case TokenType.Comma:
+      case 'comma':
        if (storage.length) {
-          compoundSelector.push(storage.toStringAndClear());
+          compoundSelector.push(stringify(storage));
          storage.splice(0);
        }
        compoundSelector = [];
        complexSelector = [compoundSelector];
        selectors.push(complexSelector);
        continue;
    }
-    storage.add(token);
+    storage.push(token);
  }
  if (storage.length) {
-    compoundSelector.push(storage.toStringAndClear());
+    compoundSelector.push(stringify(storage));
  }
  return [selectors, isPureCSS];
 }
--- a/packages/puppeteer-core/src/injected/PSelectorTokenizer.ts
+++ b/packages/puppeteer-core/src/injected/PSelectorTokenizer.ts
@ -1,272 +0,0 @@
 /**
 * Copyright (c) 2020 Lea Verou
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
 import {assert} from '../util/assert.js';
 /**
 * Kinds of token the tokenizer emits.
 *
 * Each value is also the key of the matching pattern in the `TOKENS` grammar
 * table, and ends up verbatim in the `type` field of every token produced
 * from that pattern.
 */
 export const enum TokenType {
  Class = 'class',
  Attribute = 'attribute',
  Id = 'id',
  Type = 'type',
  Universal = 'universal',
  PseudoElement = 'pseudo-element',
  PseudoClass = 'pseudo-class',
  Comma = 'comma',
  Combinator = 'combinator',
 }
 /**
 * A single token produced by `tokenize`.
 *
 * Apart from `type`, `content`, `pos` and `__changed`, the fields are copied
 * from the named capture groups of the grammar pattern that matched, so which
 * of them are present depends on the token type.
 */
 export interface Token {
  // Key of the grammar entry whose pattern matched this token.
  type: string;
  // Text matched by the pattern. For combinator and comma tokens the
  // surrounding whitespace is trimmed (a pure-whitespace match becomes ' ').
  content: string;
  // `name` capture group of the matching pattern.
  name: string;
  // `namespace` capture group (attribute, type and universal patterns).
  namespace?: string;
  // `value` capture group of the attribute pattern.
  value?: string;
  // [start, end) offsets of the untrimmed match within the trimmed selector.
  pos: [number, number];
  // `operator` capture group of the attribute pattern.
  operator?: string;
  // `argument` capture group of the pseudo-class/pseudo-element patterns.
  argument?: string;
  // `caseSensitive` capture group of the attribute pattern.
  // NOTE(review): the pattern accepts [iIsS], which is wider than this type.
  caseSensitive?: 'i';
  /**
   * Set while placeholders are being restored to mark tokens whose capture
   * groups must be re-matched; removed again before the token is re-matched.
   *
   * @internal
   */
  __changed?: boolean;
 }
 /**
 * Default grammar: one global regular expression per token type.
 *
 * `tokenize` walks the entries in insertion order and lets each pattern claim
 * text from the still-untokenized pieces, so earlier entries take precedence
 * over later ones. Reordering the entries changes tokenization.
 *
 * The `¶` in the pseudo patterns is the placeholder `tokenize` writes over the
 * content of top-level parentheses before matching; the real argument is
 * restored and re-matched afterwards.
 */
 const TOKENS: Record<string, RegExp> = {
  [TokenType.Attribute]:
    /\[\s*(?:(?<namespace>(?:\\.|[-\w\P{ASCII}])+|\*)?\|)?(?<name>(?:\\.|[-\w\P{ASCII}])+)\s*(?:(?<operator>\W?=)\s*(?<value>.+?)\s*(\s(?<caseSensitive>[iIsS]))?\s*)?\]/gu,
  [TokenType.Id]: /#(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
  [TokenType.Class]: /\.(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
  [TokenType.Comma]: /\s*,\s*/g,
  // Also accepts `>>>` and `>>>>` in addition to the standard CSS combinators.
  [TokenType.Combinator]: /\s*(?:>{3,4}|[\s>+~])\s*/g,
  [TokenType.PseudoElement]:
    /::(?<name>(?:\\.|[-\w\P{ASCII}])+)(?:\((?<argument>¶+)\))?/gu,
  [TokenType.PseudoClass]:
    /:(?<name>(?:\\.|[-\w\P{ASCII}])+)(?:\((?<argument>¶+)\))?/gu,
  [TokenType.Universal]: /(?:(?<namespace>\*|(?:\\.|[-\w\P{ASCII}])*)\|)?\*/gu,
  [TokenType.Type]:
    /(?:(?<namespace>\*|(?:\\.|[-\w\P{ASCII}])*)\|)?(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
 };
 /**
 * Returns the pattern used to re-match a token once its real content has been
 * restored.
 *
 * For pseudo-classes and pseudo-elements a fresh RegExp is built in which the
 * `¶` placeholder argument is replaced by `.+`, so the restored argument can
 * be captured. Every other type uses its `TOKENS` entry unchanged.
 */
 const getArgumentPatternByType = (type: string) => {
  const takesArgument =
    type === TokenType.PseudoElement || type === TokenType.PseudoClass;
  if (!takesArgument) {
    return TOKENS[type];
  }
  const placeholderSource = TOKENS[type]!.source;
  const argumentSource = placeholderSource.replace(
    '(?<argument>¶+)',
    '(?<argument>.+)'
  );
  return new RegExp(argumentSource, 'gu');
 };
 /**
 * Asserts that every entry is a token and finalizes the tokens in place.
 *
 * Each token gets its `pos` range, computed from the running sum of the
 * untrimmed `content` lengths. Combinator and comma tokens then have their
 * `content` trimmed, with a pure-whitespace match collapsing to a single
 * space.
 *
 * @throws Error when a leftover string (text no pattern matched) is found;
 * the message reports the text and the offset at which it starts.
 */
 function assertTokenArray(
  tokens: Array<Token | string>
 ): asserts tokens is Token[] {
  let end = 0;
  for (const entry of tokens) {
    if (typeof entry === 'string') {
      throw new Error(`Unexpected sequence ${entry} found at index ${end}`);
    }
    // Positions are based on the raw match, so measure before trimming.
    const start = end;
    end += entry.content.length;
    entry.pos = [start, end];
    const isSeparator =
      entry.type === TokenType.Combinator || entry.type === TokenType.Comma;
    if (isSeparator) {
      entry.content = entry.content.trim() || ' ';
    }
  }
 }
 /**
 * Splits a selector into a flat, ordered list of tokens.
 *
 * The work happens in four steps:
 *
 * 1. Quoted strings are overwritten with `§` and the content of top-level
 *    parentheses with `¶`, so their content cannot confuse the grammar
 *    patterns. The original text is remembered for step 4.
 * 2. Each grammar pattern, in insertion order, claims text from the pieces
 *    that are still untokenized.
 * 3. Any text left unclaimed is reported as an error and token positions are
 *    assigned.
 * 4. The remembered text is written back into the tokens and every token whose
 *    content changed is re-matched so its capture groups reflect real content.
 *
 * @param selector - Selector text; leading and trailing whitespace is ignored.
 * @param grammar - Map from token type to a global RegExp. Defaults to
 * `TOKENS`.
 * @returns The tokens in source order, or `[]` for an empty selector.
 * @throws Error if part of the selector is not matched by any pattern.
 */
 export function tokenize(selector: string, grammar = TOKENS): Token[] {
  if (!selector) {
    return [];
  }
  selector = selector.trim();
  type Replacement = {value: string; offset: number};
  const replacements: Replacement[] = [];
  // Replace strings with placeholder
  {
    interface State {
      escaped: boolean;
      quoted?: string;
      offset: number;
    }
    const state: State = {escaped: false, offset: 0};
    for (let i = 0; i < selector.length; ++i) {
      if (state.escaped) {
        // A backslash escapes exactly one character. Clear the flag so the
        // scan resumes afterwards instead of skipping the rest of the input.
        state.escaped = false;
        continue;
      }
      switch (selector[i]) {
        case '\\':
          state.escaped = true;
          break;
        case '"':
        case "'": {
          if (!state.quoted) {
            state.quoted = selector[i];
            state.offset = i;
            continue;
          }
          const quote = state.quoted;
          if (quote !== selector[i]) {
            continue;
          }
          delete state.quoted;
          const offset = state.offset;
          const value = selector.slice(state.offset, i + 1);
          replacements.push({value, offset});
          // Same length as the original, so recorded offsets stay valid.
          const replacement = `${quote}${'§'.repeat(value.length - 2)}${quote}`;
          selector =
            selector.slice(0, offset) +
            replacement +
            selector.slice(offset + value.length);
          break;
        }
      }
    }
  }
  // Replace parentheses with placeholder
  {
    interface State {
      escaped: boolean;
      nesting: number;
      offset: number;
    }
    const state: State = {escaped: false, nesting: 0, offset: 0};
    for (let i = 0; i < selector.length; ++i) {
      if (state.escaped) {
        // See above: the escape covers only the character after the backslash.
        state.escaped = false;
        continue;
      }
      switch (selector[i]) {
        case '\\':
          state.escaped = true;
          break;
        case '(':
          if (++state.nesting !== 1) {
            continue;
          }
          state.offset = i;
          break;
        case ')': {
          if (--state.nesting !== 0) {
            continue;
          }
          const {offset} = state;
          const value = selector.slice(offset, i + 1);
          replacements.push({value, offset});
          const replacement = `(${'¶'.repeat(value.length - 2)})`;
          selector =
            selector.slice(0, offset) +
            replacement +
            selector.slice(offset + value.length);
          break;
        }
      }
    }
  }
  // Our goal here is basically try each token type on the selector, keeping
  // track of order. Hopefully by the end, we have an array of tokens.
  const tokens: Array<Token | string> = [selector];
  for (const [type, pattern] of Object.entries(grammar)) {
    for (let i = 0; i < tokens.length; i++) {
      const token = tokens[i];
      if (typeof token !== 'string') {
        continue;
      }
      // The patterns are global, so reset the cursor before every use.
      pattern.lastIndex = 0;
      const match = pattern.exec(token);
      if (!match) {
        continue;
      }
      const from = match.index - 1;
      const args: Array<Token | string> = [];
      const content = match[0];
      const before = token.slice(0, from + 1);
      if (before) {
        args.push(before);
      }
      args.push({
        ...(match.groups as unknown as Token),
        type,
        content,
      });
      const after = token.slice(from + content.length + 1);
      if (after) {
        // The remainder is visited on a later iteration of this loop, so the
        // same pattern can match again further along.
        args.push(after);
      }
      tokens.splice(i, 1, ...args);
    }
  }
  assertTokenArray(tokens);
  // Replace placeholders in reverse order.
  for (const replacement of replacements.reverse()) {
    for (const token of tokens) {
      const {offset, value} = replacement;
      if (!(token.pos[0] <= offset && offset + value.length <= token.pos[1])) {
        continue;
      }
      const {content} = token;
      const tokenOffset = offset - token.pos[0];
      token.content =
        content.slice(0, tokenOffset) +
        value +
        content.slice(tokenOffset + value.length);
      // One token can receive several replacements (e.g. a quoted string
      // inside parentheses). Only ever raise the flag: a later no-op
      // replacement must not cancel an earlier real change.
      if (token.content !== content) {
        token.__changed = true;
      }
    }
  }
  // Rematch tokens with changed content.
  for (const token of tokens) {
    if (!token.__changed) {
      continue;
    }
    delete token.__changed;
    const pattern = getArgumentPatternByType(token.type);
    assert(pattern);
    pattern.lastIndex = 0;
    const match = pattern.exec(token.content);
    assert(match);
    Object.assign(token, match.groups);
  }
  return tokens;
 }
--- a/test/src/tokenizer.spec.ts
+++ b/test/src/tokenizer.spec.ts
@ -1,86 +0,0 @@
 /**
 * Copyright 2023 Google Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import expect from 'expect';
 import {tokenize} from 'puppeteer-core/internal/injected/PSelectorTokenizer.js';
 describe('PSelectorTokenizer', () => {
  // Tokens are compared in serialized form so that the exact set and order of
  // fields on every token is pinned, not just their values.
  const tokenizeToJson = (selector: string): string => {
    return JSON.stringify(tokenize(selector));
  };
  it('should work', () => {
    const actual = tokenizeToJson('#foo');
    expect(actual).toStrictEqual(
      '[{"name":"foo","type":"id","content":"#foo","pos":[0,4]}]'
    );
  });
  it('should work with empty selectors', () => {
    const actual = tokenizeToJson('');
    expect(actual).toStrictEqual('[]');
  });
  it('should work with multiple strings', () => {
    const actual = tokenizeToJson(
      '[data-test-id^="test-"]:not([data-test-id^="test-foo"])'
    );
    expect(actual).toStrictEqual(
      '[{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[0,23]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[23,55]}]'
    );
  });
  it('should work with multiple parentheses', () => {
    const actual = tokenizeToJson(
      '[data-test-id^="test-"]:not([data-test-id^="test-foo"]) [data-test-id^="test-"]:not([data-test-id^="test-foo"])'
    );
    expect(actual).toStrictEqual(
      '[{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[0,23]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[23,55]},{"type":"combinator","content":" ","pos":[55,56]},{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[56,79]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[79,111]}]'
    );
  });
  it('should work with CSS escapes', () => {
    const actual = tokenizeToJson('.mb-\\[max\\(-70\\%\\2c -23rem\\)\\]');
    expect(actual).toStrictEqual(
      '[{"name":"mb-\\\\[max\\\\(-70\\\\%\\\\2c","type":"class","content":".mb-\\\\[max\\\\(-70\\\\%\\\\2c","pos":[0,19]},{"type":"combinator","content":" ","pos":[19,20]},{"name":"-23rem\\\\)\\\\]","type":"type","content":"-23rem\\\\)\\\\]","pos":[20,30]}]'
    );
  });
  it('should work with complex selectors', () => {
    const actual = tokenizeToJson('a > b, c ~ d, a+b, e ::before ::after(a)');
    expect(actual).toStrictEqual(
      '[{"name":"a","type":"type","content":"a","pos":[0,1]},{"type":"combinator","content":">","pos":[1,4]},{"name":"b","type":"type","content":"b","pos":[4,5]},{"type":"comma","content":",","pos":[5,7]},{"name":"c","type":"type","content":"c","pos":[7,8]},{"type":"combinator","content":"~","pos":[8,11]},{"name":"d","type":"type","content":"d","pos":[11,12]},{"type":"comma","content":",","pos":[12,14]},{"name":"a","type":"type","content":"a","pos":[14,15]},{"type":"combinator","content":"+","pos":[15,16]},{"name":"b","type":"type","content":"b","pos":[16,17]},{"type":"comma","content":",","pos":[17,19]},{"name":"e","type":"type","content":"e","pos":[19,20]},{"type":"combinator","content":" ","pos":[20,21]},{"name":"before","type":"pseudo-element","content":"::before","pos":[21,29]},{"type":"combinator","content":" ","pos":[29,30]},{"name":"after","argument":"a","type":"pseudo-element","content":"::after(a)","pos":[30,40]}]'
    );
  });
  it('should throw with invalid selectors', () => {
    // Unterminated bracket, unterminated parenthesis, lone opening bracket.
    const invalidSelectors = ['a[b', 'a(b', '['];
    for (const invalid of invalidSelectors) {
      expect(() => {
        tokenize(invalid);
      }).toThrow();
    }
  });
  it('should work with universal selectors', () => {
    const actual = tokenizeToJson('* > *');
    expect(actual).toStrictEqual(
      '[{"type":"universal","content":"*","pos":[0,1]},{"type":"combinator","content":">","pos":[1,4]},{"type":"universal","content":"*","pos":[4,5]}]'
    );
  });
 });