mirror of
https://github.com/puppeteer/puppeteer
synced 2024-06-14 14:02:48 +00:00
chore: use custom tokenizer (#9837)
This commit is contained in:
parent
fd2f90008d
commit
dd16ecae03
15
package-lock.json
generated
15
package-lock.json
generated
@ -7004,12 +7004,6 @@
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/parsel-js": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "git+ssh://git@github.com/jrandolf/parsel.git#a52d21af14c0a8db4e17290d62500f555d5e183e",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/path-exists": {
|
||||
"version": "4.0.0",
|
||||
"license": "MIT",
|
||||
@ -9242,8 +9236,7 @@
|
||||
"ws": "8.12.1"
|
||||
},
|
||||
"devDependencies": {
|
||||
"mitt": "3.0.0",
|
||||
"parsel-js": "github:jrandolf/parsel"
|
||||
"mitt": "3.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14.14.0"
|
||||
@ -13822,11 +13815,6 @@
|
||||
"lines-and-columns": "^1.1.6"
|
||||
}
|
||||
},
|
||||
"parsel-js": {
|
||||
"version": "git+ssh://git@github.com/jrandolf/parsel.git#a52d21af14c0a8db4e17290d62500f555d5e183e",
|
||||
"dev": true,
|
||||
"from": "parsel-js@github:jrandolf/parsel"
|
||||
},
|
||||
"path-exists": {
|
||||
"version": "4.0.0"
|
||||
},
|
||||
@ -13992,7 +13980,6 @@
|
||||
"extract-zip": "2.0.1",
|
||||
"https-proxy-agent": "5.0.1",
|
||||
"mitt": "3.0.0",
|
||||
"parsel-js": "github:jrandolf/parsel",
|
||||
"proxy-from-env": "1.1.0",
|
||||
"rimraf": "4.4.0",
|
||||
"tar-fs": "2.1.1",
|
||||
|
@ -152,7 +152,6 @@
|
||||
}
|
||||
},
|
||||
"devDependencies": {
|
||||
"mitt": "3.0.0",
|
||||
"parsel-js": "github:jrandolf/parsel"
|
||||
"mitt": "3.0.0"
|
||||
}
|
||||
}
|
||||
|
@ -14,7 +14,7 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import {tokenize, Tokens, TOKENS} from 'parsel-js';
|
||||
import {Token, tokenize, TokenType} from './PSelectorTokenizer.js';
|
||||
|
||||
export type CSSSelector = string;
|
||||
export type PPseudoSelector = {
|
||||
@ -29,13 +29,8 @@ export type CompoundPSelector = Array<CSSSelector | PPseudoSelector>;
|
||||
export type ComplexPSelector = Array<CompoundPSelector | PCombinator>;
|
||||
export type ComplexPSelectorList = ComplexPSelector[];
|
||||
|
||||
TOKENS['combinator'] = new RegExp(
|
||||
`${/\s*(?:>{3,4})\s*|/.source}${TOKENS['combinator']!.source}`,
|
||||
'g'
|
||||
);
|
||||
|
||||
class TokenSpan {
|
||||
#tokens: Tokens[] = [];
|
||||
#tokens: Token[] = [];
|
||||
#selector: string;
|
||||
|
||||
constructor(selector: string) {
|
||||
@ -46,13 +41,13 @@ class TokenSpan {
|
||||
return this.#tokens.length;
|
||||
}
|
||||
|
||||
add(token: Tokens) {
|
||||
add(token: Token) {
|
||||
this.#tokens.push(token);
|
||||
}
|
||||
|
||||
toStringAndClear() {
|
||||
const startToken = this.#tokens[0] as Tokens;
|
||||
const endToken = this.#tokens[this.#tokens.length - 1] as Tokens;
|
||||
const startToken = this.#tokens[0] as Token;
|
||||
const endToken = this.#tokens[this.#tokens.length - 1] as Token;
|
||||
this.#tokens.splice(0);
|
||||
return this.#selector.slice(startToken.pos[0], endToken.pos[1]);
|
||||
}
|
||||
@ -89,9 +84,9 @@ export function parsePSelectors(
|
||||
const storage = new TokenSpan(selector);
|
||||
for (const token of tokens) {
|
||||
switch (token.type) {
|
||||
case 'combinator':
|
||||
case TokenType.Combinator:
|
||||
switch (token.content) {
|
||||
case '>>>':
|
||||
case PCombinator.Descendent:
|
||||
isPureCSS = false;
|
||||
if (storage.length) {
|
||||
compoundSelector.push(storage.toStringAndClear());
|
||||
@ -100,7 +95,7 @@ export function parsePSelectors(
|
||||
complexSelector.push(PCombinator.Descendent);
|
||||
complexSelector.push(compoundSelector);
|
||||
continue;
|
||||
case '>>>>':
|
||||
case PCombinator.Child:
|
||||
isPureCSS = false;
|
||||
if (storage.length) {
|
||||
compoundSelector.push(storage.toStringAndClear());
|
||||
@ -111,7 +106,7 @@ export function parsePSelectors(
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
case 'pseudo-element':
|
||||
case TokenType.PseudoElement:
|
||||
if (!token.name.startsWith('-p-')) {
|
||||
break;
|
||||
}
|
||||
@ -124,7 +119,7 @@ export function parsePSelectors(
|
||||
value: unquote(token.argument ?? ''),
|
||||
});
|
||||
continue;
|
||||
case 'comma':
|
||||
case TokenType.Comma:
|
||||
if (storage.length) {
|
||||
compoundSelector.push(storage.toStringAndClear());
|
||||
}
|
||||
|
272
packages/puppeteer-core/src/injected/PSelectorTokenizer.ts
Normal file
272
packages/puppeteer-core/src/injected/PSelectorTokenizer.ts
Normal file
@ -0,0 +1,272 @@
|
||||
/**
|
||||
* Copyright (c) 2020 Lea Verou
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
import {assert} from '../util/assert.js';
|
||||
|
||||
/**
 * The kinds of tokens produced by `tokenize`. Each member's string value is
 * stored as the `type` discriminant on the emitted {@link Token}.
 *
 * NOTE(review): `const enum` members are inlined at use sites and are
 * incompatible with `isolatedModules`/single-file transpilers — confirm the
 * build pipeline performs whole-program compilation.
 */
export const enum TokenType {
  Class = 'class',
  Attribute = 'attribute',
  Id = 'id',
  Type = 'type',
  Universal = 'universal',
  PseudoElement = 'pseudo-element',
  PseudoClass = 'pseudo-class',
  Comma = 'comma',
  Combinator = 'combinator',
}
|
||||
|
||||
/**
 * A single token of a CSS selector, modeled after parsel-js's token shape.
 */
export interface Token {
  // Token kind; one of the TokenType string values.
  type: string;
  // The exact text this token covers in the input selector.
  content: string;
  // Name captured by the grammar pattern (class/id/type/pseudo name, or
  // attribute name for attribute tokens).
  name: string;
  namespace?: string;
  // Raw attribute value, including surrounding quotes when quoted.
  value?: string;
  // [start, end) offsets of `content` within the (trimmed) input selector.
  pos: [number, number];
  // Attribute comparison operator, e.g. `=`, `^=`, `~=`.
  operator?: string;
  // Argument text of a pseudo-class/element, without the parentheses.
  argument?: string;
  // NOTE(review): the attribute pattern captures `[iIsS]`, so values other
  // than 'i' ('I', 's', 'S') can occur at runtime — confirm this type.
  caseSensitive?: 'i';
  /**
   * @internal
   */
  __changed?: boolean;
}
|
||||
|
||||
/**
 * Default token grammar. Iteration (declaration) order matters: patterns are
 * tried in this order by `tokenize`, so the more specific patterns
 * (attribute, id, class) must precede the catch-alls (universal, type).
 *
 * `¶` is the placeholder character `tokenize` substitutes for parenthesized
 * argument text before matching, so the pseudo patterns match `¶+` rather
 * than arbitrary nested content.
 */
const TOKENS: Record<string, RegExp> = {
  [TokenType.Attribute]:
    /\[\s*(?:(?<namespace>(?:\\.|[-\w\P{ASCII}])+|\*)?\|)?(?<name>(?:\\.|[-\w\P{ASCII}])+)\s*(?:(?<operator>\W?=)\s*(?<value>.+?)\s*(\s(?<caseSensitive>[iIsS]))?\s*)?\]/gu,
  [TokenType.Id]: /#(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
  [TokenType.Class]: /\.(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
  [TokenType.Comma]: /\s*,\s*/g,
  // Extended combinator set: also matches the deep combinators `>>>`/`>>>>`.
  [TokenType.Combinator]: /\s*(?:>{3,4}|[\s>+~])\s*/g,
  [TokenType.PseudoElement]:
    /::(?<name>(?:\\.|[-\w\P{ASCII}])+)(?:\((?<argument>¶+)\))?/gu,
  [TokenType.PseudoClass]:
    /:(?<name>(?:\\.|[-\w\P{ASCII}])+)(?:\((?<argument>¶+)\))?/gu,
  [TokenType.Universal]: /(?:(?<namespace>\*|(?:\\.|[-\w\P{ASCII}])*)\|)?\*/gu,
  [TokenType.Type]:
    /(?:(?<namespace>\*|(?:\\.|[-\w\P{ASCII}])*)\|)?(?<name>(?:\\.|[-\w\P{ASCII}])+)/gu,
};
|
||||
|
||||
const getArgumentPatternByType = (type: string) => {
|
||||
switch (type) {
|
||||
case TokenType.PseudoElement:
|
||||
case TokenType.PseudoClass:
|
||||
return new RegExp(
|
||||
TOKENS[type]!.source.replace('(?<argument>¶+)', '(?<argument>.+)'),
|
||||
'gu'
|
||||
);
|
||||
default:
|
||||
return TOKENS[type];
|
||||
}
|
||||
};
|
||||
|
||||
function assertTokenArray(
|
||||
tokens: Array<Token | string>
|
||||
): asserts tokens is Token[] {
|
||||
let offset = 0;
|
||||
for (const token of tokens) {
|
||||
switch (typeof token) {
|
||||
case 'string':
|
||||
throw new Error(
|
||||
`Unexpected sequence ${token} found at index ${offset}`
|
||||
);
|
||||
case 'object':
|
||||
offset += token.content.length;
|
||||
token.pos = [offset - token.content.length, offset];
|
||||
switch (token.type) {
|
||||
case TokenType.Combinator:
|
||||
case TokenType.Comma:
|
||||
token.content = token.content.trim() || ' ';
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export function tokenize(selector: string, grammar = TOKENS): Token[] {
|
||||
if (!selector) {
|
||||
return [];
|
||||
}
|
||||
selector = selector.trim();
|
||||
|
||||
type Replacement = {value: string; offset: number};
|
||||
const replacements: Replacement[] = [];
|
||||
|
||||
// Replace strings with placeholder
|
||||
{
|
||||
interface State {
|
||||
escaped: boolean;
|
||||
quoted?: string;
|
||||
offset: number;
|
||||
}
|
||||
const state: State = {escaped: false, offset: 0};
|
||||
for (let i = 0; i < selector.length; ++i) {
|
||||
if (state.escaped) {
|
||||
continue;
|
||||
}
|
||||
switch (selector[i]) {
|
||||
case '\\':
|
||||
state.escaped = true;
|
||||
break;
|
||||
case '"':
|
||||
case "'": {
|
||||
if (!state.quoted) {
|
||||
state.quoted = selector[i];
|
||||
state.offset = i;
|
||||
continue;
|
||||
}
|
||||
const quote = state.quoted;
|
||||
if (quote !== selector[i]) {
|
||||
continue;
|
||||
}
|
||||
delete state.quoted;
|
||||
const offset = state.offset;
|
||||
const value = selector.slice(state.offset, i + 1);
|
||||
replacements.push({value, offset});
|
||||
const replacement = `${quote}${'§'.repeat(value.length - 2)}${quote}`;
|
||||
selector =
|
||||
selector.slice(0, offset) +
|
||||
replacement +
|
||||
selector.slice(offset + value.length);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Replace parentheses with placeholder
|
||||
{
|
||||
interface State {
|
||||
escaped: boolean;
|
||||
nesting: number;
|
||||
offset: number;
|
||||
}
|
||||
const state: State = {escaped: false, nesting: 0, offset: 0};
|
||||
for (let i = 0; i < selector.length; ++i) {
|
||||
if (state.escaped) {
|
||||
continue;
|
||||
}
|
||||
switch (selector[i]) {
|
||||
case '\\':
|
||||
state.escaped = true;
|
||||
break;
|
||||
case '(':
|
||||
if (++state.nesting !== 1) {
|
||||
continue;
|
||||
}
|
||||
state.offset = i;
|
||||
break;
|
||||
case ')': {
|
||||
if (--state.nesting !== 0) {
|
||||
continue;
|
||||
}
|
||||
const {offset} = state;
|
||||
const value = selector.slice(offset, i + 1);
|
||||
replacements.push({value, offset});
|
||||
const replacement = `(${'¶'.repeat(value.length - 2)})`;
|
||||
selector =
|
||||
selector.slice(0, offset) +
|
||||
replacement +
|
||||
selector.slice(offset + value.length);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Our goal here is basically try each token type on the selector, keeping
|
||||
// track of order. Hopefully by the end, we have an array of tokens.
|
||||
const tokens: Array<Token | string> = [selector];
|
||||
for (const [type, pattern] of Object.entries(grammar)) {
|
||||
for (let i = 0; i < tokens.length; i++) {
|
||||
const token = tokens[i];
|
||||
if (typeof token !== 'string') {
|
||||
continue;
|
||||
}
|
||||
|
||||
pattern.lastIndex = 0;
|
||||
const match = pattern.exec(token);
|
||||
if (!match) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const from = match.index - 1;
|
||||
const args: Array<Token | string> = [];
|
||||
const content = match[0];
|
||||
|
||||
const before = token.slice(0, from + 1);
|
||||
if (before) {
|
||||
args.push(before);
|
||||
}
|
||||
|
||||
args.push({
|
||||
...(match.groups as unknown as Token),
|
||||
type,
|
||||
content,
|
||||
});
|
||||
|
||||
const after = token.slice(from + content.length + 1);
|
||||
if (after) {
|
||||
args.push(after);
|
||||
}
|
||||
|
||||
tokens.splice(i, 1, ...args);
|
||||
}
|
||||
}
|
||||
assertTokenArray(tokens);
|
||||
|
||||
// Replace placeholders in reverse order.
|
||||
for (const replacement of replacements.reverse()) {
|
||||
for (const token of tokens) {
|
||||
const {offset, value} = replacement;
|
||||
if (!(token.pos[0] <= offset && offset + value.length <= token.pos[1])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const {content} = token;
|
||||
const tokenOffset = offset - token.pos[0];
|
||||
token.content =
|
||||
content.slice(0, tokenOffset) +
|
||||
value +
|
||||
content.slice(tokenOffset + value.length);
|
||||
token.__changed = token.content !== content;
|
||||
}
|
||||
}
|
||||
|
||||
// Rematch tokens with changed content.
|
||||
for (const token of tokens) {
|
||||
if (!token.__changed) {
|
||||
continue;
|
||||
}
|
||||
delete token.__changed;
|
||||
|
||||
const pattern = getArgumentPatternByType(token.type);
|
||||
assert(pattern);
|
||||
pattern.lastIndex = 0;
|
||||
const match = pattern.exec(token.content);
|
||||
assert(match);
|
||||
Object.assign(token, match.groups);
|
||||
}
|
||||
|
||||
return tokens;
|
||||
}
|
86
test/src/tokenizer.spec.ts
Normal file
86
test/src/tokenizer.spec.ts
Normal file
@ -0,0 +1,86 @@
|
||||
/**
|
||||
* Copyright 2023 Google Inc. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import expect from 'expect';
|
||||
import {tokenize} from 'puppeteer-core/internal/injected/PSelectorTokenizer.js';
|
||||
|
||||
// Unit tests for the custom P-selector tokenizer. Expectations compare the
// JSON serialization of the full token stream (type, content, captured
// groups, and [start, end) positions).
describe('PSelectorTokenizer', () => {
  // Basic smoke test: a lone id selector.
  it('should work', () => {
    expect(JSON.stringify(tokenize('#foo'))).toStrictEqual(
      '[{"name":"foo","type":"id","content":"#foo","pos":[0,4]}]'
    );
  });

  // The empty selector tokenizes to an empty token list, not an error.
  it('should work with empty selectors', () => {
    expect(JSON.stringify(tokenize(''))).toStrictEqual('[]');
  });

  // Quoted attribute values (masked with `§` during tokenization) must be
  // restored verbatim, including for multiple strings in one selector.
  it('should work with multiple strings', () => {
    expect(
      JSON.stringify(
        tokenize('[data-test-id^="test-"]:not([data-test-id^="test-foo"])')
      )
    ).toStrictEqual(
      '[{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[0,23]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[23,55]}]'
    );
  });

  // Multiple parenthesized pseudo-class arguments (masked with `¶`) must be
  // restored and re-matched independently.
  it('should work with multiple parentheses', () => {
    expect(
      JSON.stringify(
        tokenize(
          '[data-test-id^="test-"]:not([data-test-id^="test-foo"]) [data-test-id^="test-"]:not([data-test-id^="test-foo"])'
        )
      )
    ).toStrictEqual(
      '[{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[0,23]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[23,55]},{"type":"combinator","content":" ","pos":[55,56]},{"name":"data-test-id","operator":"^=","value":"\\"test-\\"","type":"attribute","content":"[data-test-id^=\\"test-\\"]","pos":[56,79]},{"name":"not","argument":"[data-test-id^=\\"test-foo\\"]","type":"pseudo-class","content":":not([data-test-id^=\\"test-foo\\"])","pos":[79,111]}]'
    );
  });

  // CSS escape sequences (e.g. Tailwind-style class names) must not be
  // treated as structural characters.
  it('should work with CSS escapes', () => {
    expect(
      JSON.stringify(tokenize('.mb-\\[max\\(-70\\%\\2c -23rem\\)\\]'))
    ).toStrictEqual(
      '[{"name":"mb-\\\\[max\\\\(-70\\\\%\\\\2c","type":"class","content":".mb-\\\\[max\\\\(-70\\\\%\\\\2c","pos":[0,19]},{"type":"combinator","content":" ","pos":[19,20]},{"name":"-23rem\\\\)\\\\]","type":"type","content":"-23rem\\\\)\\\\]","pos":[20,30]}]'
    );
  });

  // Combinators (with and without surrounding whitespace), commas, and
  // pseudo-elements in one complex selector list.
  it('should work with complex selectors', () => {
    expect(
      JSON.stringify(tokenize('a > b, c ~ d, a+b, e ::before ::after(a)'))
    ).toStrictEqual(
      '[{"name":"a","type":"type","content":"a","pos":[0,1]},{"type":"combinator","content":">","pos":[1,4]},{"name":"b","type":"type","content":"b","pos":[4,5]},{"type":"comma","content":",","pos":[5,7]},{"name":"c","type":"type","content":"c","pos":[7,8]},{"type":"combinator","content":"~","pos":[8,11]},{"name":"d","type":"type","content":"d","pos":[11,12]},{"type":"comma","content":",","pos":[12,14]},{"name":"a","type":"type","content":"a","pos":[14,15]},{"type":"combinator","content":"+","pos":[15,16]},{"name":"b","type":"type","content":"b","pos":[16,17]},{"type":"comma","content":",","pos":[17,19]},{"name":"e","type":"type","content":"e","pos":[19,20]},{"type":"combinator","content":" ","pos":[20,21]},{"name":"before","type":"pseudo-element","content":"::before","pos":[21,29]},{"type":"combinator","content":" ","pos":[29,30]},{"name":"after","argument":"a","type":"pseudo-element","content":"::after(a)","pos":[30,40]}]'
    );
  });

  // Unbalanced brackets/parentheses leave unmatched fragments, which the
  // tokenizer must reject with an error.
  it('should throw with invalid selectors', () => {
    expect(() => {
      tokenize('a[b');
    }).toThrow();
    expect(() => {
      tokenize('a(b');
    }).toThrow();
    expect(() => {
      tokenize('[');
    }).toThrow();
  });

  // Universal selector has no `name` capture; only type/content/pos.
  it('should work with universal selectors', () => {
    expect(JSON.stringify(tokenize('* > *'))).toStrictEqual(
      '[{"type":"universal","content":"*","pos":[0,1]},{"type":"combinator","content":">","pos":[1,4]},{"type":"universal","content":"*","pos":[4,5]}]'
    );
  });
});
|
Loading…
Reference in New Issue
Block a user