From f21cb0a2f47be3021c40f5125124a439e41bed7d Mon Sep 17 00:00:00 2001 From: Ilya Kantor Date: Wed, 4 Sep 2019 15:44:48 +0300 Subject: [PATCH 1/7] WIP --- .../01-regexp-introduction/article.md | 176 ++++++++---- .../02-regexp-character-classes/article.md | 189 ++++++++++++ .../love-html5-classes.svg | 0 .../1-find-time-hh-mm/solution.md | 6 - .../1-find-time-hh-mm/task.md | 8 - .../03-regexp-character-classes/article.md | 270 ------------------ .../03-regexp-unicode/article.md | 167 +++++++++++ .../1-start-end/solution.md | 3 +- .../1-start-end/task.md | 0 .../04-regexp-anchors/article.md | 52 ++++ .../05-regexp-multiline-mode/article.md | 87 ++++++ .../1-find-time-hh-mm/solution.md | 6 + .../1-find-time-hh-mm/task.md | 9 + .../06-regexp-boundary/article.md | 53 ++++ .../hello-java-boundaries.svg | 0 .../article.md | 2 +- .../1-find-range-1/solution.md | 0 .../1-find-range-1/task.md | 0 .../2-find-time-2-formats/solution.md | 0 .../2-find-time-2-formats/task.md | 0 .../article.md | 6 +- .../1-find-text-manydots/solution.md | 0 .../1-find-text-manydots/task.md | 0 .../2-find-html-colors-6hex/solution.md | 2 +- .../2-find-html-colors-6hex/task.md | 0 .../article.md | 2 +- .../1-lazy-greedy/solution.md | 0 .../1-lazy-greedy/task.md | 0 .../3-find-html-comments/solution.md | 0 .../3-find-html-comments/task.md | 0 .../4-find-html-tags-greedy-lazy/solution.md | 0 .../4-find-html-tags-greedy-lazy/task.md | 0 .../article.md | 0 .../witch_greedy1.svg | 0 .../witch_greedy2.svg | 0 .../witch_greedy3.svg | 0 .../witch_greedy4.svg | 0 .../witch_greedy5.svg | 0 .../witch_greedy6.svg | 0 .../witch_lazy3.svg | 0 .../witch_lazy4.svg | 0 .../witch_lazy5.svg | 0 .../witch_lazy6.svg | 0 .../1-find-webcolor-3-or-6/solution.md | 0 .../1-find-webcolor-3-or-6/task.md | 0 .../2-find-decimal-numbers/solution.md | 0 .../2-find-decimal-numbers/task.md | 0 .../5-parse-expression/solution.md | 0 .../5-parse-expression/task.md | 0 .../article.md | 0 .../regexp-nested-groups.svg | 0 
.../12-regexp-anchors/2-test-mac/solution.md | 21 -- .../12-regexp-anchors/2-test-mac/task.md | 20 -- .../12-regexp-anchors/article.md | 55 ---- .../article.md | 0 .../01-find-programming-language/solution.md | 0 .../01-find-programming-language/task.md | 0 .../02-find-matching-bbtags/solution.md | 2 +- .../02-find-matching-bbtags/task.md | 0 .../03-match-quoted-string/solution.md | 0 .../03-match-quoted-string/task.md | 0 .../04-match-exact-tag/solution.md | 0 .../04-match-exact-tag/task.md | 0 .../article.md | 0 .../13-regexp-multiline-mode/article.md | 75 ----- .../14-regexp-lookahead-lookbehind/article.md | 8 +- .../article.md | 10 +- .../article.md | 26 +- .../20-regexp-unicode/article.md | 89 ------ .../21-regexp-unicode-properties/article.md | 86 ------ 9-regular-expressions/index.md | 4 - 71 files changed, 707 insertions(+), 727 deletions(-) create mode 100644 9-regular-expressions/02-regexp-character-classes/article.md rename 9-regular-expressions/{03-regexp-character-classes => 02-regexp-character-classes}/love-html5-classes.svg (100%) delete mode 100644 9-regular-expressions/03-regexp-character-classes/1-find-time-hh-mm/solution.md delete mode 100644 9-regular-expressions/03-regexp-character-classes/1-find-time-hh-mm/task.md delete mode 100644 9-regular-expressions/03-regexp-character-classes/article.md create mode 100644 9-regular-expressions/03-regexp-unicode/article.md rename 9-regular-expressions/{12-regexp-anchors => 04-regexp-anchors}/1-start-end/solution.md (77%) rename 9-regular-expressions/{12-regexp-anchors => 04-regexp-anchors}/1-start-end/task.md (100%) create mode 100644 9-regular-expressions/04-regexp-anchors/article.md create mode 100644 9-regular-expressions/05-regexp-multiline-mode/article.md create mode 100644 9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/solution.md create mode 100644 9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/task.md create mode 100644 9-regular-expressions/06-regexp-boundary/article.md 
rename 9-regular-expressions/{03-regexp-character-classes => 06-regexp-boundary}/hello-java-boundaries.svg (100%) rename 9-regular-expressions/{04-regexp-escaping => 07-regexp-escaping}/article.md (96%) rename 9-regular-expressions/{05-regexp-character-sets-and-ranges => 08-regexp-character-sets-and-ranges}/1-find-range-1/solution.md (100%) rename 9-regular-expressions/{05-regexp-character-sets-and-ranges => 08-regexp-character-sets-and-ranges}/1-find-range-1/task.md (100%) rename 9-regular-expressions/{05-regexp-character-sets-and-ranges => 08-regexp-character-sets-and-ranges}/2-find-time-2-formats/solution.md (100%) rename 9-regular-expressions/{05-regexp-character-sets-and-ranges => 08-regexp-character-sets-and-ranges}/2-find-time-2-formats/task.md (100%) rename 9-regular-expressions/{05-regexp-character-sets-and-ranges => 08-regexp-character-sets-and-ranges}/article.md (97%) rename 9-regular-expressions/{07-regexp-quantifiers => 09-regexp-quantifiers}/1-find-text-manydots/solution.md (100%) rename 9-regular-expressions/{07-regexp-quantifiers => 09-regexp-quantifiers}/1-find-text-manydots/task.md (100%) rename 9-regular-expressions/{07-regexp-quantifiers => 09-regexp-quantifiers}/2-find-html-colors-6hex/solution.md (91%) rename 9-regular-expressions/{07-regexp-quantifiers => 09-regexp-quantifiers}/2-find-html-colors-6hex/task.md (100%) rename 9-regular-expressions/{07-regexp-quantifiers => 09-regexp-quantifiers}/article.md (97%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/1-lazy-greedy/solution.md (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/1-lazy-greedy/task.md (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/3-find-html-comments/solution.md (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/3-find-html-comments/task.md (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 
10-regexp-greedy-and-lazy}/4-find-html-tags-greedy-lazy/solution.md (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/4-find-html-tags-greedy-lazy/task.md (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/article.md (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/witch_greedy1.svg (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/witch_greedy2.svg (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/witch_greedy3.svg (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/witch_greedy4.svg (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/witch_greedy5.svg (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/witch_greedy6.svg (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/witch_lazy3.svg (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/witch_lazy4.svg (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/witch_lazy5.svg (100%) rename 9-regular-expressions/{08-regexp-greedy-and-lazy => 10-regexp-greedy-and-lazy}/witch_lazy6.svg (100%) rename 9-regular-expressions/{09-regexp-groups => 11-regexp-groups}/1-find-webcolor-3-or-6/solution.md (100%) rename 9-regular-expressions/{09-regexp-groups => 11-regexp-groups}/1-find-webcolor-3-or-6/task.md (100%) rename 9-regular-expressions/{09-regexp-groups => 11-regexp-groups}/2-find-decimal-numbers/solution.md (100%) rename 9-regular-expressions/{09-regexp-groups => 11-regexp-groups}/2-find-decimal-numbers/task.md (100%) rename 9-regular-expressions/{09-regexp-groups => 11-regexp-groups}/5-parse-expression/solution.md (100%) rename 9-regular-expressions/{09-regexp-groups => 
11-regexp-groups}/5-parse-expression/task.md (100%) rename 9-regular-expressions/{09-regexp-groups => 11-regexp-groups}/article.md (100%) rename 9-regular-expressions/{09-regexp-groups => 11-regexp-groups}/regexp-nested-groups.svg (100%) delete mode 100644 9-regular-expressions/12-regexp-anchors/2-test-mac/solution.md delete mode 100644 9-regular-expressions/12-regexp-anchors/2-test-mac/task.md delete mode 100644 9-regular-expressions/12-regexp-anchors/article.md rename 9-regular-expressions/{10-regexp-backreferences => 12-regexp-backreferences}/article.md (100%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/01-find-programming-language/solution.md (100%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/01-find-programming-language/task.md (100%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/02-find-matching-bbtags/solution.md (79%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/02-find-matching-bbtags/task.md (100%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/03-match-quoted-string/solution.md (100%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/03-match-quoted-string/task.md (100%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/04-match-exact-tag/solution.md (100%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/04-match-exact-tag/task.md (100%) rename 9-regular-expressions/{11-regexp-alternation => 13-regexp-alternation}/article.md (100%) delete mode 100644 9-regular-expressions/13-regexp-multiline-mode/article.md rename 9-regular-expressions/{22-regexp-sticky => 16-regexp-sticky}/article.md (83%) rename 9-regular-expressions/{02-regexp-methods => 17-regexp-methods}/article.md (93%) delete mode 100644 9-regular-expressions/20-regexp-unicode/article.md delete mode 100644 
9-regular-expressions/21-regexp-unicode-properties/article.md diff --git a/9-regular-expressions/01-regexp-introduction/article.md b/9-regular-expressions/01-regexp-introduction/article.md index 1632a930..5cbe011a 100644 --- a/9-regular-expressions/01-regexp-introduction/article.md +++ b/9-regular-expressions/01-regexp-introduction/article.md @@ -2,7 +2,7 @@ Regular expressions is a powerful way to search and replace in text. -In JavaScript, they are available as `RegExp` object, and also integrated in methods of strings. +In JavaScript, they are available as [RegExp](mdn:js/RegExp) object, and also integrated in methods of strings. ## Regular Expressions @@ -23,35 +23,43 @@ regexp = /pattern/; // no flags regexp = /pattern/gmi; // with flags g,m and i (to be covered soon) ``` -Slashes `"/"` tell JavaScript that we are creating a regular expression. They play the same role as quotes for strings. +Slashes `pattern:/.../` tell JavaScript that we are creating a regular expression. They play the same role as quotes for strings. -## Usage +In both cases `regexp` becomes an object of the built-in `RegExp` class. -To search inside a string, we can use method [search](mdn:js/String/search). +The main difference between these two syntaxes is that slashes `pattern:/.../` do not allow to insert expressions (like strings with `${...}`). They are fully static. -Here's an example: +Slashes are used when we know the regular expression at the code writing time -- and that's the most common situation. 
While `new RegExp` is used when we need to create a regexp "on the fly", from a dynamically generated string, for instance: -```js run -let str = "I love JavaScript!"; // will search here +```js +let tag = prompt("What tag do you want to find?", "h2"); -let regexp = /love/; -alert( str.search(regexp) ); // 2 +let regexp = new RegExp(`<${tag}>`); // same as /<h2>/ if answered "h2" in the prompt above ``` -The `str.search` method looks for the pattern `pattern:/love/` and returns the position inside the string. As we might guess, `pattern:/love/` is the simplest possible pattern. What it does is a simple substring search. +## Flags -The code above is the same as: +Regular expressions may have flags that affect the search. -```js run -let str = "I love JavaScript!"; // will search here +There are only 6 of them in JavaScript: -let substr = 'love'; -alert( str.search(substr) ); // 2 -``` +`pattern:i` +: With this flag the search is case-insensitive: no difference between `A` and `a` (see the example below). -So searching for `pattern:/love/` is the same as searching for `"love"`. +`pattern:g` +: With this flag the search looks for all matches, without it -- only the first one. -But that's only for now. Soon we'll create more complex regular expressions with much more searching power. +`pattern:m` +: Multiline mode (covered in the chapter <info:regexp-multiline-mode>). + +`pattern:s` +: Enables "dotall" mode, that allows a dot `pattern:.` to match newline character `\n` (covered in the chapter <info:regexp-character-classes>). + +`pattern:u` +: Enables full unicode support. The flag enables correct processing of surrogate pairs. More about that in the chapter <info:regexp-unicode>. 
+ +`pattern:y` +: "Sticky" mode: searching at the exact position in the text (covered in the chapter <info:regexp-sticky>) ```smart header="Colors" From here on the color scheme is: @@ -61,65 +69,109 @@ From here on the color scheme is: - result -- `match:green` ``` +## Searching: str.match -````smart header="When to use `new RegExp`?" -Normally we use the short syntax `/.../`. But it does not support variable insertions `${...}`. +As it was said previously, regular expressions are integrated with string methods. -On the other hand, `new RegExp` allows to construct a pattern dynamically from a string, so it's more flexible. +The method `str.match(regexp)` finds all matches of `regexp` in the string `str`. 
-Here's an example of a dynamically generated regexp: +It has 3 working modes: + +1. If the regular expression has flag `pattern:g`, it returns an array of all matches: + ```js run + let str = "We will, we will rock you"; + + alert( str.match(/we/gi) ); // We,we (an array of 2 matches) + ``` + Please note that both `match:We` and `match:we` are found, because flag `pattern:i` makes the regular expression case-insensitive. + +2. If there's no such flag it returns only the first match in the form of an array, with the full match at index `0` and some additional details in properties: + ```js run + let str = "We will, we will rock you"; + + let result = str.match(/we/i); // without flag g + + alert( result[0] ); // We (1st match) + alert( result.length ); // 1 + + // Details: + alert( result.index ); // 0 (position of the match) + alert( result.input ); // We will, we will rock you (source string) + ``` + The array may have other indexes, besides `0`, if a part of the regular expression is enclosed in parentheses. We'll cover that in the chapter <info:regexp-groups>. + +3. And, finally, if there are no matches, `null` is returned (doesn't matter if there's flag `pattern:g` or not). + + That's a very important nuance. If there are no matches, we get not an empty array, but `null`. Forgetting about that may lead to errors, e.g.: + + ```js run + let matches = "JavaScript".match(/HTML/); // = null + + if (!matches.length) { // Error: Cannot read property 'length' of null + alert("Error in the line above"); + } + ``` + + If we'd like the result to be always an array, we can write it this way: + + ```js run + let matches = "JavaScript".match(/HTML/)*!* || []*/!*; + + if (!matches.length) { + alert("No matches"); // now it works + } + ``` + +## Replacing: str.replace + +The method `str.replace(regexp, replacement)` replaces matches of `regexp` in the string `str` with `replacement` (all matches, if there's flag `pattern:g`, otherwise only the first one). 
+ +For instance: ```js run -let tag = prompt("Which tag you want to search?", "h2"); -let regexp = new RegExp(`<${tag}>`); +// no flag g +alert( "We will, we will".replace(/we/i, "I") ); // I will, we will -// finds <h2> by default -alert( "<h2></h2>".search(regexp)); +// with flag g +alert( "We will, we will".replace(/we/ig, "I") ); // I will, I will ``` -```` +The second argument is the `replacement` string. We can use special character combinations in it to insert fragments of the match: -## Flags +| Symbols | Action in the replacement string | +|--------|--------| +|`$&`|inserts the whole match| +|<code>$&#096;</code>|inserts a part of the string before the match| +|`$'`|inserts a part of the string after the match| +|`$n`|if `n` is a 1-2 digit number, then it inserts the contents of n-th parentheses, more about it in the chapter <info:regexp-groups>| +|`$<name>`|inserts the contents of the parentheses with the given `name`, more about it in the chapter <info:regexp-groups>| +|`$$`|inserts character `$` | -Regular expressions may have flags that affect the search. - -There are only 6 of them in JavaScript: - -`i` -: With this flag the search is case-insensitive: no difference between `A` and `a` (see the example below). - -`g` -: With this flag the search looks for all matches, without it -- only the first one (we'll see uses in the next chapter). - -`m` -: Multiline mode (covered in the chapter <info:regexp-multiline-mode>). - -`s` -: "Dotall" mode, allows `.` to match newlines (covered in the chapter <info:regexp-character-classes>). - -`u` -: Enables full unicode support. The flag enables correct processing of surrogate pairs. More about that in the chapter <info:regexp-unicode>. - -`y` -: Sticky mode (covered in the chapter <info:regexp-sticky>) - -We'll cover all these flags further in the tutorial. 
- -For now, the simplest flag is `i`, here's an example: +An example with `pattern:$&`: ```js run -let str = "I love JavaScript!"; - -alert( str.search(/LOVE/i) ); // 2 (found lowercased) - -alert( str.search(/LOVE/) ); // -1 (nothing found without 'i' flag) +alert( "I love HTML".replace(/HTML/, "$& and JavaScript") ); // I love HTML and JavaScript ``` -So the `i` flag already makes regular expressions more powerful than a simple substring search. But there's so much more. We'll cover other flags and features in the next chapters. 
+## Testing: regexp.test +The method `regexp.test(str)` looks for at least one match: if found, it returns `true`, otherwise `false`. + +```js run +let str = "I love JavaScript"; +let reg = /LOVE/i; + +alert( reg.test(str) ); // true +``` + +Further in this chapter we'll study more regular expressions, come across many other examples and also meet other methods. + +Full information about the methods is given in the article <info:regexp-methods>. ## Summary -- A regular expression consists of a pattern and optional flags: `g`, `i`, `m`, `u`, `s`, `y`. -- Without flags and special symbols that we'll study later, the search by a regexp is the same as a substring search. -- The method `str.search(regexp)` returns the index where the match is found or `-1` if there's no match. In the next chapter we'll see other methods. +- A regular expression consists of a pattern and optional flags: `pattern:g`, `pattern:i`, `pattern:m`, `pattern:u`, `pattern:s`, `pattern:y`. +- Without flags and special symbols that we'll study later, the search by a regexp is the same as a substring search. +- The method `str.match(regexp)` looks for matches: all of them if there's `pattern:g` flag, otherwise only the first one. +- The method `str.replace(regexp, replacement)` replaces matches with `regexp` by `replacement`: all of them if there's `pattern:g` flag, otherwise only the first one. +- The method `regexp.test(str)` returns `true` if there's at least one match, otherwise `false`. 
diff --git a/9-regular-expressions/02-regexp-character-classes/article.md b/9-regular-expressions/02-regexp-character-classes/article.md new file mode 100644 index 00000000..881b6ba2 --- /dev/null +++ b/9-regular-expressions/02-regexp-character-classes/article.md @@ -0,0 +1,189 @@ +# Character classes + +Consider a practical task -- we have a phone number like `"+7(903)-123-45-67"`, and we need to turn it into pure numbers: `79031234567`. + +To do so, we can find and remove anything that's not a number. 
Character classes can help with that. + +A *character class* is a special notation that matches any symbol from a certain set. + +To start, let's explore the "digit" class. It's written as `pattern:\d` and corresponds to "any single digit". + +For instance, let's find the first digit in the phone number: + +```js run +let str = "+7(903)-123-45-67"; + +let reg = /\d/; + +alert( str.match(reg) ); // 7 +``` + +Without the flag `pattern:g`, the regular expression only looks for the first match, that is the first digit `pattern:\d`. + +Let's add the `pattern:g` flag to find all digits: + +```js run +let str = "+7(903)-123-45-67"; + +let reg = /\d/g; + +alert( str.match(reg) ); // array of matches: 7,9,0,3,1,2,3,4,5,6,7 + +// let's make the digits-only phone number of them: +alert( str.match(reg).join('') ); // 79031234567 +``` + +That was a character class for digits. There are other character classes as well. + +Most used are: + +`pattern:\d` ("d" is from "digit") +: A digit: a character from `0` to `9`. + +`pattern:\s` ("s" is from "space") +: A space symbol: includes spaces, tabs `\t`, newlines `\n` and a few other rare characters: `\v`, `\f` and `\r`. + +`pattern:\w` ("w" is from "word") +: A "wordly" character: either a letter of the Latin alphabet or a digit or an underscore `_`. Non-Latin letters (like cyrillic or hindi) do not belong to `pattern:\w`. + +For instance, `pattern:\d\s\w` means a "digit" followed by a "space character" followed by a "wordly character", such as `match:1 a`. 
+ +**A regexp may contain both regular symbols and character classes.** + +For instance, `pattern:CSS\d` matches a string `match:CSS` with a digit after it: + +```js run +let str = "Is there CSS4?"; +let reg = /CSS\d/ + +alert( str.match(reg) ); // CSS4 +``` + +Also we can use many character classes: + +```js run +alert( "I love HTML5!".match(/\s\w\w\w\w\d/) ); // ' HTML5' +``` + +The match (each regexp character class has the corresponding result character): + +![](love-html5-classes.svg) + +## Inverse classes + +For every character class there exists an "inverse class", denoted with the same letter, but uppercased. + +The "inverse" means that it matches all other characters, for instance: + +`pattern:\D` +: Non-digit: any character except `pattern:\d`, for instance a letter. + +`pattern:\S` +: Non-space: any character except `pattern:\s`, for instance a letter. + +`pattern:\W` +: Non-wordly character: anything but `pattern:\w`, e.g. a non-Latin letter or a space. + +In the beginning of the chapter we saw how to make a number-only phone number from a string like `subject:+7(903)-123-45-67`: find all digits and join them. + +```js run +let str = "+7(903)-123-45-67"; + +alert( str.match(/\d/g).join('') ); // 79031234567 +``` + +An alternative, shorter way is to find non-digits `pattern:\D` and remove them from the string: + +```js run +let str = "+7(903)-123-45-67"; + +alert( str.replace(/\D/g, "") ); // 79031234567 +``` + +## A dot is any character + +A dot `pattern:.` is a special character class that matches "any character except a newline". + +For instance: + +```js run +alert( "Z".match(/./) ); // Z +``` + +Or in the middle of a regexp: + +```js run +let reg = /CS.4/; + +alert( "CSS4".match(reg) ); // CSS4 +alert( "CS-4".match(reg) ); // CS-4 +alert( "CS 4".match(reg) ); // CS 4 (space is also a character) +``` + +Please note that a dot means "any character", but not the "absence of a character". 
There must be a character to match it: + +```js run +alert( "CS4".match(/CS.4/) ); // null, no match because there's no character for the dot +``` + +### Dot as literally any character with "s" flag + +Usually a dot doesn't match a newline character `\n`. + +For instance, the regexp `pattern:A.B` matches `match:A`, and then `match:B` with any character between them, except a newline `\n`: + +```js run +alert( "A\nB".match(/A.B/) ); // null (no match) +``` + +There are many situations when we'd like a dot to mean literally "any character", newline included. + +That's what flag `pattern:s` does. If a regexp has it, then a dot `pattern:.` matches literally any character: + +```js run +alert( "A\nB".match(/A.B/s) ); // A\nB (match!) +``` + +````warn header="Pay attention to spaces" +Usually we pay little attention to spaces. For us strings `subject:1-5` and `subject:1 - 5` are nearly identical. + +But if a regexp doesn't take spaces into account, it may fail to work. + +Let's try to find digits separated by a hyphen: + +```js run +alert( "1 - 5".match(/\d-\d/) ); // null, no match! +``` + +Let's fix it adding spaces into the regexp `pattern:\d - \d`: + +```js run +alert( "1 - 5".match(/\d - \d/) ); // 1 - 5, now it works +// or we can use \s class: +alert( "1 - 5".match(/\d\s-\s\d/) ); // 1 - 5, also works +``` + +**A space is a character. Equal in importance with any other character.** + +We can't add or remove spaces from a regular expression and expect to work the same. + +In other words, in a regular expression all characters matter, spaces too. +```` + +## Summary + +There exist following character classes: + +- `pattern:\d` -- digits. +- `pattern:\D` -- non-digits. +- `pattern:\s` -- space symbols, tabs, newlines. +- `pattern:\S` -- all but `pattern:\s`. +- `pattern:\w` -- Latin letters, digits, underscore `'_'`. +- `pattern:\W` -- all but `pattern:\w`. +- `pattern:.` -- any character if with the regexp `'s'` flag, otherwise any except a newline `\n`. 
+ +...But that's not all! + +Unicode encoding, used by JavaScript for strings, provides many properties for characters, like: which language the letter belongs to (if it's a letter), whether it's a punctuation sign, etc. + +We can search by these properties as well. That requires flag `pattern:u`, covered in the next article. diff --git a/9-regular-expressions/03-regexp-character-classes/love-html5-classes.svg b/9-regular-expressions/02-regexp-character-classes/love-html5-classes.svg similarity index 100% rename from 9-regular-expressions/03-regexp-character-classes/love-html5-classes.svg rename to 9-regular-expressions/02-regexp-character-classes/love-html5-classes.svg diff --git a/9-regular-expressions/03-regexp-character-classes/1-find-time-hh-mm/solution.md b/9-regular-expressions/03-regexp-character-classes/1-find-time-hh-mm/solution.md deleted file mode 100644 index 829eda13..00000000 --- a/9-regular-expressions/03-regexp-character-classes/1-find-time-hh-mm/solution.md +++ /dev/null @@ -1,6 +0,0 @@ - -The answer: `pattern:\b\d\d:\d\d\b`. - -```js run -alert( "Breakfast at 09:00 in the room 123:456.".match( /\b\d\d:\d\d\b/ ) ); // 09:00 -``` diff --git a/9-regular-expressions/03-regexp-character-classes/1-find-time-hh-mm/task.md b/9-regular-expressions/03-regexp-character-classes/1-find-time-hh-mm/task.md deleted file mode 100644 index 5e32b9c4..00000000 --- a/9-regular-expressions/03-regexp-character-classes/1-find-time-hh-mm/task.md +++ /dev/null @@ -1,8 +0,0 @@ -# Find the time - -The time has a format: `hours:minutes`. Both hours and minutes has two digits, like `09:00`. - -Make a regexp to find time in the string: `subject:Breakfast at 09:00 in the room 123:456.` - -P.S. In this task there's no need to check time correctness yet, so `25:99` can also be a valid result. -P.P.S. The regexp shouldn't match `123:456`. 
diff --git a/9-regular-expressions/03-regexp-character-classes/article.md b/9-regular-expressions/03-regexp-character-classes/article.md deleted file mode 100644 index 8e18df91..00000000 --- a/9-regular-expressions/03-regexp-character-classes/article.md +++ /dev/null @@ -1,270 +0,0 @@ -# Character classes - -Consider a practical task -- we have a phone number `"+7(903)-123-45-67"`, and we need to turn it into pure numbers: `79035419441`. - -To do so, we can find and remove anything that's not a number. Character classes can help with that. - -A character class is a special notation that matches any symbol from a certain set. - -For the start, let's explore a "digit" class. It's written as `\d`. We put it in the pattern, that means "any single digit". - -For instance, the let's find the first digit in the phone number: - -```js run -let str = "+7(903)-123-45-67"; - -let reg = /\d/; - -alert( str.match(reg) ); // 7 -``` - -Without the flag `g`, the regular expression only looks for the first match, that is the first digit `\d`. - -Let's add the `g` flag to find all digits: - -```js run -let str = "+7(903)-123-45-67"; - -let reg = /\d/g; - -alert( str.match(reg) ); // array of matches: 7,9,0,3,1,2,3,4,5,6,7 - -alert( str.match(reg).join('') ); // 79035419441 -``` - -That was a character class for digits. There are other character classes as well. - -Most used are: - -`\d` ("d" is from "digit") -: A digit: a character from `0` to `9`. - -`\s` ("s" is from "space") -: A space symbol: that includes spaces, tabs, newlines. - -`\w` ("w" is from "word") -: A "wordly" character: either a letter of English alphabet or a digit or an underscore. Non-Latin letters (like cyrillic or hindi) do not belong to `\w`. - -For instance, `pattern:\d\s\w` means a "digit" followed by a "space character" followed by a "wordly character", like `"1 a"`. 
- -**A regexp may contain both regular symbols and character classes.** - -For instance, `pattern:CSS\d` matches a string `match:CSS` with a digit after it: - -```js run -let str = "CSS4 is cool"; -let reg = /CSS\d/ - -alert( str.match(reg) ); // CSS4 -``` - -Also we can use many character classes: - -```js run -alert( "I love HTML5!".match(/\s\w\w\w\w\d/) ); // ' HTML5' -``` - -The match (each character class corresponds to one result character): - -![](love-html5-classes.svg) - -## Word boundary: \b - -A word boundary `pattern:\b` -- is a special character class. - -It does not denote a character, but rather a boundary between characters. - -For instance, `pattern:\bJava\b` matches `match:Java` in the string `subject:Hello, Java!`, but not in the script `subject:Hello, JavaScript!`. - -```js run -alert( "Hello, Java!".match(/\bJava\b/) ); // Java -alert( "Hello, JavaScript!".match(/\bJava\b/) ); // null -``` - -The boundary has "zero width" in a sense that usually a character class means a character in the result (like a wordly character or a digit), but not in this case. - -The boundary is a test. - -When regular expression engine is doing the search, it's moving along the string in an attempt to find the match. At each string position it tries to find the pattern. - -When the pattern contains `pattern:\b`, it tests that the position in string is a word boundary, that is one of three variants: - -There are three different positions that qualify as word boundaries: - -- At string start, if the first string character is a word character `\w`. -- Between two characters in the string, where one is a word character `\w` and the other is not. -- At string end, if the last string character is a word character `\w`. - -For instance, in the string `subject:Hello, Java!` the following positions match `\b`: - -![](hello-java-boundaries.svg) - -So it matches `pattern:\bHello\b`, because: - -1. At the beginning of the string the first `\b` test matches. -2. 
Then the word `Hello` matches. -3. Then `\b` matches, as we're between `o` (a word character) and a space (not a word character). - -Pattern `pattern:\bJava\b` also matches. But not `pattern:\bHell\b` (because there's no word boundary after `l`) and not `Java!\b` (because the exclamation sign is not a wordly character, so there's no word boundary after it). - -```js run -alert( "Hello, Java!".match(/\bHello\b/) ); // Hello -alert( "Hello, Java!".match(/\bJava\b/) ); // Java -alert( "Hello, Java!".match(/\bHell\b/) ); // null (no match) -alert( "Hello, Java!".match(/\bJava!\b/) ); // null (no match) -``` - -Once again let's note that `pattern:\b` makes the searching engine to test for the boundary, so that `pattern:Java\b` finds `match:Java` only when followed by a word boundary, but it does not add a letter to the result. - -Usually we use `\b` to find standalone English words. So that if we want `"Java"` language then `pattern:\bJava\b` finds exactly a standalone word and ignores it when it's a part of another word, e.g. it won't match `match:Java` in `subject:JavaScript`. - -Another example: a regexp `pattern:\b\d\d\b` looks for standalone two-digit numbers. In other words, it requires that before and after `pattern:\d\d` must be a symbol different from `\w` (or beginning/end of the string). - -```js run -alert( "1 23 456 78".match(/\b\d\d\b/g) ); // 23,78 -``` - -```warn header="Word boundary doesn't work for non-Latin alphabets" -The word boundary check `\b` tests for a boundary between `\w` and something else. But `\w` means an English letter (or a digit or an underscore), so the test won't work for other characters (like cyrillic or hieroglyphs). - -Later we'll come by Unicode character classes that allow to solve the similar task for different languages. -``` - - -## Inverse classes - -For every character class there exists an "inverse class", denoted with the same letter, but uppercased. 
- -The "reverse" means that it matches all other characters, for instance: - -`\D` -: Non-digit: any character except `\d`, for instance a letter. - -`\S` -: Non-space: any character except `\s`, for instance a letter. - -`\W` -: Non-wordly character: anything but `\w`. - -`\B` -: Non-boundary: a test reverse to `\b`. - -In the beginning of the chapter we saw how to get all digits from the phone `subject:+7(903)-123-45-67`. - -One way was to match all digits and join them: - -```js run -let str = "+7(903)-123-45-67"; - -alert( str.match(/\d/g).join('') ); // 79031234567 -``` - -An alternative, shorter way is to find non-digits `\D` and remove them from the string: - - -```js run -let str = "+7(903)-123-45-67"; - -alert( str.replace(/\D/g, "") ); // 79031234567 -``` - -## Spaces are regular characters - -Usually we pay little attention to spaces. For us strings `subject:1-5` and `subject:1 - 5` are nearly identical. - -But if a regexp doesn't take spaces into account, it may fail to work. - -Let's try to find digits separated by a dash: - -```js run -alert( "1 - 5".match(/\d-\d/) ); // null, no match! -``` - -Here we fix it by adding spaces into the regexp `pattern:\d - \d`: - -```js run -alert( "1 - 5".match(/\d - \d/) ); // 1 - 5, now it works -``` - -**A space is a character. Equal in importance with any other character.** - -Of course, spaces in a regexp are needed only if we look for them. Extra spaces (just like any other extra characters) may prevent a match: - -```js run -alert( "1-5".match(/\d - \d/) ); // null, because the string 1-5 has no spaces -``` - -In other words, in a regular expression all characters matter, spaces too. - -## A dot is any character - -The dot `"."` is a special character class that matches "any character except a newline". 
- -For instance: - -```js run -alert( "Z".match(/./) ); // Z -``` - -Or in the middle of a regexp: - -```js run -let reg = /CS.4/; - -alert( "CSS4".match(reg) ); // CSS4 -alert( "CS-4".match(reg) ); // CS-4 -alert( "CS 4".match(reg) ); // CS 4 (space is also a character) -``` - -Please note that the dot means "any character", but not the "absense of a character". There must be a character to match it: - -```js run -alert( "CS4".match(/CS.4/) ); // null, no match because there's no character for the dot -``` - -### The dotall "s" flag - -Usually a dot doesn't match a newline character. - -For instance, `pattern:A.B` matches `match:A`, and then `match:B` with any character between them, except a newline. - -This doesn't match: - -```js run -alert( "A\nB".match(/A.B/) ); // null (no match) - -// a space character would match, or a letter, but not \n -``` - -Sometimes it's inconvenient, we really want "any character", newline included. - -That's what `s` flag does. If a regexp has it, then the dot `"."` match literally any character: - -```js run -alert( "A\nB".match(/A.B/s) ); // A\nB (match!) -``` - -## Summary - -There exist following character classes: - -- `pattern:\d` -- digits. -- `pattern:\D` -- non-digits. -- `pattern:\s` -- space symbols, tabs, newlines. -- `pattern:\S` -- all but `pattern:\s`. -- `pattern:\w` -- English letters, digits, underscore `'_'`. -- `pattern:\W` -- all but `pattern:\w`. -- `pattern:.` -- any character if with the regexp `'s'` flag, otherwise any except a newline. - -...But that's not all! - -The Unicode encoding, used by JavaScript for strings, provides many properties for characters, like: which language the letter belongs to (if a letter) it is it a punctuation sign, etc. - -Modern JavaScript allows to use these properties in regexps to look for characters, for instance: - -- A cyrillic letter is: `pattern:\p{Script=Cyrillic}` or `pattern:\p{sc=Cyrillic}`. 
-- A dash (be it a small hyphen `-` or a long dash `—`): `pattern:\p{Dash_Punctuation}` or `pattern:\p{pd}`. -- A currency symbol, such as `$`, `€` or another: `pattern:\p{Currency_Symbol}` or `pattern:\p{sc}`. -- ...And much more. Unicode has a lot of character categories that we can select from. - -These patterns require `'u'` regexp flag to work. More about that in the chapter [](info:regexp-unicode). diff --git a/9-regular-expressions/03-regexp-unicode/article.md b/9-regular-expressions/03-regexp-unicode/article.md new file mode 100644 index 00000000..7a14621b --- /dev/null +++ b/9-regular-expressions/03-regexp-unicode/article.md @@ -0,0 +1,167 @@ +# Unicode: flag "u" and class \p{...} + +JavaScript uses [Unicode encoding](https://en.wikipedia.org/wiki/Unicode) for strings. Most characters are encoded with 2 bytes, but that allows representing at most 65536 characters. + +That range is not big enough to encode all possible characters, that's why some rare characters are encoded with 4 bytes, for instance like `𝒳` (mathematical X) or `😄` (a smile), some hieroglyphs and so on. + +Here are the unicode values of some characters: + +| Character | Unicode | Bytes count in unicode | +|------------|---------|--------| +| a | `0x0061` | 2 | +| ≈ | `0x2248` | 2 | +|𝒳| `0x1d4b3` | 4 | +|𝒴| `0x1d4b4` | 4 | +|😄| `0x1f604` | 4 | + +So characters like `a` and `≈` occupy 2 bytes, while codes for `𝒳`, `𝒴` and `😄` are longer, they have 4 bytes. + +A long time ago, when the JavaScript language was created, Unicode encoding was simpler: there were no 4-byte characters. So, some language features still handle them incorrectly. + +For instance, `length` thinks that there are two characters: + +```js run +alert('😄'.length); // 2 +alert('𝒳'.length); // 2 +``` + +...But we can see that there's only one, right? The point is that `length` treats 4 bytes as two 2-byte characters.
That's incorrect, because they must be considered only together (so-called "surrogate pair", you can read about them in the article ). + +By default, regular expressions also treat 4-byte "long characters" as a pair of 2-byte ones. And, as it happens with strings, that may lead to odd results. We'll see that a bit later, in the article . + +Unlike strings, regular expressions have flag `pattern:u` that fixes such problems. With such flag, a regexp handles 4-byte characters correctly. And also Unicode property search becomes available, we'll get to it next. + +## Unicode properties \p{...} + +```warn header="Not supported in Firefox and Edge" +Despite being a part of the standard since 2018, unicode properties are not supported in Firefox ([bug](https://bugzilla.mozilla.org/show_bug.cgi?id=1361876)) and Edge ([bug](https://github.com/Microsoft/ChakraCore/issues/2969)). + +There's [XRegExp](http://xregexp.com) library that provides "extended" regular expressions with cross-browser support for unicode properties. +``` + +Every character in Unicode has a lot of properties. They describe what "category" the character belongs to, contain miscellaneous information about it. + +For instance, if a character has `Letter` property, it means that the character belongs to an alphabet (of any language). And `Number` property means that it's a digit: maybe Arabic or Chinese, and so on. + +We can search for characters with a property, written as `pattern:\p{…}`. To use `pattern:\p{…}`, a regular expression must have flag `pattern:u`. + +For instance, `\p{Letter}` denotes a letter in any language. We can also use `\p{L}`, as `L` is an alias of `Letter`. There are shorter aliases for almost every property. + +In the example below three kinds of letters will be found: English, Georgian and Korean.
+ +```js run +let str = "A ბ ㄱ"; + +alert( str.match(/\p{L}/gu) ); // A,ბ,ㄱ +alert( str.match(/\p{L}/g) ); // null (no matches, as there's no flag "u") +``` + +Here are the main character categories and their subcategories: + +- Letter `L`: + - lowercase `Ll`, + - modifier `Lm`, + - titlecase `Lt`, + - uppercase `Lu`, + - other `Lo`. +- Number `N`: + - decimal digit `Nd`, + - letter number `Nl`, + - other `No`. +- Punctuation `P`: + - connector `Pc`, + - dash `Pd`, + - initial quote `Pi`, + - final quote `Pf`, + - open `Ps`, + - close `Pe`, + - other `Po`. +- Mark `M` (accents etc): + - spacing combining `Mc`, + - enclosing `Me`, + - non-spacing `Mn`. +- Symbol `S`: + - currency `Sc`, + - modifier `Sk`, + - math `Sm`, + - other `So`. +- Separator `Z`: + - line `Zl`, + - paragraph `Zp`, + - space `Zs`. +- Other `C`: + - control `Cc`, + - format `Cf`, + - not assigned `Cn`, + - private use `Co`, + - surrogate `Cs`. + + +So, e.g. if we need letters in lower case, we can write `pattern:\p{Ll}`, punctuation signs: `pattern:\p{P}` and so on. + +There are also other derived categories, like: +- `Alphabetic` (`Alpha`), includes Letters `L`, plus letter numbers `Nl` (e.g. Ⅻ - a character for the roman number 12), plus some other symbols `Other_Alphabetic` (`OAlpha`). +- `Hex_Digit` includes hexadecimal digits: `0-9`, `a-f`. +- ...And so on. + +Unicode supports many different properties, their full list would require a lot of space, so here are the references: + +- List all properties by a character: . +- List all characters by a property: . +- Short aliases for properties: . +- A full base of Unicode characters in text format, with all properties, is here: . + +### Example: hexadecimal numbers + +For instance, let's look for hexadecimal numbers, written as `xFF`, where `F` is a hex digit (0..9 or A..F).
+ +A hex digit can be denoted as `pattern:\p{Hex_Digit}`: + +```js run +let reg = /x\p{Hex_Digit}\p{Hex_Digit}/u; + +alert("number: xAF".match(reg)); // xAF +``` + +### Example: Chinese hieroglyphs + +Let's look for Chinese hieroglyphs. + +There's a unicode property `Script` (a writing system), that may have a value: `Cyrillic`, `Greek`, `Arabic`, `Han` (Chinese) and so on, [here's the full list](https://en.wikipedia.org/wiki/Script_(Unicode)). + +To look for characters in a given writing system we should use `pattern:Script=`, e.g. for Cyrillic letters: `pattern:\p{sc=Cyrillic}`, for Chinese hieroglyphs: `pattern:\p{sc=Han}`, and so on: + +```js run +let regexp = /\p{sc=Han}/gu; // returns Chinese hieroglyphs + +let str = `Hello Привет 你好 123_456`; + +alert( str.match(regexp) ); // 你,好 +``` + +### Example: currency + +Characters that denote a currency, such as `$`, `€`, `¥`, have unicode property `pattern:\p{Currency_Symbol}`, the short alias: `pattern:\p{Sc}`. + +Let's use it to look for prices in the format "currency, followed by a digit": + +```js run +let regexp = /\p{Sc}\d/gu; + +let str = `Prices: $2, €1, ¥9`; + +alert( str.match(regexp) ); // $2,€1,¥9 +``` + +Later, in the article we'll see how to look for numbers that contain many digits. + +## Summary + +Flag `pattern:u` enables the support of Unicode in regular expressions. + +That means two things: + +1. Characters of 4 bytes are handled correctly: as a single character, not two 2-byte characters. +2. Unicode properties can be used in the search: `\p{…}`. + +With Unicode properties we can look for words in given languages, special characters (quotes, currencies) and so on.
diff --git a/9-regular-expressions/12-regexp-anchors/1-start-end/solution.md b/9-regular-expressions/04-regexp-anchors/1-start-end/solution.md similarity index 77% rename from 9-regular-expressions/12-regexp-anchors/1-start-end/solution.md rename to 9-regular-expressions/04-regexp-anchors/1-start-end/solution.md index 1a8cbe9a..702f992d 100644 --- a/9-regular-expressions/12-regexp-anchors/1-start-end/solution.md +++ b/9-regular-expressions/04-regexp-anchors/1-start-end/solution.md @@ -1,5 +1,4 @@ - -The empty string is the only match: it starts and immediately finishes. +An empty string is the only match: it starts and immediately finishes. The task once again demonstrates that anchors are not characters, but tests. diff --git a/9-regular-expressions/12-regexp-anchors/1-start-end/task.md b/9-regular-expressions/04-regexp-anchors/1-start-end/task.md similarity index 100% rename from 9-regular-expressions/12-regexp-anchors/1-start-end/task.md rename to 9-regular-expressions/04-regexp-anchors/1-start-end/task.md diff --git a/9-regular-expressions/04-regexp-anchors/article.md b/9-regular-expressions/04-regexp-anchors/article.md new file mode 100644 index 00000000..c34999ee --- /dev/null +++ b/9-regular-expressions/04-regexp-anchors/article.md @@ -0,0 +1,52 @@ +# Anchors: string start ^ and end $ + +The caret `pattern:^` and dollar `pattern:$` characters have special meaning in a regexp. They are called "anchors". + +The caret `pattern:^` matches at the beginning of the text, and the dollar `pattern:$` -- at the end. + +For instance, let's test if the text starts with `Mary`: + +```js run +let str1 = "Mary had a little lamb"; +alert( /^Mary/.test(str1) ); // true +``` + +The pattern `pattern:^Mary` means: "string start and then Mary". 
+ +Similar to this, we can test if the string ends with `snow` using `pattern:snow$`: + +```js run +let str1 = "it's fleece was white as snow"; +alert( /snow$/.test(str1) ); // true +``` + +In these particular cases we could use string methods `startsWith/endsWith` instead. Regular expressions should be used for more complex tests. + +## Testing for a full match + +Both anchors together `pattern:^...$` are often used to test whether or not a string fully matches the pattern. For instance, to check if the user input is in the right format. + +Let's check whether or not a string is a time in `12:34` format. That is: two digits, then a colon, and then another two digits. + +In regular expressions language that's `pattern:\d\d:\d\d`: + +```js run +let goodInput = "12:34"; +let badInput = "12:345"; + +let regexp = /^\d\d:\d\d$/; +alert( regexp.test(goodInput) ); // true +alert( regexp.test(badInput) ); // false +``` + +Here the match for `pattern:\d\d:\d\d` must start exactly after the beginning of the text `pattern:^`, and the end `pattern:$` must immediately follow. + +The whole string must be exactly in this format. If there's any deviation or an extra character, the result is `false`. + +Anchors behave differently if flag `pattern:m` is present. We'll see that in the next article. + +```smart header="Anchors have \"zero width\"" +Anchors `pattern:^` and `pattern:$` are tests. They have zero width. + +In other words, they do not match a character, but rather force the regexp engine to check the condition (text start/end). +``` diff --git a/9-regular-expressions/05-regexp-multiline-mode/article.md b/9-regular-expressions/05-regexp-multiline-mode/article.md new file mode 100644 index 00000000..321218b3 --- /dev/null +++ b/9-regular-expressions/05-regexp-multiline-mode/article.md @@ -0,0 +1,87 @@ +# Multiline mode of anchors ^ $, flag "m" + +The multiline mode is enabled by the flag `pattern:m`. + +It only affects the behavior of `pattern:^` and `pattern:$`. 
+ +In the multiline mode they match not only at the beginning and the end of the string, but also at start/end of line. + +## Searching at line start ^ + +In the example below the text has multiple lines. The pattern `pattern:/^\d/gm` takes a digit from the beginning of each line: + +```js run +let str = `1st place: Winnie +2nd place: Piglet +3rd place: Eeyore`; + +*!* +alert( str.match(/^\d/gm) ); // 1, 2, 3 +*/!* +``` + +Without the flag `pattern:m` only the first digit is matched: + +```js run +let str = `1st place: Winnie +2nd place: Piglet +3rd place: Eeyore`; + +*!* +alert( str.match(/^\d/g) ); // 1 +*/!* +``` + +That's because by default a caret `pattern:^` only matches at the beginning of the text, and in the multiline mode -- at the start of any line. + +```smart +"Start of a line" formally means "immediately after a line break": the test `pattern:^` in multiline mode matches at all positions preceded by a newline character `\n`. + +And at the text start. +``` + +## Searching at line end $ + +The dollar sign `pattern:$` behaves similarly. + +The regular expression `pattern:\d$` finds the last digit in every line: + +```js run +let str = `Winnie: 1 +Piglet: 2 +Eeyore: 3`; + +alert( str.match(/\d$/gm) ); // 1,2,3 +``` + +Without the flag `m`, the dollar `pattern:$` would only match the end of the whole text, so only the very last digit would be found. + +```smart +"End of a line" formally means "immediately before a line break": the test `pattern:$` in multiline mode matches at all positions followed by a newline character `\n`. + +And at the text end. +``` + +## Searching for \n instead of ^ $ + +To find a newline, we can use not only anchors `pattern:^` and `pattern:$`, but also the newline character `\n`. + +What's the difference? Let's see an example.
+ +Here we search for `pattern:\d\n` instead of `pattern:\d$`: + +```js run +let str = `Winnie: 1 +Piglet: 2 +Eeyore: 3`; + +alert( str.match(/\d\n/gm) ); // 1\n,2\n +``` + +As we can see, there are 2 matches instead of 3. + +That's because there's no newline after `subject:3` (there's text end though, so it matches `pattern:$`). + +Another difference: now every match includes a newline character `match:\n`. Unlike the anchors `pattern:^` `pattern:$`, that only test the condition (start/end of a line), `\n` is a character, so it becomes a part of the result. + +So, a `\n` in the pattern is used when we need newline characters in the result, while anchors are used to find something at the beginning/end of a line. diff --git a/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/solution.md b/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/solution.md new file mode 100644 index 00000000..d378d4c9 --- /dev/null +++ b/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/solution.md @@ -0,0 +1,6 @@ + +Ответ: `pattern:\b\d\d:\d\d\b`. + +```js run +alert( "Завтрак в 09:00 в комнате 123:456.".match( /\b\d\d:\d\d\b/ ) ); // 09:00 +``` diff --git a/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/task.md b/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/task.md new file mode 100644 index 00000000..16330a6d --- /dev/null +++ b/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/task.md @@ -0,0 +1,9 @@ +# Найдите время + +Время имеет формат: `часы:минуты`. И часы, и минуты имеют две цифры, например, `09:00`. + +Введите регулярное выражение, чтобы найти время в строке: `subject:Завтрак в 09:00 в комнате 123:456.` + +P.S. В этой задаче пока нет необходимости проверять правильность времени, поэтому `25:99` также может быть верным результатом. + +P.P.S. Регулярное выражение не должно находить `123:456`. 
diff --git a/9-regular-expressions/06-regexp-boundary/article.md b/9-regular-expressions/06-regexp-boundary/article.md new file mode 100644 index 00000000..286a963e --- /dev/null +++ b/9-regular-expressions/06-regexp-boundary/article.md @@ -0,0 +1,53 @@ +# Word boundary: \b + +A word boundary `pattern:\b` is a test, just like `pattern:^` and `pattern:$`. + +When the regexp engine (program module that implements searching for regexps) comes across `pattern:\b`, it checks that the position in the string is a word boundary. + +There are three different positions that qualify as word boundaries: + +- At string start, if the first string character is a word character `pattern:\w`. +- Between two characters in the string, where one is a word character `pattern:\w` and the other is not. +- At string end, if the last string character is a word character `pattern:\w`. + +For instance, regexp `pattern:\bJava\b` will be found in `subject:Hello, Java!`, where `subject:Java` is a standalone word, but not in `subject:Hello, JavaScript!`. + +```js run +alert( "Hello, Java!".match(/\bJava\b/) ); // Java +alert( "Hello, JavaScript!".match(/\bJava\b/) ); // null +``` + +In the string `subject:Hello, Java!` following positions correspond to `pattern:\b`: + +![](hello-java-boundaries.svg) + +So, it matches the pattern `pattern:\bHello\b`, because: + +1. At the beginning of the string matches the first test `pattern:\b`. +2. Then matches the word `pattern:Hello`. +3. Then the test `pattern:\b` - matches again, as we're between `subject:o` and a space. + +Шаблон `pattern:\bJava\b` также совпадёт. Но не `pattern:\bHell\b` (потому что после `subject:l` нет границы слова), и не `pattern:Java!\b` (восклицательный знак не является "символом слова" `pattern:\w`, поэтому после него нет границы слова). 
+ +```js run +alert( "Hello, Java!".match(/\bHello\b/) ); // Hello +alert( "Hello, Java!".match(/\bJava\b/) ); // Java +alert( "Hello, Java!".match(/\bHell\b/) ); // null (нет совпадения) +alert( "Hello, Java!".match(/\bJava!\b/) ); // null (нет совпадения) +``` + +Так как `pattern:\b` является проверкой, то не добавляет символ после границы к результату. + +Мы можем использовать `pattern:\b` не только со словами, но и с цифрами. + +Например, регулярное выражение `pattern:\b\d\d\b` ищет отдельно стоящие двузначные числа. Другими словами, оно требует, чтобы до и после `pattern:\d\d` был символ, отличный от `pattern:\w` (или начало/конец строки) + +```js run +alert( "1 23 456 78".match(/\b\d\d\b/g) ); // 23,78 +``` + +```warn header="Граница слова `pattern:\b` не работает для алфавитов, не основанных на латинице" +Проверка границы слова `pattern:\b` проверяет границу, должно быть `pattern:\w` с одной стороны и "не `pattern:\w`" - с другой. + +Но `pattern:\w` означает латинскую букву (или цифру или знак подчёркивания), поэтому проверка не будет работать для других символов (например, кириллицы или иероглифов). 
+``` diff --git a/9-regular-expressions/03-regexp-character-classes/hello-java-boundaries.svg b/9-regular-expressions/06-regexp-boundary/hello-java-boundaries.svg similarity index 100% rename from 9-regular-expressions/03-regexp-character-classes/hello-java-boundaries.svg rename to 9-regular-expressions/06-regexp-boundary/hello-java-boundaries.svg diff --git a/9-regular-expressions/04-regexp-escaping/article.md b/9-regular-expressions/07-regexp-escaping/article.md similarity index 96% rename from 9-regular-expressions/04-regexp-escaping/article.md rename to 9-regular-expressions/07-regexp-escaping/article.md index 909cd485..cd118010 100644 --- a/9-regular-expressions/04-regexp-escaping/article.md +++ b/9-regular-expressions/07-regexp-escaping/article.md @@ -75,7 +75,7 @@ The quotes "consume" backslashes and interpret them, for instance: - `\n` -- becomes a newline character, - `\u1234` -- becomes the Unicode character with such code, -- ...And when there's no special meaning: like `\d` or `\z`, then the backslash is simply removed. +- ...And when there's no special meaning: like `pattern:\d` or `\z`, then the backslash is simply removed. So the call to `new RegExp` gets a string without backslashes. That's why the search doesn't work! 
diff --git a/9-regular-expressions/05-regexp-character-sets-and-ranges/1-find-range-1/solution.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/1-find-range-1/solution.md similarity index 100% rename from 9-regular-expressions/05-regexp-character-sets-and-ranges/1-find-range-1/solution.md rename to 9-regular-expressions/08-regexp-character-sets-and-ranges/1-find-range-1/solution.md diff --git a/9-regular-expressions/05-regexp-character-sets-and-ranges/1-find-range-1/task.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/1-find-range-1/task.md similarity index 100% rename from 9-regular-expressions/05-regexp-character-sets-and-ranges/1-find-range-1/task.md rename to 9-regular-expressions/08-regexp-character-sets-and-ranges/1-find-range-1/task.md diff --git a/9-regular-expressions/05-regexp-character-sets-and-ranges/2-find-time-2-formats/solution.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/2-find-time-2-formats/solution.md similarity index 100% rename from 9-regular-expressions/05-regexp-character-sets-and-ranges/2-find-time-2-formats/solution.md rename to 9-regular-expressions/08-regexp-character-sets-and-ranges/2-find-time-2-formats/solution.md diff --git a/9-regular-expressions/05-regexp-character-sets-and-ranges/2-find-time-2-formats/task.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/2-find-time-2-formats/task.md similarity index 100% rename from 9-regular-expressions/05-regexp-character-sets-and-ranges/2-find-time-2-formats/task.md rename to 9-regular-expressions/08-regexp-character-sets-and-ranges/2-find-time-2-formats/task.md diff --git a/9-regular-expressions/05-regexp-character-sets-and-ranges/article.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md similarity index 97% rename from 9-regular-expressions/05-regexp-character-sets-and-ranges/article.md rename to 9-regular-expressions/08-regexp-character-sets-and-ranges/article.md index 7204f2b1..3a94125c 100644 --- 
a/9-regular-expressions/05-regexp-character-sets-and-ranges/article.md +++ b/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md @@ -44,7 +44,7 @@ alert( "Exception 0xAF".match(/x[0-9A-F][0-9A-F]/g) ); // xAF Please note that in the word `subject:Exception` there's a substring `subject:xce`. It didn't match the pattern, because the letters are lowercase, while in the set `pattern:[0-9A-F]` they are uppercase. -If we want to find it too, then we can add a range `a-f`: `pattern:[0-9A-Fa-f]`. The `i` flag would allow lowercase too. +If we want to find it too, then we can add a range `a-f`: `pattern:[0-9A-Fa-f]`. The `pattern:i` flag would allow lowercase too. **Character classes are shorthands for certain character sets.** @@ -58,7 +58,7 @@ We can use character classes inside `[…]` as well. For instance, we want to match all wordly characters or a dash, for words like "twenty-third". We can't do it with `pattern:\w+`, because `pattern:\w` class does not include a dash. But we can use `pattern:[\w-]`. -We also can use several classes, for example `pattern:[\s\S]` matches spaces or non-spaces -- any character. That's wider than a dot `"."`, because the dot matches any character except a newline (unless `s` flag is set). +We also can use several classes, for example `pattern:[\s\S]` matches spaces or non-spaces -- any character. That's wider than a dot `"."`, because the dot matches any character except a newline (unless `pattern:s` flag is set). ## Excluding ranges @@ -69,7 +69,7 @@ They are denoted by a caret character `^` at the start and match any character * For instance: - `pattern:[^aeyo]` -- any character except `'a'`, `'e'`, `'y'` or `'o'`. -- `pattern:[^0-9]` -- any character except a digit, the same as `\D`. +- `pattern:[^0-9]` -- any character except a digit, the same as `pattern:\D`. - `pattern:[^\s]` -- any non-space character, same as `\S`. 
The example below looks for any characters except letters, digits and spaces: diff --git a/9-regular-expressions/07-regexp-quantifiers/1-find-text-manydots/solution.md b/9-regular-expressions/09-regexp-quantifiers/1-find-text-manydots/solution.md similarity index 100% rename from 9-regular-expressions/07-regexp-quantifiers/1-find-text-manydots/solution.md rename to 9-regular-expressions/09-regexp-quantifiers/1-find-text-manydots/solution.md diff --git a/9-regular-expressions/07-regexp-quantifiers/1-find-text-manydots/task.md b/9-regular-expressions/09-regexp-quantifiers/1-find-text-manydots/task.md similarity index 100% rename from 9-regular-expressions/07-regexp-quantifiers/1-find-text-manydots/task.md rename to 9-regular-expressions/09-regexp-quantifiers/1-find-text-manydots/task.md diff --git a/9-regular-expressions/07-regexp-quantifiers/2-find-html-colors-6hex/solution.md b/9-regular-expressions/09-regexp-quantifiers/2-find-html-colors-6hex/solution.md similarity index 91% rename from 9-regular-expressions/07-regexp-quantifiers/2-find-html-colors-6hex/solution.md rename to 9-regular-expressions/09-regexp-quantifiers/2-find-html-colors-6hex/solution.md index 4e85285b..d4d297a1 100644 --- a/9-regular-expressions/07-regexp-quantifiers/2-find-html-colors-6hex/solution.md +++ b/9-regular-expressions/09-regexp-quantifiers/2-find-html-colors-6hex/solution.md @@ -1,6 +1,6 @@ We need to look for `#` followed by 6 hexadecimal characters. -A hexadecimal character can be described as `pattern:[0-9a-fA-F]`. Or if we use the `i` flag, then just `pattern:[0-9a-f]`. +A hexadecimal character can be described as `pattern:[0-9a-fA-F]`. Or if we use the `pattern:i` flag, then just `pattern:[0-9a-f]`. Then we can look for 6 of them using the quantifier `pattern:{6}`. 
diff --git a/9-regular-expressions/07-regexp-quantifiers/2-find-html-colors-6hex/task.md b/9-regular-expressions/09-regexp-quantifiers/2-find-html-colors-6hex/task.md similarity index 100% rename from 9-regular-expressions/07-regexp-quantifiers/2-find-html-colors-6hex/task.md rename to 9-regular-expressions/09-regexp-quantifiers/2-find-html-colors-6hex/task.md diff --git a/9-regular-expressions/07-regexp-quantifiers/article.md b/9-regular-expressions/09-regexp-quantifiers/article.md similarity index 97% rename from 9-regular-expressions/07-regexp-quantifiers/article.md rename to 9-regular-expressions/09-regexp-quantifiers/article.md index 7f382dcc..9b70d722 100644 --- a/9-regular-expressions/07-regexp-quantifiers/article.md +++ b/9-regular-expressions/09-regexp-quantifiers/article.md @@ -2,7 +2,7 @@ Let's say we have a string like `+7(903)-123-45-67` and want to find all numbers in it. But unlike before, we are interested not in single digits, but full numbers: `7, 903, 123, 45, 67`. -A number is a sequence of 1 or more digits `\d`. To mark how many we need, we need to append a *quantifier*. +A number is a sequence of 1 or more digits `pattern:\d`. To mark how many we need, we need to append a *quantifier*. 
## Quantity {n} diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/1-lazy-greedy/solution.md b/9-regular-expressions/10-regexp-greedy-and-lazy/1-lazy-greedy/solution.md similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/1-lazy-greedy/solution.md rename to 9-regular-expressions/10-regexp-greedy-and-lazy/1-lazy-greedy/solution.md diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/1-lazy-greedy/task.md b/9-regular-expressions/10-regexp-greedy-and-lazy/1-lazy-greedy/task.md similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/1-lazy-greedy/task.md rename to 9-regular-expressions/10-regexp-greedy-and-lazy/1-lazy-greedy/task.md diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/3-find-html-comments/solution.md b/9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/solution.md similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/3-find-html-comments/solution.md rename to 9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/solution.md diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/3-find-html-comments/task.md b/9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/task.md similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/3-find-html-comments/task.md rename to 9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/task.md diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/solution.md b/9-regular-expressions/10-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/solution.md similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/solution.md rename to 9-regular-expressions/10-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/solution.md diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/task.md 
b/9-regular-expressions/10-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/task.md similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/task.md rename to 9-regular-expressions/10-regexp-greedy-and-lazy/4-find-html-tags-greedy-lazy/task.md diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/article.md b/9-regular-expressions/10-regexp-greedy-and-lazy/article.md similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/article.md rename to 9-regular-expressions/10-regexp-greedy-and-lazy/article.md diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy1.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy1.svg similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy1.svg rename to 9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy1.svg diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy2.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy2.svg similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy2.svg rename to 9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy2.svg diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy3.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy3.svg similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy3.svg rename to 9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy3.svg diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy4.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy4.svg similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy4.svg rename to 9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy4.svg diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy5.svg 
b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy5.svg similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy5.svg rename to 9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy5.svg diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy6.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy6.svg similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/witch_greedy6.svg rename to 9-regular-expressions/10-regexp-greedy-and-lazy/witch_greedy6.svg diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy3.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy3.svg similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy3.svg rename to 9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy3.svg diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy4.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy4.svg similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy4.svg rename to 9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy4.svg diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy5.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy5.svg similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy5.svg rename to 9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy5.svg diff --git a/9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy6.svg b/9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy6.svg similarity index 100% rename from 9-regular-expressions/08-regexp-greedy-and-lazy/witch_lazy6.svg rename to 9-regular-expressions/10-regexp-greedy-and-lazy/witch_lazy6.svg diff --git a/9-regular-expressions/09-regexp-groups/1-find-webcolor-3-or-6/solution.md b/9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/solution.md 
similarity index 100% rename from 9-regular-expressions/09-regexp-groups/1-find-webcolor-3-or-6/solution.md rename to 9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/solution.md diff --git a/9-regular-expressions/09-regexp-groups/1-find-webcolor-3-or-6/task.md b/9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/task.md similarity index 100% rename from 9-regular-expressions/09-regexp-groups/1-find-webcolor-3-or-6/task.md rename to 9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/task.md diff --git a/9-regular-expressions/09-regexp-groups/2-find-decimal-numbers/solution.md b/9-regular-expressions/11-regexp-groups/2-find-decimal-numbers/solution.md similarity index 100% rename from 9-regular-expressions/09-regexp-groups/2-find-decimal-numbers/solution.md rename to 9-regular-expressions/11-regexp-groups/2-find-decimal-numbers/solution.md diff --git a/9-regular-expressions/09-regexp-groups/2-find-decimal-numbers/task.md b/9-regular-expressions/11-regexp-groups/2-find-decimal-numbers/task.md similarity index 100% rename from 9-regular-expressions/09-regexp-groups/2-find-decimal-numbers/task.md rename to 9-regular-expressions/11-regexp-groups/2-find-decimal-numbers/task.md diff --git a/9-regular-expressions/09-regexp-groups/5-parse-expression/solution.md b/9-regular-expressions/11-regexp-groups/5-parse-expression/solution.md similarity index 100% rename from 9-regular-expressions/09-regexp-groups/5-parse-expression/solution.md rename to 9-regular-expressions/11-regexp-groups/5-parse-expression/solution.md diff --git a/9-regular-expressions/09-regexp-groups/5-parse-expression/task.md b/9-regular-expressions/11-regexp-groups/5-parse-expression/task.md similarity index 100% rename from 9-regular-expressions/09-regexp-groups/5-parse-expression/task.md rename to 9-regular-expressions/11-regexp-groups/5-parse-expression/task.md diff --git a/9-regular-expressions/09-regexp-groups/article.md 
b/9-regular-expressions/11-regexp-groups/article.md similarity index 100% rename from 9-regular-expressions/09-regexp-groups/article.md rename to 9-regular-expressions/11-regexp-groups/article.md diff --git a/9-regular-expressions/09-regexp-groups/regexp-nested-groups.svg b/9-regular-expressions/11-regexp-groups/regexp-nested-groups.svg similarity index 100% rename from 9-regular-expressions/09-regexp-groups/regexp-nested-groups.svg rename to 9-regular-expressions/11-regexp-groups/regexp-nested-groups.svg diff --git a/9-regular-expressions/12-regexp-anchors/2-test-mac/solution.md b/9-regular-expressions/12-regexp-anchors/2-test-mac/solution.md deleted file mode 100644 index 422bc65e..00000000 --- a/9-regular-expressions/12-regexp-anchors/2-test-mac/solution.md +++ /dev/null @@ -1,21 +0,0 @@ -A two-digit hex number is `pattern:[0-9a-f]{2}` (assuming the `pattern:i` flag is enabled). - -We need that number `NN`, and then `:NN` repeated 5 times (more numbers); - -The regexp is: `pattern:[0-9a-f]{2}(:[0-9a-f]{2}){5}` - -Now let's show that the match should capture all the text: start at the beginning and end at the end. That's done by wrapping the pattern in `pattern:^...$`. - -Finally: - -```js run -let reg = /^[0-9a-fA-F]{2}(:[0-9a-fA-F]{2}){5}$/i; - -alert( reg.test('01:32:54:67:89:AB') ); // true - -alert( reg.test('0132546789AB') ); // false (no colons) - -alert( reg.test('01:32:54:67:89') ); // false (5 numbers, need 6) - -alert( reg.test('01:32:54:67:89:ZZ') ) // false (ZZ in the end) -``` diff --git a/9-regular-expressions/12-regexp-anchors/2-test-mac/task.md b/9-regular-expressions/12-regexp-anchors/2-test-mac/task.md deleted file mode 100644 index e7265598..00000000 --- a/9-regular-expressions/12-regexp-anchors/2-test-mac/task.md +++ /dev/null @@ -1,20 +0,0 @@ -# Check MAC-address - -[MAC-address](https://en.wikipedia.org/wiki/MAC_address) of a network interface consists of 6 two-digit hex numbers separated by a colon. 
- -For instance: `subject:'01:32:54:67:89:AB'`. - -Write a regexp that checks whether a string is MAC-address. - -Usage: -```js -let reg = /your regexp/; - -alert( reg.test('01:32:54:67:89:AB') ); // true - -alert( reg.test('0132546789AB') ); // false (no colons) - -alert( reg.test('01:32:54:67:89') ); // false (5 numbers, must be 6) - -alert( reg.test('01:32:54:67:89:ZZ') ) // false (ZZ ad the end) -``` diff --git a/9-regular-expressions/12-regexp-anchors/article.md b/9-regular-expressions/12-regexp-anchors/article.md deleted file mode 100644 index 0c2dd578..00000000 --- a/9-regular-expressions/12-regexp-anchors/article.md +++ /dev/null @@ -1,55 +0,0 @@ -# String start ^ and finish $ - -The caret `pattern:'^'` and dollar `pattern:'$'` characters have special meaning in a regexp. They are called "anchors". - -The caret `pattern:^` matches at the beginning of the text, and the dollar `pattern:$` -- in the end. - -For instance, let's test if the text starts with `Mary`: - -```js run -let str1 = "Mary had a little lamb, it's fleece was white as snow"; -let str2 = 'Everywhere Mary went, the lamp was sure to go'; - -alert( /^Mary/.test(str1) ); // true -alert( /^Mary/.test(str2) ); // false -``` - -The pattern `pattern:^Mary` means: "the string start and then Mary". - -Now let's test whether the text ends with an email. - -To match an email, we can use a regexp `pattern:[-.\w]+@([\w-]+\.)+[\w-]{2,20}`. - -To test whether the string ends with the email, let's add `pattern:$` to the pattern: - -```js run -let reg = /[-.\w]+@([\w-]+\.)+[\w-]{2,20}$/g; - -let str1 = 'My email is mail@site.com'; -let str2 = 'Everywhere Mary went, the lamp was sure to go'; - -alert( reg.test(str1) ); // true -alert( reg.test(str2) ); // false -``` - -We can use both anchors together to check whether the string exactly follows the pattern. That's often used for validation. - -For instance we want to check that `str` is exactly a color in the form `#` plus 6 hex digits. 
The pattern for the color is `pattern:#[0-9a-f]{6}`. - -To check that the *whole string* exactly matches it, we add `pattern:^...$`: - -```js run -let str = "#abcdef"; - -alert( /^#[0-9a-f]{6}$/i.test(str) ); // true -``` - -The regexp engine looks for the text start, then the color, and then immediately the text end. Just what we need. - -```smart header="Anchors have zero length" -Anchors just like `\b` are tests. They have zero-width. - -In other words, they do not match a character, but rather force the regexp engine to check the condition (text start/end). -``` - -The behavior of anchors changes if there's a flag `pattern:m` (multiline mode). We'll explore it in the next chapter. diff --git a/9-regular-expressions/10-regexp-backreferences/article.md b/9-regular-expressions/12-regexp-backreferences/article.md similarity index 100% rename from 9-regular-expressions/10-regexp-backreferences/article.md rename to 9-regular-expressions/12-regexp-backreferences/article.md diff --git a/9-regular-expressions/11-regexp-alternation/01-find-programming-language/solution.md b/9-regular-expressions/13-regexp-alternation/01-find-programming-language/solution.md similarity index 100% rename from 9-regular-expressions/11-regexp-alternation/01-find-programming-language/solution.md rename to 9-regular-expressions/13-regexp-alternation/01-find-programming-language/solution.md diff --git a/9-regular-expressions/11-regexp-alternation/01-find-programming-language/task.md b/9-regular-expressions/13-regexp-alternation/01-find-programming-language/task.md similarity index 100% rename from 9-regular-expressions/11-regexp-alternation/01-find-programming-language/task.md rename to 9-regular-expressions/13-regexp-alternation/01-find-programming-language/task.md diff --git a/9-regular-expressions/11-regexp-alternation/02-find-matching-bbtags/solution.md b/9-regular-expressions/13-regexp-alternation/02-find-matching-bbtags/solution.md similarity index 79% rename from 
9-regular-expressions/11-regexp-alternation/02-find-matching-bbtags/solution.md rename to 9-regular-expressions/13-regexp-alternation/02-find-matching-bbtags/solution.md index e448a4b1..dddaf962 100644 --- a/9-regular-expressions/11-regexp-alternation/02-find-matching-bbtags/solution.md +++ b/9-regular-expressions/13-regexp-alternation/02-find-matching-bbtags/solution.md @@ -1,7 +1,7 @@ Opening tag is `pattern:\[(b|url|quote)\]`. -Then to find everything till the closing tag -- let's use the pattern `pattern:.*?` with flag `s` to match any character including the newline and then add a backreference to the closing tag. +Then to find everything till the closing tag -- let's use the pattern `pattern:.*?` with flag `pattern:s` to match any character including the newline and then add a backreference to the closing tag. The full pattern: `pattern:\[(b|url|quote)\].*?\[/\1\]`. diff --git a/9-regular-expressions/11-regexp-alternation/02-find-matching-bbtags/task.md b/9-regular-expressions/13-regexp-alternation/02-find-matching-bbtags/task.md similarity index 100% rename from 9-regular-expressions/11-regexp-alternation/02-find-matching-bbtags/task.md rename to 9-regular-expressions/13-regexp-alternation/02-find-matching-bbtags/task.md diff --git a/9-regular-expressions/11-regexp-alternation/03-match-quoted-string/solution.md b/9-regular-expressions/13-regexp-alternation/03-match-quoted-string/solution.md similarity index 100% rename from 9-regular-expressions/11-regexp-alternation/03-match-quoted-string/solution.md rename to 9-regular-expressions/13-regexp-alternation/03-match-quoted-string/solution.md diff --git a/9-regular-expressions/11-regexp-alternation/03-match-quoted-string/task.md b/9-regular-expressions/13-regexp-alternation/03-match-quoted-string/task.md similarity index 100% rename from 9-regular-expressions/11-regexp-alternation/03-match-quoted-string/task.md rename to 9-regular-expressions/13-regexp-alternation/03-match-quoted-string/task.md diff --git 
a/9-regular-expressions/11-regexp-alternation/04-match-exact-tag/solution.md b/9-regular-expressions/13-regexp-alternation/04-match-exact-tag/solution.md similarity index 100% rename from 9-regular-expressions/11-regexp-alternation/04-match-exact-tag/solution.md rename to 9-regular-expressions/13-regexp-alternation/04-match-exact-tag/solution.md diff --git a/9-regular-expressions/11-regexp-alternation/04-match-exact-tag/task.md b/9-regular-expressions/13-regexp-alternation/04-match-exact-tag/task.md similarity index 100% rename from 9-regular-expressions/11-regexp-alternation/04-match-exact-tag/task.md rename to 9-regular-expressions/13-regexp-alternation/04-match-exact-tag/task.md diff --git a/9-regular-expressions/11-regexp-alternation/article.md b/9-regular-expressions/13-regexp-alternation/article.md similarity index 100% rename from 9-regular-expressions/11-regexp-alternation/article.md rename to 9-regular-expressions/13-regexp-alternation/article.md diff --git a/9-regular-expressions/13-regexp-multiline-mode/article.md b/9-regular-expressions/13-regexp-multiline-mode/article.md deleted file mode 100644 index 955d9601..00000000 --- a/9-regular-expressions/13-regexp-multiline-mode/article.md +++ /dev/null @@ -1,75 +0,0 @@ -# Multiline mode, flag "m" - -The multiline mode is enabled by the flag `pattern:/.../m`. - -It only affects the behavior of `pattern:^` and `pattern:$`. - -In the multiline mode they match not only at the beginning and end of the string, but also at start/end of line. - -## Line start ^ - -In the example below the text has multiple lines. The pattern `pattern:/^\d+/gm` takes a number from the beginning of each one: - -```js run -let str = `1st place: Winnie -2nd place: Piglet -33rd place: Eeyore`; - -*!* -alert( str.match(/^\d+/gm) ); // 1, 2, 33 -*/!* -``` - -The regexp engine moves along the text and looks for a line start `pattern:^`, when finds -- continues to match the rest of the pattern `pattern:\d+`. 
- -Without the flag `pattern:/.../m` only the first number is matched: - -```js run -let str = `1st place: Winnie -2nd place: Piglet -33rd place: Eeyore`; - -*!* -alert( str.match(/^\d+/g) ); // 1 -*/!* -``` - -That's because by default a caret `pattern:^` only matches at the beginning of the text, and in the multiline mode -- at the start of any line. - -## Line end $ - -The dollar sign `pattern:$` behaves similarly. - -The regular expression `pattern:\w+$` finds the last word in every line - -```js run -let str = `1st place: Winnie -2nd place: Piglet -33rd place: Eeyore`; - -alert( str.match(/\w+$/gim) ); // Winnie,Piglet,Eeyore -``` - -Without the `pattern:/.../m` flag the dollar `pattern:$` would only match the end of the whole string, so only the very last word would be found. - -## Anchors ^$ versus \n - -To find a newline, we can use not only `pattern:^` and `pattern:$`, but also the newline character `\n`. - -The first difference is that unlike anchors, the character `\n` "consumes" the newline character and adds it to the result. - -For instance, here we use it instead of `pattern:$`: - -```js run -let str = `1st place: Winnie -2nd place: Piglet -33rd place: Eeyore`; - -alert( str.match(/\w+\n/gim) ); // Winnie\n,Piglet\n -``` - -Here every match is a word plus a newline character. - -And one more difference -- the newline `\n` does not match at the string end. That's why `Eeyore` is not found in the example above. - -So, anchors are usually better, they are closer to what we want to get. 
diff --git a/9-regular-expressions/14-regexp-lookahead-lookbehind/article.md b/9-regular-expressions/14-regexp-lookahead-lookbehind/article.md index e877cae4..8e36fb0b 100644 --- a/9-regular-expressions/14-regexp-lookahead-lookbehind/article.md +++ b/9-regular-expressions/14-regexp-lookahead-lookbehind/article.md @@ -101,9 +101,9 @@ Lookaround types: | Pattern | type | matches | |--------------------|------------------|---------| -| `pattern:x(?=y)` | Positive lookahead | `x` if followed by `y` | -| `pattern:x(?!y)` | Negative lookahead | `x` if not followed by `y` | -| `pattern:(?<=y)x` | Positive lookbehind | `x` if after `y` | -| `pattern:(?. - -Let's briefly review them here. In short, normally characters are encoded with 2 bytes. That gives us 65536 characters maximum. But there are more characters in the world. - -So certain rare characters are encoded with 4 bytes, like `𝒳` (mathematical X) or `😄` (a smile). - -Here are the unicode values to compare: - -| Character | Unicode | Bytes | -|------------|---------|--------| -| `a` | 0x0061 | 2 | -| `≈` | 0x2248 | 2 | -|`𝒳`| 0x1d4b3 | 4 | -|`𝒴`| 0x1d4b4 | 4 | -|`😄`| 0x1f604 | 4 | - -So characters like `a` and `≈` occupy 2 bytes, and those rare ones take 4. - -The unicode is made in such a way that the 4-byte characters only have a meaning as a whole. - -In the past JavaScript did not know about that, and many string methods still have problems. For instance, `length` thinks that here are two characters: - -```js run -alert('😄'.length); // 2 -alert('𝒳'.length); // 2 -``` - -...But we can see that there's only one, right? The point is that `length` treats 4 bytes as two 2-byte characters. That's incorrect, because they must be considered only together (so-called "surrogate pair"). - -Normally, regular expressions also treat "long characters" as two 2-byte ones. 
- -That leads to odd results, for instance let's try to find `pattern:[𝒳𝒴]` in the string `subject:𝒳`: - -```js run -alert( '𝒳'.match(/[𝒳𝒴]/) ); // odd result (wrong match actually, "half-character") -``` - -The result is wrong, because by default the regexp engine does not understand surrogate pairs. - -So, it thinks that `[𝒳𝒴]` are not two, but four characters: -1. the left half of `𝒳` `(1)`, -2. the right half of `𝒳` `(2)`, -3. the left half of `𝒴` `(3)`, -4. the right half of `𝒴` `(4)`. - -We can list them like this: - -```js run -for(let i=0; i<'𝒳𝒴'.length; i++) { - alert('𝒳𝒴'.charCodeAt(i)); // 55349, 56499, 55349, 56500 -}; -``` - -So it finds only the "left half" of `𝒳`. - -In other words, the search works like `'12'.match(/[1234]/)`: only `1` is returned. - -## The "u" flag - -The `/.../u` flag fixes that. - -It enables surrogate pairs in the regexp engine, so the result is correct: - -```js run -alert( '𝒳'.match(/[𝒳𝒴]/u) ); // 𝒳 -``` - -Let's see one more example. - -If we forget the `u` flag and accidentally use surrogate pairs, then we can get an error: - -```js run -'𝒳'.match(/[𝒳-𝒴]/); // SyntaxError: invalid range in character class -``` - -Normally, regexps understand `[a-z]` as a "range of characters with codes between codes of `a` and `z`. - -But without `u` flag, surrogate pairs are assumed to be a "pair of independent characters", so `[𝒳-𝒴]` is like `[<55349><56499>-<55349><56500>]` (replaced each surrogate pair with code points). Now we can clearly see that the range `56499-55349` is unacceptable, as the left range border must be less than the right one. 
- -Using the `u` flag makes it work right: - -```js run -alert( '𝒴'.match(/[𝒳-𝒵]/u) ); // 𝒴 -``` diff --git a/9-regular-expressions/21-regexp-unicode-properties/article.md b/9-regular-expressions/21-regexp-unicode-properties/article.md deleted file mode 100644 index 2bb031d7..00000000 --- a/9-regular-expressions/21-regexp-unicode-properties/article.md +++ /dev/null @@ -1,86 +0,0 @@ - -# Unicode character properties \p - -[Unicode](https://en.wikipedia.org/wiki/Unicode), the encoding format used by JavaScript strings, has a lot of properties for different characters (or, technically, code points). They describe which "categories" character belongs to, and a variety of technical details. - -In regular expressions these can be set by `\p{…}`. And there must be flag `'u'`. - -For instance, `\p{Letter}` denotes a letter in any of language. We can also use `\p{L}`, as `L` is an alias of `Letter`, there are shorter aliases for almost every property. - -Here's the main tree of properties: - -- Letter `L`: - - lowercase `Ll`, modifier `Lm`, titlecase `Lt`, uppercase `Lu`, other `Lo` -- Number `N`: - - decimal digit `Nd`, letter number `Nl`, other `No` -- Punctuation `P`: - - connector `Pc`, dash `Pd`, initial quote `Pi`, final quote `Pf`, open `Ps`, close `Pe`, other `Po` -- Mark `M` (accents etc): - - spacing combining `Mc`, enclosing `Me`, non-spacing `Mn` -- Symbol `S`: - - currency `Sc`, modifier `Sk`, math `Sm`, other `So` -- Separator `Z`: - - line `Zl`, paragraph `Zp`, space `Zs` -- Other `C`: - - control `Cc`, format `Cf`, not assigned `Cn`, private use `Co`, surrogate `Cs` - -```smart header="More information" -Interested to see which characters belong to a property? There's a tool at for that. - -You could also explore properties at [Character Property Index](http://unicode.org/cldr/utility/properties.jsp). - -For the full Unicode Character Database in text format (along with all properties), see . 
-``` - -There are also other derived categories, like: -- `Alphabetic` (`Alpha`), includes Letters `L`, plus letter numbers `Nl` (e.g. roman numbers Ⅻ), plus some other symbols `Other_Alphabetic` (`OAltpa`). -- `Hex_Digit` includes hexadecimal digits: `0-9`, `a-f`. -- ...Unicode is a big beast, it includes a lot of properties. - -For instance, let's look for a 6-digit hex number: - -```js run -let reg = /\p{Hex_Digit}{6}/u; // flag 'u' is required - -alert("color: #123ABC".match(reg)); // 123ABC -``` - -There are also properties with a value. For instance, Unicode "Script" (a writing system) can be Cyrillic, Greek, Arabic, Han (Chinese) etc, the [list is long]("https://en.wikipedia.org/wiki/Script_(Unicode)"). - -To search for characters in certain scripts ("alphabets"), we should supply `Script=`, e.g. to search for cyrillic letters: `\p{sc=Cyrillic}`, for Chinese glyphs: `\p{sc=Han}`, etc: - -```js run -let regexp = /\p{sc=Han}+/gu; // get chinese words - -let str = `Hello Привет 你好 123_456`; - -alert( str.match(regexp) ); // 你好 -``` - -## Building multi-language \w - -The pattern `pattern:\w` means "wordly characters", but doesn't work for languages that use non-Latin alphabets, such as Cyrillic and others. It's just a shorthand for `[a-zA-Z0-9_]`, so `pattern:\w+` won't find any Chinese words etc. - -Let's make a "universal" regexp, that looks for wordly characters in any language. That's easy to do using Unicode properties: - -```js -/[\p{Alphabetic}\p{Mark}\p{Decimal_Number}\p{Connector_Punctuation}\p{Join_Control}]/u -``` - -Let's decipher. Just as `pattern:\w` is the same as `pattern:[a-zA-Z0-9_]`, we're making a set of our own, that includes: - -- `Alphabetic` for letters, -- `Mark` for accents, as in Unicode accents may be represented by separate code points, -- `Decimal_Number` for numbers, -- `Connector_Punctuation` for the `'_'` character and alike, -- `Join_Control` -– two special code points with hex codes `200c` and `200d`, used in ligatures e.g. 
in arabic. - -Or, if we replace long names with aliases (a list of aliases [here](https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt)): - -```js run -let regexp = /([\p{Alpha}\p{M}\p{Nd}\p{Pc}\p{Join_C}]+)/gu; - -let str = `Hello Привет 你好 123_456`; - -alert( str.match(regexp) ); // Hello,Привет,你好,123_456 -``` diff --git a/9-regular-expressions/index.md b/9-regular-expressions/index.md index 7499c584..ac25aaa6 100644 --- a/9-regular-expressions/index.md +++ b/9-regular-expressions/index.md @@ -1,7 +1,3 @@ # Regular expressions Regular expressions is a powerful way of doing search and replace in strings. - -In JavaScript regular expressions are implemented using objects of a built-in `RegExp` class and integrated with strings. - -Please note that regular expressions vary between programming languages. In this tutorial we concentrate on JavaScript. Of course there's a lot in common, but they are a somewhat different in Perl, Ruby, PHP etc. From fc0b18538d7f5117c6f2fbabf026b50660ebe4c7 Mon Sep 17 00:00:00 2001 From: Ilya Kantor Date: Wed, 4 Sep 2019 19:35:17 +0300 Subject: [PATCH 2/7] WIP --- .../1-find-time-hh-mm/solution.md | 4 +-- .../1-find-time-hh-mm/task.md | 10 +++---- .../06-regexp-boundary/article.md | 21 +++++++-------- .../07-regexp-escaping/article.md | 26 +++++++++---------- 4 files changed, 30 insertions(+), 31 deletions(-) diff --git a/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/solution.md b/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/solution.md index d378d4c9..829eda13 100644 --- a/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/solution.md +++ b/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/solution.md @@ -1,6 +1,6 @@ -Ответ: `pattern:\b\d\d:\d\d\b`. +The answer: `pattern:\b\d\d:\d\d\b`. 
```js run -alert( "Завтрак в 09:00 в комнате 123:456.".match( /\b\d\d:\d\d\b/ ) ); // 09:00 +alert( "Breakfast at 09:00 in the room 123:456.".match( /\b\d\d:\d\d\b/ ) ); // 09:00 ``` diff --git a/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/task.md b/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/task.md index 16330a6d..95ab5777 100644 --- a/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/task.md +++ b/9-regular-expressions/06-regexp-boundary/1-find-time-hh-mm/task.md @@ -1,9 +1,9 @@ -# Найдите время +# Find the time -Время имеет формат: `часы:минуты`. И часы, и минуты имеют две цифры, например, `09:00`. +The time has a format: `hours:minutes`. Both hours and minutes have two digits, like `09:00`. -Введите регулярное выражение, чтобы найти время в строке: `subject:Завтрак в 09:00 в комнате 123:456.` +Make a regexp to find time in the string: `subject:Breakfast at 09:00 in the room 123:456.` -P.S. В этой задаче пока нет необходимости проверять правильность времени, поэтому `25:99` также может быть верным результатом. +P.S. In this task there's no need to check time correctness yet, so `25:99` can also be a valid result. -P.P.S. Регулярное выражение не должно находить `123:456`. +P.P.S. The regexp shouldn't match `123:456`. diff --git a/9-regular-expressions/06-regexp-boundary/article.md b/9-regular-expressions/06-regexp-boundary/article.md index 286a963e..e4df252a 100644 --- a/9-regular-expressions/06-regexp-boundary/article.md +++ b/9-regular-expressions/06-regexp-boundary/article.md @@ -25,29 +25,28 @@ So, it matches the pattern `pattern:\bHello\b`, because: 1. At the beginning of the string matches the first test `pattern:\b`. 2. Then matches the word `pattern:Hello`. -3. Then the test `pattern:\b` - matches again, as we're between `subject:o` and a space. +3. Then the test `pattern:\b` matches again, as we're between `subject:o` and a space. -Шаблон `pattern:\bJava\b` также совпадёт. 
Но не `pattern:\bHell\b` (потому что после `subject:l` нет границы слова), и не `pattern:Java!\b` (восклицательный знак не является "символом слова" `pattern:\w`, поэтому после него нет границы слова). +The pattern `pattern:\bJava\b` would also match. But not `pattern:\bHell\b` (because there's no word boundary after `subject:l`) and not `pattern:Java!\b` (because the exclamation sign is not a wordly character `pattern:\w`, so there's no word boundary after it). ```js run alert( "Hello, Java!".match(/\bHello\b/) ); // Hello alert( "Hello, Java!".match(/\bJava\b/) ); // Java -alert( "Hello, Java!".match(/\bHell\b/) ); // null (нет совпадения) -alert( "Hello, Java!".match(/\bJava!\b/) ); // null (нет совпадения) +alert( "Hello, Java!".match(/\bHell\b/) ); // null (no match) +alert( "Hello, Java!".match(/\bJava!\b/) ); // null (no match) ``` -Так как `pattern:\b` является проверкой, то не добавляет символ после границы к результату. +We can use `pattern:\b` not only with words, but with digits as well. -Мы можем использовать `pattern:\b` не только со словами, но и с цифрами. - -Например, регулярное выражение `pattern:\b\d\d\b` ищет отдельно стоящие двузначные числа. Другими словами, оно требует, чтобы до и после `pattern:\d\d` был символ, отличный от `pattern:\w` (или начало/конец строки) +For example, the pattern `pattern:\b\d\d\b` looks for standalone 2-digit numbers. In other words, it looks for 2-digit numbers that are surrounded by characters different from `pattern:\w`, such as spaces or punctuation (or text start/end). ```js run alert( "1 23 456 78".match(/\b\d\d\b/g) ); // 23,78 +alert( "12,34,56".match(/\b\d\d\b/g) ); // 12,34,56 ``` -```warn header="Граница слова `pattern:\b` не работает для алфавитов, не основанных на латинице" -Проверка границы слова `pattern:\b` проверяет границу, должно быть `pattern:\w` с одной стороны и "не `pattern:\w`" - с другой. 
+```warn header="Word boundary `pattern:\b` doesn't work for non-latin alphabets" +The word boundary test `pattern:\b` checks that there should be `pattern:\w` on the one side from the position and "not `pattern:\w`" - on the other side. -Но `pattern:\w` означает латинскую букву (или цифру или знак подчёркивания), поэтому проверка не будет работать для других символов (например, кириллицы или иероглифов). +But `pattern:\w` means a latin letter `a-z` (or a digit or an underscore), so the test doesn't work for other characters, e.g. cyrillic letters or hieroglyphs. ``` diff --git a/9-regular-expressions/07-regexp-escaping/article.md b/9-regular-expressions/07-regexp-escaping/article.md index cd118010..5169bd55 100644 --- a/9-regular-expressions/07-regexp-escaping/article.md +++ b/9-regular-expressions/07-regexp-escaping/article.md @@ -1,7 +1,7 @@ # Escaping, special characters -As we've seen, a backslash `"\"` is used to denote character classes. So it's a special character in regexps (just like in a regular string). +As we've seen, a backslash `pattern:\` is used to denote character classes, e.g. `pattern:\d`. So it's a special character in regexps (just like in regular strings). There are other special characters as well, that have special meaning in a regexp. They are used to do more powerful searches. Here's a full list of them: `pattern:[ \ ^ $ . | ? * + ( )`. @@ -9,7 +9,7 @@ Don't try to remember the list -- soon we'll deal with each of them separately a ## Escaping -Let's say we want to find a dot literally. Not "any character", but just a dot. +Let's say we want to find literally a dot. Not "any character", but just a dot. To use a special character as a regular one, prepend it with a backslash: `pattern:\.`. 
@@ -43,11 +43,11 @@ Here's what a search for a slash `'/'` looks like: alert( "/".match(/\//) ); // '/' ``` -On the other hand, if we're not using `/.../`, but create a regexp using `new RegExp`, then we don't need to escape it: +On the other hand, if we're not using `pattern:/.../`, but create a regexp using `new RegExp`, then we don't need to escape it: ```js run -alert( "/".match(new RegExp("/")) ); // '/' -``` +alert( "/".match(new RegExp("/")) ); // finds / +``` ## new RegExp @@ -61,25 +61,25 @@ let reg = new RegExp("\d\.\d"); alert( "Chapter 5.1".match(reg) ); // null ``` -The search worked with `pattern:/\d\.\d/`, but with `new RegExp("\d\.\d")` it doesn't work, why? +A similar search in one of the previous examples worked with `pattern:/\d\.\d/`, but `new RegExp("\d\.\d")` doesn't work, why? -The reason is that backslashes are "consumed" by a string. Remember, regular strings have their own special characters like `\n`, and a backslash is used for escaping. +The reason is that backslashes are "consumed" by a string. As we may recall, regular strings have their own special characters, such as `\n`, and a backslash is used for escaping. -Please, take a look, what "\d\.\d" really is: +Here's how "\d\.\d" is perceived: ```js run alert("\d\.\d"); // d.d ``` -The quotes "consume" backslashes and interpret them, for instance: +String quotes "consume" backslashes and interpret them on their own, for instance: - `\n` -- becomes a newline character, - `\u1234` -- becomes the Unicode character with such code, - ...And when there's no special meaning: like `pattern:\d` or `\z`, then the backslash is simply removed. -So the call to `new RegExp` gets a string without backslashes. That's why the search doesn't work! +So `new RegExp` gets a string without backslashes. That's why the search doesn't work! 
-To fix it, we need to double backslashes, because quotes turn `\\` into `\`: +To fix it, we need to double backslashes, because string quotes turn `\\` into `\`: ```js run *!* @@ -94,6 +94,6 @@ alert( "Chapter 5.1".match(reg) ); // 5.1 ## Summary -- To search special characters `pattern:[ \ ^ $ . | ? * + ( )` literally, we need to prepend them with `\` ("escape them"). +- To search for special characters `pattern:[ \ ^ $ . | ? * + ( )` literally, we need to prepend them with a backslash `\` ("escape them"). - We also need to escape `/` if we're inside `pattern:/.../` (but not inside `new RegExp`). -- When passing a string `new RegExp`, we need to double backslashes `\\`, cause strings consume one of them. +- When passing a string to `new RegExp`, we need to double backslashes `\\`, because string quotes consume one of them. From 20547570ff77a116ad8769006b809b126007f92f Mon Sep 17 00:00:00 2001 From: Ilya Kantor Date: Thu, 5 Sep 2019 14:57:06 +0300 Subject: [PATCH 3/7] WIP --- .../02-regexp-character-classes/article.md | 2 +- .../article.md | 121 ++++++-- .../09-regexp-quantifiers/article.md | 64 ++-- .../3-find-html-comments/solution.md | 8 +- .../10-regexp-greedy-and-lazy/article.md | 45 ++- .../1-find-webcolor-3-or-6/solution.md | 8 +- .../1-find-webcolor-3-or-6/task.md | 2 +- .../2-find-decimal-numbers/solution.md | 2 +- .../5-parse-expression/solution.md | 19 +- .../11-regexp-groups/article.md | 290 ++++++++++++------ ...s.svg => regexp-nested-groups-matches.svg} | 0 .../regexp-nested-groups-pattern.svg | 1 + 12 files changed, 376 insertions(+), 186 deletions(-) rename 9-regular-expressions/11-regexp-groups/{regexp-nested-groups.svg => regexp-nested-groups-matches.svg} (100%) create mode 100644 9-regular-expressions/11-regexp-groups/regexp-nested-groups-pattern.svg diff --git a/9-regular-expressions/02-regexp-character-classes/article.md b/9-regular-expressions/02-regexp-character-classes/article.md index 881b6ba2..34240b6e 100644 --- 
a/9-regular-expressions/02-regexp-character-classes/article.md +++ b/9-regular-expressions/02-regexp-character-classes/article.md @@ -41,7 +41,7 @@ Most used are: : A digit: a character from `0` to `9`. `pattern:\s` ("s" is from "space") -: A space symbol: includes spaces, tabs `\t`, newlines `\n` and few other rare characters: `\v`, `\f` and `\r`. +: A space symbol: includes spaces, tabs `\t`, newlines `\n` and few other rare characters, such as `\v`, `\f` and `\r`. `pattern:\w` ("w" is from "word") : A "wordly" character: either a letter of Latin alphabet or a digit or an underscore `_`. Non-Latin letters (like cyrillic or hindi) do not belong to `pattern:\w`. diff --git a/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md b/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md index 3a94125c..6beca62b 100644 --- a/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md +++ b/9-regular-expressions/08-regexp-character-sets-and-ranges/article.md @@ -22,7 +22,7 @@ So the example below gives no matches: alert( "Voila".match(/V[oi]la/) ); // null, no matches ``` -The pattern assumes: +The pattern searches for: - `pattern:V`, - then *one* of the letters `pattern:[oi]`, @@ -42,23 +42,56 @@ In the example below we're searching for `"x"` followed by two digits or letters alert( "Exception 0xAF".match(/x[0-9A-F][0-9A-F]/g) ); // xAF ``` -Please note that in the word `subject:Exception` there's a substring `subject:xce`. It didn't match the pattern, because the letters are lowercase, while in the set `pattern:[0-9A-F]` they are uppercase. +Here `pattern:[0-9A-F]` has two ranges: it searches for a character that is either a digit from `0` to `9` or a letter from `A` to `F`. -If we want to find it too, then we can add a range `a-f`: `pattern:[0-9A-Fa-f]`. The `pattern:i` flag would allow lowercase too. +If we'd like to look for lowercase letters as well, we can add the range `a-f`: `pattern:[0-9A-Fa-f]`. 
Or add the flag `pattern:i`. -**Character classes are shorthands for certain character sets.** +We can also use character classes inside `[…]`. +For instance, if we'd like to look for a wordly character `pattern:\w` or a hyphen `pattern:-`, then the set is `pattern:[\w-]`. + +Combining multiple classes is also possible, e.g. `pattern:[\s\d]` means "a space character or a digit". + +```smart header="Character classes are shorthands for certain character sets" For instance: - **\d** -- is the same as `pattern:[0-9]`, - **\w** -- is the same as `pattern:[a-zA-Z0-9_]`, -- **\s** -- is the same as `pattern:[\t\n\v\f\r ]` plus few other unicode space characters. +- **\s** -- is the same as `pattern:[\t\n\v\f\r ]`, plus few other rare unicode space characters. +``` -We can use character classes inside `[…]` as well. +### Example: multi-language \w -For instance, we want to match all wordly characters or a dash, for words like "twenty-third". We can't do it with `pattern:\w+`, because `pattern:\w` class does not include a dash. But we can use `pattern:[\w-]`. +As the character class `pattern:\w` is a shorthand for `pattern:[a-zA-Z0-9_]`, it can't find Chinese hieroglyphs, Cyrillic letters, etc. -We also can use several classes, for example `pattern:[\s\S]` matches spaces or non-spaces -- any character. That's wider than a dot `"."`, because the dot matches any character except a newline (unless `pattern:s` flag is set). +We can write a more universal pattern, that looks for wordly characters in any language. That's easy with unicode properties: `pattern:[\p{Alpha}\p{M}\p{Nd}\p{Pc}\p{Join_C}]`. + +Let's decipher it. 
Similar to `pattern:\w`, we're making a set of our own that includes characters with the following unicode properties: + +- `Alphabetic` (`Alpha`) - for letters, +- `Mark` (`M`) - for accents, +- `Decimal_Number` (`Nd`) - for digits, +- `Connector_Punctuation` (`Pc`) - for the underscore `'_'` and similar characters, +- `Join_Control` (`Join_C`) - two special codes `200c` and `200d`, used in ligatures, e.g. in Arabic. + +An example of use: + +```js run +let regexp = /[\p{Alpha}\p{M}\p{Nd}\p{Pc}\p{Join_C}]/gu; + +let str = `Hi 你好 12`; + +// finds all letters and digits: +alert( str.match(regexp) ); // H,i,你,好,1,2 +``` + +Of course, we can edit this pattern: add unicode properties or remove them. Unicode properties are covered in more detail in the article <info:regexp-unicode>. + +```warn header="Unicode properties aren't supported in Edge and Firefox" +Unicode properties `pattern:\p{…}` are not yet implemented in Edge and Firefox. If we really need them, we can use the library [XRegExp](http://xregexp.com/). + +Or just use ranges of characters in a language that interests us, e.g. `pattern:[а-я]` for Cyrillic letters. +``` ## Excluding ranges @@ -78,22 +111,20 @@ The example below looks for any characters except letters, digits and spaces: alert( "alice15@gmail.com".match(/[^\d\sA-Z]/gi) ); // @ and . ``` -## No escaping in […] +## Escaping in […] -Usually when we want to find exactly the dot character, we need to escape it like `pattern:\.`. And if we need a backslash, then we use `pattern:\\`. +Usually when we want to find exactly a special character, we need to escape it like `pattern:\.`. And if we need a backslash, then we use `pattern:\\`, and so on. -In square brackets the vast majority of special characters can be used without escaping: +In square brackets we can use the vast majority of special characters without escaping: -- A dot `pattern:'.'`. -- A plus `pattern:'+'`. -- Parentheses `pattern:'( )'`. 
-- Dash `pattern:'-'` in the beginning or the end (where it does not define a range). -- A caret `pattern:'^'` if not in the beginning (where it means exclusion). -- And the opening square bracket `pattern:'['`. +- Symbols `pattern:. + ( )` never need escaping. +- A hyphen `pattern:-` is not escaped in the beginning or the end (where it does not define a range). +- A caret `pattern:^` is only escaped in the beginning (where it means exclusion). +- The closing square bracket `pattern:]` is always escaped (if we need to look for that symbol). -In other words, all special characters are allowed except where they mean something for square brackets. +In other words, all special characters are allowed without escaping, except when they mean something for square brackets. -A dot `"."` inside square brackets means just a dot. The pattern `pattern:[.,]` would look for one of characters: either a dot or a comma. +A dot `.` inside square brackets means just a dot. The pattern `pattern:[.,]` would look for one of characters: either a dot or a comma. In the example below the regexp `pattern:[-().^+]` looks for one of the characters `-().^+`: @@ -112,3 +143,55 @@ let reg = /[\-\(\)\.\^\+]/g; alert( "1 + 2 - 3".match(reg) ); // also works: +, - ``` + +## Ranges and flag "u" + +If there are surrogate pairs in the set, flag `pattern:u` is required for them to work correctly. + +For instance, let's look for `pattern:[𝒳𝒴]` in the string `subject:𝒳`: + +```js run +alert( '𝒳'.match(/[𝒳𝒴]/) ); // shows a strange character, like [?] +// (the search was performed incorrectly, half-character returned) +``` + +The result is incorrect, because by default regular expressions "don't know" about surrogate pairs. + +The regular expression engine thinks that `[𝒳𝒴]` -- are not two, but four characters: +1. left half of `𝒳` `(1)`, +2. right half of `𝒳` `(2)`, +3. left half of `𝒴` `(3)`, +4. right half of `𝒴` `(4)`. 
+ +We can see their codes like this: + +```js run +for(let i=0; i<'𝒳𝒴'.length; i++) { + alert('𝒳𝒴'.charCodeAt(i)); // 55349, 56499, 55349, 56500 +}; +``` + +So, the example above finds and shows the left half of `𝒳`. + +If we add flag `pattern:u`, then the behavior will be correct: + +```js run +alert( '𝒳'.match(/[𝒳𝒴]/u) ); // 𝒳 +``` + +The similar situation occurs when looking for a range, such as `[𝒳-𝒴]`. + +If we forget to add flag `pattern:u`, there will be an error: + +```js run +'𝒳'.match(/[𝒳-𝒴]/); // Error: Invalid regular expression +``` + +The reason is that without flag `pattern:u` surrogate pairs are perceived as two characters, so `[𝒳-𝒴]` is interpreted as `[<55349><56499>-<55349><56500>]` (every surrogate pair is replaced with its codes). Now it's easy to see that the range `56499-55349` is invalid: its starting code `56499` is greater than the end `55349`. That's the formal reason for the error. + +With the flag `pattern:u` the pattern works correctly: + +```js run +// look for characters from 𝒳 to 𝒵 +alert( '𝒴'.match(/[𝒳-𝒵]/u) ); // 𝒴 +``` diff --git a/9-regular-expressions/09-regexp-quantifiers/article.md b/9-regular-expressions/09-regexp-quantifiers/article.md index 9b70d722..1a7eecfe 100644 --- a/9-regular-expressions/09-regexp-quantifiers/article.md +++ b/9-regular-expressions/09-regexp-quantifiers/article.md @@ -2,7 +2,7 @@ Let's say we have a string like `+7(903)-123-45-67` and want to find all numbers in it. But unlike before, we are interested not in single digits, but full numbers: `7, 903, 123, 45, 67`. -A number is a sequence of 1 or more digits `pattern:\d`. To mark how many we need, we need to append a *quantifier*. +A number is a sequence of 1 or more digits `pattern:\d`. To mark how many we need, we can append a *quantifier*. 
## Quantity {n} @@ -12,7 +12,7 @@ A quantifier is appended to a character (or a character class, or a `[...]` set It has a few advanced forms, let's see examples: -The exact count: `{5}` +The exact count: `pattern:{5}` : `pattern:\d{5}` denotes exactly 5 digits, the same as `pattern:\d\d\d\d\d`. The example below looks for a 5-digit number: @@ -23,7 +23,7 @@ The exact count: `{5}` We can add `\b` to exclude longer numbers: `pattern:\b\d{5}\b`. -The range: `{3,5}`, match 3-5 times +The range: `pattern:{3,5}`, match 3-5 times : To find numbers from 3 to 5 digits we can put the limits into curly braces: `pattern:\d{3,5}` ```js run @@ -54,8 +54,8 @@ alert(numbers); // 7,903,123,45,67 There are shorthands for most used quantifiers: -`+` -: Means "one or more", the same as `{1,}`. +`pattern:+` +: Means "one or more", the same as `pattern:{1,}`. For instance, `pattern:\d+` looks for numbers: @@ -65,8 +65,8 @@ There are shorthands for most used quantifiers: alert( str.match(/\d+/g) ); // 7,903,123,45,67 ``` -`?` -: Means "zero or one", the same as `{0,1}`. In other words, it makes the symbol optional. +`pattern:?` +: Means "zero or one", the same as `pattern:{0,1}`. In other words, it makes the symbol optional. For instance, the pattern `pattern:ou?r` looks for `match:o` followed by zero or one `match:u`, and then `match:r`. @@ -78,16 +78,16 @@ There are shorthands for most used quantifiers: alert( str.match(/colou?r/g) ); // color, colour ``` -`*` -: Means "zero or more", the same as `{0,}`. That is, the character may repeat any times or be absent. +`pattern:*` +: Means "zero or more", the same as `pattern:{0,}`. That is, the character may repeat any times or be absent. 
- For example, `pattern:\d0*` looks for a digit followed by any number of zeroes: + For example, `pattern:\d0*` looks for a digit followed by any number of zeroes (may be many or none): ```js run alert( "100 10 1".match(/\d0*/g) ); // 100, 10, 1 ``` - Compare it with `'+'` (one or more): + Compare it with `pattern:+` (one or more): ```js run alert( "100 10 1".match(/\d0+/g) ); // 100, 10 @@ -98,43 +98,45 @@ There are shorthands for most used quantifiers: Quantifiers are used very often. They serve as the main "building block" of complex regular expressions, so let's see more examples. -Regexp "decimal fraction" (a number with a floating point): `pattern:\d+\.\d+` -: In action: - ```js run - alert( "0 1 12.345 7890".match(/\d+\.\d+/g) ); // 12.345 - ``` +**Regexp for decimal fractions (a number with a floating point): `pattern:\d+\.\d+`** -Regexp "open HTML-tag without attributes", like `<span>` or `<p>`: `pattern:/<[a-z]+>/i` -: In action: +In action: +```js run +alert( "0 1 12.345 7890".match(/\d+\.\d+/g) ); // 12.345 +``` + +**Regexp for an "opening HTML-tag without attributes", such as `<span>` or `<p>`.** + +1. The simplest one: `pattern:/<[a-z]+>/i` ```js run alert( "<body> ... </body>".match(/<[a-z]+>/gi) ); // <body> ``` - We look for character `pattern:'<'` followed by one or more Latin letters, and then `pattern:'>'`. + The regexp looks for character `pattern:'<'` followed by one or more Latin letters, and then `pattern:'>'`. -Regexp "open HTML-tag without attributes" (improved): `pattern:/<[a-z][a-z0-9]*>/i` -: Better regexp: according to the standard, HTML tag name may have a digit at any position except the first one, like `<h1>`. +2. Improved: `pattern:/<[a-z][a-z0-9]*>/i` + + According to the standard, HTML tag name may have a digit at any position except the first one, like `<h1>`. ```js run alert( "<h1>Hi!</h1>".match(/<[a-z][a-z0-9]*>/gi) ); // <h1> ``` -Regexp "opening or closing HTML-tag without attributes": `pattern:/<\/?[a-z][a-z0-9]*>/i` -: We added an optional slash `pattern:/?` before the tag. Had to escape it with a backslash, otherwise JavaScript would think it is the pattern end. +**Regexp "opening or closing HTML-tag without attributes": `pattern:/<\/?[a-z][a-z0-9]*>/i`** - ```js run - alert( "<h1>Hi!</h1>".match(/<\/?[a-z][a-z0-9]*>/gi) ); // <h1>, </h1> - ``` +We added an optional slash `pattern:/?` near the beginning of the pattern. Had to escape it with a backslash, otherwise JavaScript would think it is the pattern end. + +```js run +alert( "<h1>Hi!</h1>".match(/<\/?[a-z][a-z0-9]*>/gi) ); // <h1>, </h1> +``` ```smart header="To make a regexp more precise, we often need make it more complex" We can see one common rule in these examples: the more precise is the regular expression -- the longer and more complex it is. -For instance, for HTML tags we could use a simpler regexp: `pattern:<\w+>`. +For instance, for HTML tags we could use a simpler regexp: `pattern:<\w+>`. But as HTML has stricter restrictions for a tag name, `pattern:<[a-z][a-z0-9]*>` is more reliable. -...But because `pattern:\w` means any Latin letter or a digit or `'_'`, the regexp also matches non-tags, for instance `match:<_>`. So it's much simpler than `pattern:<[a-z][a-z0-9]*>`, but less reliable. +Can we use `pattern:<\w+>` or we need `pattern:<[a-z][a-z0-9]*>`? -Are we ok with `pattern:<\w+>` or we need `pattern:<[a-z][a-z0-9]*>`? - -In real life both variants are acceptable. Depends on how tolerant we can be to "extra" matches and whether it's difficult or not to filter them out by other means. +In real life both variants are acceptable. Depends on how tolerant we can be to "extra" matches and whether it's difficult or not to remove them from the result by other means. ``` diff --git a/9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/solution.md b/9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/solution.md index c066f3e3..b3290607 100644 --- a/9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/solution.md +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/3-find-html-comments/solution.md @@ -1,13 +1,11 @@ We need to find the beginning of the comment `match:<!--`, then everything till the end of `match:-->`. -The first idea could be `pattern:<!--.*?-->` -- the lazy quantifier makes the dot stop right before `match:-->`. +An acceptable variant is `pattern:<!--.*?-->` -- the lazy quantifier makes the dot stop right before `match:-->`. We also need to add flag `pattern:s` for the dot to include newlines. -But a dot in JavaScript means "any symbol except the newline". So multiline comments won't be found. 
- -We can use `pattern:[\s\S]` instead of the dot to match "anything": +Otherwise multiline comments won't be found: ```js run -let reg = /<!--[\s\S]*?-->/g; +let reg = /<!--.*?-->/gs; let str = `... <!-- My -- comment test --> .. <!----> .. diff --git a/9-regular-expressions/10-regexp-greedy-and-lazy/article.md b/9-regular-expressions/10-regexp-greedy-and-lazy/article.md index 5670e57b..e014c16d 100644 --- a/9-regular-expressions/10-regexp-greedy-and-lazy/article.md +++ b/9-regular-expressions/10-regexp-greedy-and-lazy/article.md @@ -8,7 +8,7 @@ Let's take the following task as an example. We have a text and need to replace all quotes `"..."` with guillemet marks: `«...»`. They are preferred for typography in many countries. -For instance: `"Hello, world"` should become `«Hello, world»`. Some countries prefer other quotes, like `„Witam, świat!”` (Polish) or `「你好,世界」` (Chinese), but for our task let's choose `«...»`. +For instance: `"Hello, world"` should become `«Hello, world»`. There exist other quotes, such as `„Witam, świat!”` (Polish) or `「你好,世界」` (Chinese), but for our task let's choose `«...»`. The first thing to do is to locate quoted strings, and then we can replace them. @@ -35,7 +35,7 @@ That can be described as "greediness is the cause of all evil". To find a match, the regular expression engine uses the following algorithm: - For every position in the string - - Match the pattern at that position. + - Try to match the pattern at that position. - If there's no match, go to the next position. These common words do not make it obvious why the regexp fails, so let's elaborate how the search works for the pattern `pattern:".+"`. @@ -44,7 +44,7 @@ These common words do not make it obvious why the regexp fails, so let's elabora The regular expression engine tries to find it at the zero position of the source string `subject:a "witch" and her "broom" is one`, but there's `subject:a` there, so there's immediately no match. 
- Then it advances: goes to the next positions in the source string and tries to find the first character of the pattern there, and finally finds the quote at the 3rd position: + Then it advances: goes to the next positions in the source string and tries to find the first character of the pattern there, fails again, and finally finds the quote at the 3rd position: ![](witch_greedy1.svg) @@ -54,13 +54,13 @@ These common words do not make it obvious why the regexp fails, so let's elabora ![](witch_greedy2.svg) -3. Then the dot repeats because of the quantifier `pattern:.+`. The regular expression engine builds the match by taking characters one by one while it is possible. +3. Then the dot repeats because of the quantifier `pattern:.+`. The regular expression engine adds to the match one character after another. - ...When does it become impossible? All characters match the dot, so it only stops when it reaches the end of the string: + ...Until when? All characters match the dot, so it only stops when it reaches the end of the string: ![](witch_greedy3.svg) -4. Now the engine finished repeating for `pattern:.+` and tries to find the next character of the pattern. It's the quote `pattern:"`. But there's a problem: the string has finished, there are no more characters! +4. Now the engine finished repeating `pattern:.+` and tries to find the next character of the pattern. It's the quote `pattern:"`. But there's a problem: the string has finished, there are no more characters! The regular expression engine understands that it took too many `pattern:.+` and starts to *backtrack*. @@ -68,9 +68,9 @@ These common words do not make it obvious why the regexp fails, so let's elabora ![](witch_greedy4.svg) - Now it assumes that `pattern:.+` ends one character before the end and tries to match the rest of the pattern from that position. + Now it assumes that `pattern:.+` ends one character before the string end and tries to match the rest of the pattern from that position. 
- If there were a quote there, then that would be the end, but the last character is `subject:'e'`, so there's no match. + If there were a quote there, then the search would end, but the last character is `subject:'e'`, so there's no match. 5. ...So the engine decreases the number of repetitions of `pattern:.+` by one more character: @@ -84,19 +84,19 @@ These common words do not make it obvious why the regexp fails, so let's elabora 7. The match is complete. -8. So the first match is `match:"witch" and her "broom"`. The further search starts where the first match ends, but there are no more quotes in the rest of the string `subject:is one`, so no more results. +8. So the first match is `match:"witch" and her "broom"`. If the regular expression has flag `pattern:g`, then the search will continue from where the first match ends. There are no more quotes in the rest of the string `subject:is one`, so no more results. That's probably not what we expected, but that's how it works. -**In the greedy mode (by default) the quantifier is repeated as many times as possible.** +**In the greedy mode (by default) a quantifier is repeated as many times as possible.** -The regexp engine tries to fetch as many characters as it can by `pattern:.+`, and then shortens that one by one. +The regexp engine adds to the match as many characters as it can for `pattern:.+`, and then shortens that one by one, if the rest of the pattern doesn't match. -For our task we want another thing. That's what the lazy quantifier mode is for. +For our task we want another thing. That's where a lazy mode can help. ## Lazy mode -The lazy mode of quantifier is an opposite to the greedy mode. It means: "repeat minimal number of times". +The lazy mode of quantifiers is an opposite to the greedy mode. It means: "repeat minimal number of times". We can enable it by putting a question mark `pattern:'?'` after the quantifier, so that it becomes `pattern:*?` or `pattern:+?` or even `pattern:??` for `pattern:'?'`. 
@@ -149,20 +149,19 @@ Other quantifiers remain greedy. For instance: ```js run -alert( "123 456".match(/\d+ \d+?/g) ); // 123 4 +alert( "123 456".match(/\d+ \d+?/) ); // 123 4 ``` -1. The pattern `pattern:\d+` tries to match as many numbers as it can (greedy mode), so it finds `match:123` and stops, because the next character is a space `pattern:' '`. -2. Then there's a space in pattern, it matches. +1. The pattern `pattern:\d+` tries to match as many digits as it can (greedy mode), so it finds `match:123` and stops, because the next character is a space `pattern:' '`. +2. Then there's a space in the pattern, it matches. 3. Then there's `pattern:\d+?`. The quantifier is in lazy mode, so it finds one digit `match:4` and tries to check if the rest of the pattern matches from there. ...But there's nothing in the pattern after `pattern:\d+?`. The lazy mode doesn't repeat anything without a need. The pattern finished, so we're done. We have a match `match:123 4`. -4. The next search starts from the character `5`. ```smart header="Optimizations" -Modern regular expression engines can optimize internal algorithms to work faster. So they may work a bit different from the described algorithm. +Modern regular expression engines can optimize internal algorithms to work faster. So they may work a bit differently from the described algorithm. But to understand how regular expressions work and to build regular expressions, we don't need to know about that. They are only used internally to optimize things. @@ -264,7 +263,7 @@ That's what's going on: 2. Then it looks for `pattern:.*?`: takes one character (lazily!), check if there's a match for `pattern:" class="doc">` (none). 3. Then takes another character into `pattern:.*?`, and so on... until it finally reaches `match:" class="doc">`. -But the problem is: that's already beyond the link, in another tag `

`. Not what we want. +But the problem is: that's already beyond the link ``, in another tag `

`. Not what we want. Here's the picture of the match aligned with the text: @@ -273,11 +272,9 @@ Here's the picture of the match aligned with the text: ...

``` -So the laziness did not work for us here. +So, we need the pattern to look for ``, but both greedy and lazy variants have problems. -We need the pattern to look for ``, but both greedy and lazy variants have problems. - -The correct variant would be: `pattern:href="[^"]*"`. It will take all characters inside the `href` attribute till the nearest quote, just what we need. +The correct variant can be: `pattern:href="[^"]*"`. It will take all characters inside the `href` attribute till the nearest quote, just what we need. A working example: @@ -301,4 +298,4 @@ Greedy Lazy : Enabled by the question mark `pattern:?` after the quantifier. The regexp engine tries to match the rest of the pattern before each repetition of the quantifier. -As we've seen, the lazy mode is not a "panacea" from the greedy search. An alternative is a "fine-tuned" greedy search, with exclusions. Soon we'll see more examples of it. +As we've seen, the lazy mode is not a "panacea" from the greedy search. An alternative is a "fine-tuned" greedy search, with exclusions, as in the pattern `pattern:"[^"]+"`. diff --git a/9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/solution.md b/9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/solution.md index d653ff97..e173aba6 100644 --- a/9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/solution.md +++ b/9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/solution.md @@ -1,12 +1,10 @@ A regexp to search 3-digit color `#abc`: `pattern:/#[a-f0-9]{3}/i`. -We can add exactly 3 more optional hex digits. We don't need more or less. Either we have them or we don't. +We can add exactly 3 more optional hex digits. We don't need more or less. The color has either 3 or 6 digits. -The simplest way to add them -- is to append to the regexp: `pattern:/#[a-f0-9]{3}([a-f0-9]{3})?/i` +Let's use the quantifier `pattern:{1,2}` for that: we'll have `pattern:/#([a-f0-9]{3}){1,2}/i`. 
-We can do it in a smarter way though: `pattern:/#([a-f0-9]{3}){1,2}/i`. - -Here the regexp `pattern:[a-f0-9]{3}` is in parentheses to apply the quantifier `pattern:{1,2}` to it as a whole. +Here the pattern `pattern:[a-f0-9]{3}` is enclosed in parentheses to apply the quantifier `pattern:{1,2}`. In action: diff --git a/9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/task.md b/9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/task.md index 4efd6f61..d87914e9 100644 --- a/9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/task.md +++ b/9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/task.md @@ -11,4 +11,4 @@ let str = "color: #3f3; background-color: #AA00ef; and: #abcd"; alert( str.match(reg) ); // #3f3 #AA00ef ``` -P.S. This should be exactly 3 or 6 hex digits: values like `#abcd` should not match. +P.S. This should be exactly 3 or 6 hex digits. Values with 4 digits, such as `#abcd`, should not match. diff --git a/9-regular-expressions/11-regexp-groups/2-find-decimal-numbers/solution.md b/9-regular-expressions/11-regexp-groups/2-find-decimal-numbers/solution.md index dd241084..3155f13c 100644 --- a/9-regular-expressions/11-regexp-groups/2-find-decimal-numbers/solution.md +++ b/9-regular-expressions/11-regexp-groups/2-find-decimal-numbers/solution.md @@ -1,6 +1,6 @@ A positive number with an optional decimal part is (per previous task): `pattern:\d+(\.\d+)?`. -Let's add an optional `-` in the beginning: +Let's add the optional `pattern:-` in the beginning: ```js run let reg = /-?\d+(\.\d+)?/g; diff --git a/9-regular-expressions/11-regexp-groups/5-parse-expression/solution.md b/9-regular-expressions/11-regexp-groups/5-parse-expression/solution.md index 7707edb0..decb074d 100644 --- a/9-regular-expressions/11-regexp-groups/5-parse-expression/solution.md +++ b/9-regular-expressions/11-regexp-groups/5-parse-expression/solution.md @@ -1,16 +1,19 @@ A regexp for a number is: `pattern:-?\d+(\.\d+)?`. 
We created it in previous tasks. -An operator is `pattern:[-+*/]`. +An operator is `pattern:[-+*/]`. The hyphen `pattern:-` goes first in the square brackets, because in the middle it would mean a character range, while we just want a character `-`. -Please note: -- Here the dash `pattern:-` goes first in the brackets, because in the middle it would mean a character range, while we just want a character `-`. -- A slash `/` should be escaped inside a JavaScript regexp `pattern:/.../`, we'll do that later. +The slash `/` should be escaped inside a JavaScript regexp `pattern:/.../`, we'll do that later. We need a number, an operator, and then another number. And optional spaces between them. The full regular expression: `pattern:-?\d+(\.\d+)?\s*[-+*/]\s*-?\d+(\.\d+)?`. -To get a result as an array let's put parentheses around the data that we need: numbers and the operator: `pattern:(-?\d+(\.\d+)?)\s*([-+*/])\s*(-?\d+(\.\d+)?)`. +It has 3 parts, with `pattern:\s*` between them: +1. `pattern:-?\d+(\.\d+)?` - the first number, +1. `pattern:[-+*/]` - the operator, +1. `pattern:-?\d+(\.\d+)?` - the second number. + +To make each of these parts a separate element of the result array, let's enclose them in parentheses: `pattern:(-?\d+(\.\d+)?)\s*([-+*/])\s*(-?\d+(\.\d+)?)`. In action: @@ -29,11 +32,11 @@ The result includes: - `result[4] == "12"` (forth group `(-?\d+(\.\d+)?)` -- the second number) - `result[5] == undefined` (fifth group `(\.\d+)?` -- the last decimal part is absent, so it's undefined) -We only want the numbers and the operator, without the full match or the decimal parts. +We only want the numbers and the operator, without the full match or the decimal parts, so let's "clean" the result a bit. -The full match (the arrays first item) can be removed by shifting the array `pattern:result.shift()`. +The full match (the arrays first item) can be removed by shifting the array `result.shift()`. 
-The decimal groups can be removed by making them into non-capturing groups, by adding `pattern:?:` to the beginning: `pattern:(?:\.\d+)?`. +Groups that contain decimal parts (numbers 2 and 5) `pattern:(\.\d+)?` can be excluded by adding `pattern:?:` to the beginning: `pattern:(?:\.\d+)?`. The final solution: diff --git a/9-regular-expressions/11-regexp-groups/article.md b/9-regular-expressions/11-regexp-groups/article.md index 0d858e44..9a3bb04f 100644 --- a/9-regular-expressions/11-regexp-groups/article.md +++ b/9-regular-expressions/11-regexp-groups/article.md @@ -4,83 +4,92 @@ A part of a pattern can be enclosed in parentheses `pattern:(...)`. This is call That has two effects: -1. It allows to place a part of the match into a separate array. -2. If we put a quantifier after the parentheses, it applies to the parentheses as a whole, not the last character. +1. It allows to get a part of the match as a separate item in the result array. +2. If we put a quantifier after the parentheses, it applies to the parentheses as a whole. -## Example +## Examples -In the example below the pattern `pattern:(go)+` finds one or more `match:'go'`: +Let's see how parentheses work in examples. + +### Example: gogogo + +Without parentheses, the pattern `pattern:go+` means `subject:g` character, followed by `subject:o` repeated one or more times. For instance, `match:goooo` or `match:gooooooooo`. + +Parentheses group characters together, so `pattern:(go)+` means `match:go`, `match:gogo`, `match:gogogo` and so on. ```js run alert( 'Gogogo now!'.match(/(go)+/i) ); // "Gogogo" ``` -Without parentheses, the pattern `pattern:/go+/` means `subject:g`, followed by `subject:o` repeated one or more times. For instance, `match:goooo` or `match:gooooooooo`. +### Example: domain -Parentheses group the word `pattern:(go)` together. +Let's make something more complex -- a regular expression to search for a website domain. -Let's make something more complex -- a regexp to match an email. 
- -Examples of emails: +For example: ``` -my@mail.com -john.smith@site.com.uk +mail.com +users.mail.com +smith.users.mail.com ``` -The pattern: `pattern:[-.\w]+@([\w-]+\.)+[\w-]{2,20}`. +As we can see, a domain consists of repeated words, a dot after each one except the last one. -1. The first part `pattern:[-.\w]+` (before `@`) may include any alphanumeric word characters, a dot and a dash, to match `match:john.smith`. -2. Then `pattern:@`, and the domain. It may be a subdomain like `host.site.com.uk`, so we match it as "a word followed by a dot `pattern:([\w-]+\.)` (repeated), and then the last part must be a word: `match:com` or `match:uk` (but not very long: 2-20 characters). - -That regexp is not perfect, but good enough to fix errors or occasional mistypes. - -For instance, we can find all emails in the string: +In regular expressions that's `pattern:(\w+\.)+\w+`: ```js run -let reg = /[-.\w]+@([\w-]+\.)+[\w-]{2,20}/g; +let regexp = /(\w+\.)+\w+/g; + +alert( "site.com my.site.com".match(regexp) ); // site.com,my.site.com +``` + +The search works, but the pattern can't match a domain with a hyphen, e.g. `my-site.com`, because the hyphen does not belong to class `pattern:\w`. + +We can fix it by replacing `pattern:\w` with `pattern:[\w-]` in every word except the last one: `pattern:([\w-]+\.)+\w+`. + +### Example: email + +The previous example can be extended. We can create a regular expression for emails based on it. + +The email format is: `name@domain`. Any word can be the name, hyphens and dots are allowed. In regular expressions that's `pattern:[-.\w]+`. + +The pattern: + +```js run +let reg = /[-.\w]+@([\w-]+\.)+[\w-]+/g; alert("my@mail.com @ his@site.com.uk".match(reg)); // my@mail.com, his@site.com.uk ``` -In this example parentheses were used to make a group for repetitions `pattern:([\w-]+\.)+`. But there are other uses too, let's see them. +That regexp is not perfect, but mostly works and helps to fix accidental mistypes. 
The only truly reliable check for an email can only be done by sending a letter. -## Contents of parentheses +## Parentheses contents in the match -Parentheses are numbered from left to right. The search engine remembers the content matched by each of them and allows to reference it in the pattern or in the replacement string. +Parentheses are numbered from left to right. The search engine remembers the content matched by each of them and allows to get it in the result. -For instance, we'd like to find HTML tags `pattern:<.*?>`, and process them. +The method `str.match(regexp)`, if `regexp` has no flag `g`, looks for the first match and returns it as an array: + +1. At index `0`: the full match. +2. At index `1`: the contents of the first parentheses. +3. At index `2`: the contents of the second parentheses. +4. ...and so on... + +For instance, we'd like to find HTML tags `pattern:<.*?>`, and process them. It would be convenient to have tag content (what's inside the angles), in a separate variable. Let's wrap the inner content into parentheses, like this: `pattern:<(.*?)>`. -Then we'll get both the tag as a whole and its content: - -```js run -let str = '

Hello, world!

'; -let reg = /<(.*?)>/; - -alert( str.match(reg) ); // Array: ["

", "h1"] -``` - -The call to [String#match](mdn:js/String/match) returns groups only if the regexp only looks for the first match, that is: has no `pattern:/.../g` flag. - -If we need all matches with their groups then we can use `.matchAll` or `regexp.exec` as described in : +Now we'll get both the tag as a whole `match:

` and its contents `match:h1` in the resulting array: ```js run let str = '

Hello, world!

'; -// two matches: opening

and closing

tags -let reg = /<(.*?)>/g; +let tag = str.match(/<(.*?)>/); -let matches = Array.from( str.matchAll(reg) ); - -alert(matches[0]); // Array: ["

", "h1"] -alert(matches[1]); // Array: ["

", "/h1"] +alert( tag[0] ); //

+alert( tag[1] ); // h1 ``` -Here we have two matches for `pattern:<(.*?)>`, each of them is an array with the full match and groups. - -## Nested groups +### Nested groups Parentheses can be nested. In this case the numbering also goes from left to right. @@ -90,7 +99,13 @@ For instance, when searching a tag in `subject:` we may be inte 2. The tag name: `match:span`. 3. The tag attributes: `match:class="my"`. -Let's add parentheses for them: +Let's add parentheses for them: `pattern:<(([a-z]+)\s*([^>]*))>`. + +Here's how they are numbered (left to right, by the opening paren): + +![](regexp-nested-groups-pattern.svg) + +In action: ```js run let str = ''; @@ -98,20 +113,25 @@ let str = ''; let reg = /<(([a-z]+)\s*([^>]*))>/; let result = str.match(reg); -alert(result); // , span class="my", span, class="my" +alert(result[0]); // +alert(result[1]); // span class="my" +alert(result[2]); // span +alert(result[3]); // class="my" ``` -Here's how groups look: +The zero index of `result` always holds the full match. -![](regexp-nested-groups.svg) +Then groups, numbered from left to right by an opening paren. The first group is returned as `result[1]`. Here it encloses the whole tag content. -At the zero index of the `result` is always the full match. +Then in `result[2]` goes the group from the second opening paren `pattern:([a-z]+)` - tag name, then in `result[3]` the tag: `pattern:([^>]*)`. -Then groups, numbered from left to right. Whichever opens first gives the first group `result[1]`. Here it encloses the whole tag content. +The contents of every group in the string: -Then in `result[2]` goes the group from the second opening `pattern:(` till the corresponding `pattern:)` -- tag name, then we don't group spaces, but group attributes for `result[3]`. 
+![](regexp-nested-groups-matches.svg) -**Even if a group is optional and doesn't exist in the match, the corresponding `result` array item is present (and equals `undefined`).** +### Optional groups + +Even if a group is optional and doesn't exist in the match (e.g. has the quantifier `pattern:(...)?`), the corresponding `result` array item is present and equals `undefined`. For instance, let's consider the regexp `pattern:a(z)?(c)?`. It looks for `"a"` optionally followed by `"z"` optionally followed by `"c"`. @@ -128,10 +148,10 @@ alert( match[2] ); // undefined The array has the length of `3`, but all groups are empty. -And here's a more complex match for the string `subject:ack`: +And here's a more complex match for the string `subject:ac`: ```js run -let match = 'ack'.match(/a(z)?(c)?/) +let match = 'ac'.match(/a(z)?(c)?/) alert( match.length ); // 3 alert( match[0] ); // ac (whole match) @@ -141,11 +161,90 @@ alert( match[2] ); // c The array length is permanent: `3`. But there's nothing for the group `pattern:(z)?`, so the result is `["ac", undefined, "c"]`. +## Searching for all matches with groups: matchAll + +```warn header="`matchAll` is a new method, polyfill may be needed" +The method `matchAll` is not supported in old browsers. + +A polyfill may be required, such as . +``` + +When we search for all matches (flag `pattern:g`), the `match` method does not return contents for groups. + +For example, let's find all tags in a string: + +```js run +let str = '

'; + +let tags = str.match(/<(.*?)>/g); + +alert( tags ); //

,

+``` + +The result is an array of matches, but without details about each of them. But in practice we usually need contents of capturing groups in the result. + +To get them, we should search using the method `str.matchAll(regexp)`. + +It was added to JavaScript language long after `match`, as its "new and improved version". + +Just like `match`, it looks for matches, but there are 3 differences: + +1. It returns not an array, but an iterable object. +2. When the flag `pattern:g` is present, it returns every match as an array with groups. +3. If there are no matches, it returns not `null`, but an empty iterable object. + +For instance: + +```js run +let results = '

'.matchAll(/<(.*?)>/gi); + +// results - is not an array, but an iterable object +alert(results); // [object RegExp String Iterator] +alert(results[0]); // undefined + +results = Array.from(results); // let's turn it into array + +alert(results[0]); //

,h1 (1st tag) +alert(results[1]); //

,h2 (2nd tag) +``` + +As we can see, the first difference is very important. We can't get the match as `results[0]`, because that object isn't a pseudoarray. We can turn it into a real `Array` using `Array.from`. There are more details about pseudoarrays and iterables in the article . + +There's no need for `Array.from` if we're looping over results: + +```js run +let results = '

'.matchAll(/<(.*?)>/gi); + +for(let result of results) { + alert(result); + // first alert: 

,h1 + // second: 

,h2 +} +``` + +...Or using destructuring: + +```js +let [tag1, tag2] = '

'.matchAll(/<(.*?)>/gi); +``` + +```smart header="Why is a result of `matchAll` an iterable object, not an array?" +Why is the method designed like that? The reason is simple - for the optimization. + +The call to `matchAll` does not perform the search. Instead, it returns an iterable object, without the results initially. The search is performed each time we iterate over it, e.g. in the loop. + +So, there will be found as many results as needed, not more. + +E.g. there are potentially 100 matches in the text, but in a `for..of` loop we found 5 of them, then decided it's enough and made a `break`. Then the engine won't spend time finding the other 95 matches. +``` + ## Named groups -Remembering groups by their numbers is hard. For simple patterns it's doable, but for more complex ones we can give names to parentheses. +Remembering groups by their numbers is hard. For simple patterns it's doable, but for more complex ones counting parentheses is inconvenient. We have a much better option: give names to parentheses. -That's done by putting `pattern:?` immediately after the opening paren, like this: +That's done by putting `pattern:?` immediately after the opening paren. + +For example, let's look for a date in the format "year-month-day": ```js run *!* @@ -162,71 +261,75 @@ alert(groups.day); // 30 As you can see, the groups reside in the `.groups` property of the match. -We can also use them in the replacement string, as `pattern:$` (like `$1..9`, but a name instead of a digit). +To look for all dates, we can add flag `pattern:g`. 
-For instance, let's reformat the date into `day.month.year`: +We'll also need `matchAll` to obtain full matches, together with groups: ```js run -let dateRegexp = /(?[0-9]{4})-(?[0-9]{2})-(?[0-9]{2})/; +let dateRegexp = /(?[0-9]{4})-(?[0-9]{2})-(?[0-9]{2})/g; -let str = "2019-04-30"; +let str = "2019-10-30 2020-01-01"; -let rearranged = str.replace(dateRegexp, '$.$.$'); +let results = str.matchAll(dateRegexp); -alert(rearranged); // 30.04.2019 +for(let result of results) { + let {year, month, day} = result.groups; + + alert(`${day}.${month}.${year}`); + // first alert: 30.10.2019 + // second: 01.01.2020 +} ``` -If we use a function for the replacement, then named `groups` object is always the last argument: +## Capturing groups in replacement + +Method `str.replace(regexp, replacement)` that replaces all matches with `regexp` in `str` allows to use parentheses contents in the `replacement` string. That's done using `pattern:$n`, where `pattern:n` is the group number. + +For example, ```js run -let dateRegexp = /(?[0-9]{4})-(?[0-9]{2})-(?[0-9]{2})/; +let str = "John Bull"; +let regexp = /(\w+) (\w+)/; -let str = "2019-04-30"; - -let rearranged = str.replace(dateRegexp, - (str, year, month, day, offset, input, groups) => - `${groups.day}.${groups.month}.${groups.year}` -); - -alert(rearranged); // 30.04.2019 +alert( str.replace(regexp, '$2, $1') ); // Bull, John ``` -Usually, when we intend to use named groups, we don't need positional arguments of the function. For the majority of real-life cases we only need `str` and `groups`. +For named parentheses the reference will be `pattern:$`. 
-So we can write it a little bit shorter: +For example, let's reformat dates from "year-month-day" to "day.month.year": -```js -let rearranged = str.replace(dateRegexp, (str, ...args) => { - let {year, month, day} = args.pop(); - alert(str); // 2019-04-30 - alert(year); // 2019 - alert(month); // 04 - alert(day); // 30 -}); +```js run +let regexp = /(?[0-9]{4})-(?[0-9]{2})-(?[0-9]{2})/g; + +let str = "2019-10-30, 2020-01-01"; + +alert( str.replace(regexp, '$.$.$') ); +// 30.10.2019, 01.01.2020 ``` - ## Non-capturing groups with ?: Sometimes we need parentheses to correctly apply a quantifier, but we don't want their contents in results. A group may be excluded by adding `pattern:?:` in the beginning. -For instance, if we want to find `pattern:(go)+`, but don't want to remember the contents (`go`) in a separate array item, we can write: `pattern:(?:go)+`. +For instance, if we want to find `pattern:(go)+`, but don't want the parentheses contents (`go`) as a separate array item, we can write: `pattern:(?:go)+`. -In the example below we only get the name "John" as a separate member of the `results` array: +In the example below we only get the name `match:John` as a separate member of the match: ```js run -let str = "Gogo John!"; +let str = "Gogogo John!"; + *!* -// exclude Gogo from capturing +// ?: exludes 'go' from capturing let reg = /(?:go)+ (\w+)/i; */!* let result = str.match(reg); -alert( result.length ); // 2 +alert( result[0] ); // Gogogo John (full match) alert( result[1] ); // John +alert( result.length ); // 2 (no more items in the array) ``` ## Summary @@ -235,8 +338,13 @@ Parentheses group together a part of the regular expression, so that the quantif Parentheses groups are numbered left-to-right, and can optionally be named with `(?...)`. -The content, matched by a group, can be referenced both in the replacement string as `$1`, `$2` etc, or by the name `$name` if named. 
+The content, matched by a group, can be obtained in the results: -So, parentheses groups are called "capturing groups", as they "capture" a part of the match. We get that part separately from the result as a member of the array or in `.groups` if it's named. +- The method `str.match` returns capturing groups only without flag `pattern:g`. +- The method `str.matchAll` always returns capturing groups. -We can exclude the group from remembering (make in "non-capturing") by putting `?:` at the start: `(?:...)`, that's used if we'd like to apply a quantifier to the whole group, but don't need it in the result. +If the parentheses have no name, then their contents is available in the match array by its number. Named parentheses are also available in the property `groups`. + +We can also use parentheses contents in the replacement string in `str.replace`: by the number `$n` or the name `$`. + +A group may be excluded from remembering by adding `pattern:?:` in its start. That's used when we need to apply a quantifier to the whole group, but don't remember it as a separate item in the results array. We also can't reference such parentheses in the replacement string. 
diff --git a/9-regular-expressions/11-regexp-groups/regexp-nested-groups.svg b/9-regular-expressions/11-regexp-groups/regexp-nested-groups-matches.svg similarity index 100% rename from 9-regular-expressions/11-regexp-groups/regexp-nested-groups.svg rename to 9-regular-expressions/11-regexp-groups/regexp-nested-groups-matches.svg diff --git a/9-regular-expressions/11-regexp-groups/regexp-nested-groups-pattern.svg b/9-regular-expressions/11-regexp-groups/regexp-nested-groups-pattern.svg new file mode 100644 index 00000000..ce61ff3a --- /dev/null +++ b/9-regular-expressions/11-regexp-groups/regexp-nested-groups-pattern.svg @@ -0,0 +1 @@ +< (( [a-z]+ ) \s* ( [^>]* )) >1span class="my"2span3class="my" \ No newline at end of file From 681cae4b6a6abeff864fb606cefca57308693d1c Mon Sep 17 00:00:00 2001 From: Ilya Kantor Date: Fri, 6 Sep 2019 01:15:24 +0300 Subject: [PATCH 4/7] regexp --- .../11-regexp-groups/01-test-mac/solution.md | 21 ++ .../11-regexp-groups/01-test-mac/task.md | 20 ++ .../solution.md | 0 .../task.md | 0 .../solution.md | 0 .../task.md | 0 .../solution.md | 0 .../task.md | 0 .../11-regexp-groups/article.md | 4 +- .../12-regexp-backreferences/article.md | 41 ++- .../13-regexp-alternation/article.md | 37 ++- .../2-insert-after-head/solution.md | 29 ++ .../2-insert-after-head/task.md | 30 ++ .../14-regexp-lookahead-lookbehind/article.md | 87 +++-- .../article.md | 301 ++++++++++++++++++ .../article.md | 297 ----------------- 16 files changed, 505 insertions(+), 362 deletions(-) create mode 100644 9-regular-expressions/11-regexp-groups/01-test-mac/solution.md create mode 100644 9-regular-expressions/11-regexp-groups/01-test-mac/task.md rename 9-regular-expressions/11-regexp-groups/{1-find-webcolor-3-or-6 => 02-find-webcolor-3-or-6}/solution.md (100%) rename 9-regular-expressions/11-regexp-groups/{1-find-webcolor-3-or-6 => 02-find-webcolor-3-or-6}/task.md (100%) rename 9-regular-expressions/11-regexp-groups/{2-find-decimal-numbers => 
03-find-decimal-numbers}/solution.md (100%) rename 9-regular-expressions/11-regexp-groups/{2-find-decimal-numbers => 03-find-decimal-numbers}/task.md (100%) rename 9-regular-expressions/11-regexp-groups/{5-parse-expression => 04-parse-expression}/solution.md (100%) rename 9-regular-expressions/11-regexp-groups/{5-parse-expression => 04-parse-expression}/task.md (100%) create mode 100644 9-regular-expressions/14-regexp-lookahead-lookbehind/2-insert-after-head/solution.md create mode 100644 9-regular-expressions/14-regexp-lookahead-lookbehind/2-insert-after-head/task.md create mode 100644 9-regular-expressions/15-regexp-catastrophic-backtracking/article.md delete mode 100644 9-regular-expressions/15-regexp-infinite-backtracking-problem/article.md diff --git a/9-regular-expressions/11-regexp-groups/01-test-mac/solution.md b/9-regular-expressions/11-regexp-groups/01-test-mac/solution.md new file mode 100644 index 00000000..c16f0565 --- /dev/null +++ b/9-regular-expressions/11-regexp-groups/01-test-mac/solution.md @@ -0,0 +1,21 @@ +A two-digit hex number is `pattern:[0-9a-f]{2}` (assuming the flag `pattern:i` is set). + +We need that number `NN`, and then `:NN` repeated 5 times (more numbers); + +The regexp is: `pattern:[0-9a-f]{2}(:[0-9a-f]{2}){5}` + +Now let's show that the match should capture all the text: start at the beginning and end at the end. That's done by wrapping the pattern in `pattern:^...$`. 
+ +Finally: + +```js run +let reg = /^[0-9a-fA-F]{2}(:[0-9a-fA-F]{2}){5}$/i; + +alert( reg.test('01:32:54:67:89:AB') ); // true + +alert( reg.test('0132546789AB') ); // false (no colons) + +alert( reg.test('01:32:54:67:89') ); // false (5 numbers, need 6) + +alert( reg.test('01:32:54:67:89:ZZ') ) // false (ZZ at the end) +``` diff --git a/9-regular-expressions/11-regexp-groups/01-test-mac/task.md b/9-regular-expressions/11-regexp-groups/01-test-mac/task.md new file mode 100644 index 00000000..e7265598 --- /dev/null +++ b/9-regular-expressions/11-regexp-groups/01-test-mac/task.md @@ -0,0 +1,20 @@ +# Check MAC-address + +[MAC-address](https://en.wikipedia.org/wiki/MAC_address) of a network interface consists of 6 two-digit hex numbers separated by a colon. + +For instance: `subject:'01:32:54:67:89:AB'`. + +Write a regexp that checks whether a string is a MAC-address. + +Usage: +```js +let reg = /your regexp/; + +alert( reg.test('01:32:54:67:89:AB') ); // true + +alert( reg.test('0132546789AB') ); // false (no colons) + +alert( reg.test('01:32:54:67:89') ); // false (5 numbers, must be 6) + +alert( reg.test('01:32:54:67:89:ZZ') ) // false (ZZ at the end) +``` diff --git a/9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/solution.md b/9-regular-expressions/11-regexp-groups/02-find-webcolor-3-or-6/solution.md similarity index 100% rename from 9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/solution.md rename to 9-regular-expressions/11-regexp-groups/02-find-webcolor-3-or-6/solution.md diff --git a/9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/task.md b/9-regular-expressions/11-regexp-groups/02-find-webcolor-3-or-6/task.md similarity index 100% rename from 9-regular-expressions/11-regexp-groups/1-find-webcolor-3-or-6/task.md rename to 9-regular-expressions/11-regexp-groups/02-find-webcolor-3-or-6/task.md diff --git a/9-regular-expressions/11-regexp-groups/2-find-decimal-numbers/solution.md 
b/9-regular-expressions/11-regexp-groups/03-find-decimal-numbers/solution.md similarity index 100% rename from 9-regular-expressions/11-regexp-groups/2-find-decimal-numbers/solution.md rename to 9-regular-expressions/11-regexp-groups/03-find-decimal-numbers/solution.md diff --git a/9-regular-expressions/11-regexp-groups/2-find-decimal-numbers/task.md b/9-regular-expressions/11-regexp-groups/03-find-decimal-numbers/task.md similarity index 100% rename from 9-regular-expressions/11-regexp-groups/2-find-decimal-numbers/task.md rename to 9-regular-expressions/11-regexp-groups/03-find-decimal-numbers/task.md diff --git a/9-regular-expressions/11-regexp-groups/5-parse-expression/solution.md b/9-regular-expressions/11-regexp-groups/04-parse-expression/solution.md similarity index 100% rename from 9-regular-expressions/11-regexp-groups/5-parse-expression/solution.md rename to 9-regular-expressions/11-regexp-groups/04-parse-expression/solution.md diff --git a/9-regular-expressions/11-regexp-groups/5-parse-expression/task.md b/9-regular-expressions/11-regexp-groups/04-parse-expression/task.md similarity index 100% rename from 9-regular-expressions/11-regexp-groups/5-parse-expression/task.md rename to 9-regular-expressions/11-regexp-groups/04-parse-expression/task.md diff --git a/9-regular-expressions/11-regexp-groups/article.md b/9-regular-expressions/11-regexp-groups/article.md index 9a3bb04f..855568be 100644 --- a/9-regular-expressions/11-regexp-groups/article.md +++ b/9-regular-expressions/11-regexp-groups/article.md @@ -65,7 +65,7 @@ That regexp is not perfect, but mostly works and helps to fix accidental mistype ## Parentheses contents in the match -Parentheses are numbered from left to right. The search engine remembers the content matched by each of them and allows to get it in the result. +Parentheses are numbered from left to right. The search engine memorizes the content matched by each of them and allows to get it in the result. 
The method `str.match(regexp)`, if `regexp` has no flag `g`, looks for the first match and returns it as an array: @@ -347,4 +347,4 @@ If the parentheses have no name, then their contents is available in the match a We can also use parentheses contents in the replacement string in `str.replace`: by the number `$n` or the name `$`. -A group may be excluded from remembering by adding `pattern:?:` in its start. That's used when we need to apply a quantifier to the whole group, but don't remember it as a separate item in the results array. We also can't reference such parentheses in the replacement string. +A group may be excluded from numbering by adding `pattern:?:` in its start. That's used when we need to apply a quantifier to the whole group, but don't want it as a separate item in the results array. We also can't reference such parentheses in the replacement string. diff --git a/9-regular-expressions/12-regexp-backreferences/article.md b/9-regular-expressions/12-regexp-backreferences/article.md index eff5cab4..07d2ca07 100644 --- a/9-regular-expressions/12-regexp-backreferences/article.md +++ b/9-regular-expressions/12-regexp-backreferences/article.md @@ -1,31 +1,31 @@ -# Backreferences in pattern: \n and \k +# Backreferences in pattern: \N and \k -We can use the contents of capturing groups `(...)` not only in the result or in the replacement string, but also in the pattern itself. +We can use the contents of capturing groups `pattern:(...)` not only in the result or in the replacement string, but also in the pattern itself. -## Backreference by number: \n +## Backreference by number: \N -A group can be referenced in the pattern using `\n`, where `n` is the group number. +A group can be referenced in the pattern using `pattern:\N`, where `N` is the group number. -To make things clear let's consider a task. +To make clear why that's helpful, let's consider a task. 
-We need to find a quoted string: either a single-quoted `subject:'...'` or a double-quoted `subject:"..."` -- both variants need to match. +We need to find quoted strings: either single-quoted `subject:'...'` or a double-quoted `subject:"..."` -- both variants should match. -How to look for them? +How to find them? -We can put both kinds of quotes in the square brackets: `pattern:['"](.*?)['"]`, but it would find strings with mixed quotes, like `match:"...'` and `match:'..."`. That would lead to incorrect matches when one quote appears inside other ones, like the string `subject:"She's the one!"`: +We can put both kinds of quotes in the square brackets: `pattern:['"](.*?)['"]`, but it would find strings with mixed quotes, like `match:"...'` and `match:'..."`. That would lead to incorrect matches when one quote appears inside other ones, like in the string `subject:"She's the one!"`: ```js run let str = `He said: "She's the one!".`; let reg = /['"](.*?)['"]/g; -// The result is not what we expect +// The result is not what we'd like to have alert( str.match(reg) ); // "She' ``` -As we can see, the pattern found an opening quote `match:"`, then the text is consumed lazily till the other quote `match:'`, that closes the match. +As we can see, the pattern found an opening quote `match:"`, then the text is consumed till the other quote `match:'`, that closes the match. -To make sure that the pattern looks for the closing quote exactly the same as the opening one, we can wrap it into a capturing group and use the backreference. +To make sure that the pattern looks for the closing quote exactly the same as the opening one, we can wrap it into a capturing group and backreference it: `pattern:(['"])(.*?)\1`. Here's the correct code: @@ -39,20 +39,27 @@ let reg = /(['"])(.*?)\1/g; alert( str.match(reg) ); // "She's the one!" ``` -Now it works! 
The regular expression engine finds the first quote `pattern:(['"])` and remembers the content of `pattern:(...)`, that's the first capturing group. +Now it works! The regular expression engine finds the first quote `pattern:(['"])` and memorizes its content. That's the first capturing group. Further in the pattern `pattern:\1` means "find the same text as in the first group", exactly the same quote in our case. -Please note: +Similar to that, `pattern:\2` would mean the contents of the second group, `pattern:\3` - the 3rd group, and so on. -- To reference a group inside a replacement string -- we use `$1`, while in the pattern -- a backslash `\1`. -- If we use `?:` in the group, then we can't reference it. Groups that are excluded from capturing `(?:...)` are not remembered by the engine. +```smart +If we use `?:` in the group, then we can't reference it. Groups that are excluded from capturing `(?:...)` are not memorized by the engine. +``` + +```warn header="Don't mess up: in the pattern `pattern:\1`, in the replacement: `pattern:$1`" +In the replacement string we use a dollar sign: `pattern:$1`, while in the pattern - a backslash `pattern:\1`. +``` ## Backreference by name: `\k` -For named groups, we can backreference by `\k`. +If a regexp has many parentheses, it's convenient to give them names. -The same example with the named group: +To reference a named group we can use `pattern:\k<name>`. 
+ +In the example below the group with quotes is named `pattern:?`, so the backreference is `pattern:\k`: ```js run let str = `He said: "She's the one!".`; diff --git a/9-regular-expressions/13-regexp-alternation/article.md b/9-regular-expressions/13-regexp-alternation/article.md index b26f7e4a..5dcb9e86 100644 --- a/9-regular-expressions/13-regexp-alternation/article.md +++ b/9-regular-expressions/13-regexp-alternation/article.md @@ -18,7 +18,7 @@ let str = "First HTML appeared, then CSS, then JavaScript"; alert( str.match(reg) ); // 'HTML', 'CSS', 'JavaScript' ``` -We already know a similar thing -- square brackets. They allow to choose between multiple character, for instance `pattern:gr[ae]y` matches `match:gray` or `match:grey`. +We already saw a similar thing -- square brackets. They allow to choose between multiple characters, for instance `pattern:gr[ae]y` matches `match:gray` or `match:grey`. Square brackets allow only characters or character sets. Alternation allows any expressions. A regexp `pattern:A|B|C` means one of expressions `A`, `B` or `C`. @@ -27,30 +27,41 @@ For instance: - `pattern:gr(a|e)y` means exactly the same as `pattern:gr[ae]y`. - `pattern:gra|ey` means `match:gra` or `match:ey`. -To separate a part of the pattern for alternation we usually enclose it in parentheses, like this: `pattern:before(XXX|YYY)after`. +To apply alternation to a chosen part of the pattern, we can enclose it in parentheses: +- `pattern:I love HTML|CSS` matches `match:I love HTML` or `match:CSS`. +- `pattern:I love (HTML|CSS)` matches `match:I love HTML` or `match:I love CSS`. -## Regexp for time +## Example: regexp for time -In previous chapters there was a task to build a regexp for searching time in the form `hh:mm`, for instance `12:00`. But a simple `pattern:\d\d:\d\d` is too vague. It accepts `25:99` as the time (as 99 seconds match the pattern). 
+In previous articles there was a task to build a regexp for searching time in the form `hh:mm`, for instance `12:00`. But a simple `pattern:\d\d:\d\d` is too vague. It accepts `25:99` as the time (as 99 minutes match the pattern, but that time is invalid). -How can we make a better one? +How can we make a better pattern? -We can apply more careful matching. First, the hours: +We can use more careful matching. First, the hours: -- If the first digit is `0` or `1`, then the next digit can by anything. -- Or, if the first digit is `2`, then the next must be `pattern:[0-3]`. +- If the first digit is `0` or `1`, then the next digit can be any: `pattern:[01]\d`. +- Otherwise, if the first digit is `2`, then the next must be `pattern:[0-3]`. +- (no other first digit is allowed) -As a regexp: `pattern:[01]\d|2[0-3]`. +We can write both variants in a regexp using alternation: `pattern:[01]\d|2[0-3]`. -Next, the minutes must be from `0` to `59`. In the regexp language that means `pattern:[0-5]\d`: the first digit `0-5`, and then any digit. +Next, minutes must be from `00` to `59`. In the regular expression language that can be written as `pattern:[0-5]\d`: the first digit `0-5`, and then any digit. -Let's glue them together into the pattern: `pattern:[01]\d|2[0-3]:[0-5]\d`. +If we glue hours and minutes together, we get the pattern: `pattern:[01]\d|2[0-3]:[0-5]\d`. We're almost done, but there's a problem. The alternation `pattern:|` now happens to be between `pattern:[01]\d` and `pattern:2[0-3]:[0-5]\d`. -That's wrong, as it should be applied only to hours `[01]\d` OR `2[0-3]`. That's a common mistake when starting to work with regular expressions. +That is: minutes are added to the second alternation variant, here's a clear picture: -The correct variant: +``` +[01]\d | 2[0-3]:[0-5]\d +``` + +That pattern looks for `pattern:[01]\d` or `pattern:2[0-3]:[0-5]\d`. 
+ +But that's wrong, the alternation should only be used in the "hours" part of the regular expression, to allow `pattern:[01]\d` OR `pattern:2[0-3]`. Let's correct that by enclosing "hours" into parentheses: `pattern:([01]\d|2[0-3]):[0-5]\d`. + +The final solution: ```js run let reg = /([01]\d|2[0-3]):[0-5]\d/g; diff --git a/9-regular-expressions/14-regexp-lookahead-lookbehind/2-insert-after-head/solution.md b/9-regular-expressions/14-regexp-lookahead-lookbehind/2-insert-after-head/solution.md new file mode 100644 index 00000000..980a7fe6 --- /dev/null +++ b/9-regular-expressions/14-regexp-lookahead-lookbehind/2-insert-after-head/solution.md @@ -0,0 +1,29 @@ + +Для того, чтобы вставить после тега ``, нужно вначале его найти. Будем использовать регулярное выражение `pattern:`. + +Далее, нам нужно оставить сам тег `` на месте и добавить текст после него. + +Это можно сделать вот так: +```js run +let str = '......'; +str = str.replace(//, '$&

Hello

'); + +alert(str); // ...

Hello

... +``` + +В строке замены `$&` означает само совпадение, то есть `pattern:` заменяется на самого себя плюс `

Hello

`. + +Альтернативный вариант - использовать ретроспективную проверку: + +```js run +let str = '......'; +str = str.replace(/(?<=)/, `

Hello

`); + +alert(str); // ...

Hello

... +``` + +Такое регулярное выражение на каждой позиции будет проверять, не идёт ли прямо перед ней `pattern:`. Если да - совпадение найдено. Но сам тег `pattern:` в совпадение не входит, он только участвует в проверке. А других символов после проверки в нём нет, так что текст совпадения будет пустым. + +Происходит замена "пустой строки", перед которой идёт `pattern:` на `

Hello

`. Что, как раз, и есть вставка этой строки после ``. + +P.S. Этому регулярному выражению не помешают флаги: `pattern://si`, чтобы в "точку" входил перевод строки (тег может занимать несколько строк), а также чтобы теги в другом регистре типа `match:` тоже находились. diff --git a/9-regular-expressions/14-regexp-lookahead-lookbehind/2-insert-after-head/task.md b/9-regular-expressions/14-regexp-lookahead-lookbehind/2-insert-after-head/task.md new file mode 100644 index 00000000..7bdfcd67 --- /dev/null +++ b/9-regular-expressions/14-regexp-lookahead-lookbehind/2-insert-after-head/task.md @@ -0,0 +1,30 @@ +# Вставьте после фрагмента + +Есть строка с HTML-документом. + +Вставьте после тега `` (у него могут быть атрибуты) строку `

Hello

`. + +Например: + +```js +let reg = /ваше регулярное выражение/; + +let str = ` + + + ... + + +`; + +str = str.replace(reg, `

Hello

`); +``` + +После этого значение `str`: +```html + +

Hello

+ ... + + +``` diff --git a/9-regular-expressions/14-regexp-lookahead-lookbehind/article.md b/9-regular-expressions/14-regexp-lookahead-lookbehind/article.md index 8e36fb0b..1115c502 100644 --- a/9-regular-expressions/14-regexp-lookahead-lookbehind/article.md +++ b/9-regular-expressions/14-regexp-lookahead-lookbehind/article.md @@ -1,54 +1,82 @@ # Lookahead and lookbehind -Sometimes we need to match a pattern only if followed by another pattern. For instance, we'd like to get the price from a string like `subject:1 turkey costs 30€`. +Sometimes we need to find only those matches for a pattern that are followed or preceded by another pattern. -We need a number (let's say a price has no decimal point) followed by `subject:€` sign. +There's a special syntax for that, called "lookahead" and "lookbehind", together referred to as "lookaround". -That's what lookahead is for. +To start with, let's find the price from a string like `subject:1 turkey costs 30€`. That is: a number, followed by `subject:€` sign. ## Lookahead -The syntax is: `pattern:x(?=y)`, it means "look for `pattern:x`, but match only if followed by `pattern:y`". +The syntax is: `pattern:X(?=Y)`, it means "look for `pattern:X`, but match only if followed by `pattern:Y`". There may be any pattern instead of `pattern:X` and `pattern:Y`. -For an integer amount followed by `subject:€`, the regexp will be `pattern:\d+(?=€)`: +For an integer number followed by `subject:€`, the regexp will be `pattern:\d+(?=€)`: ```js run let str = "1 turkey costs 30€"; -alert( str.match(/\d+(?=€)/) ); // 30 (correctly skipped the sole number 1) +alert( str.match(/\d+(?=€)/) ); // 30, the number 1 is ignored, as it's not followed by € ``` -Let's say we want a quantity instead, that is a number, NOT followed by `subject:€`. +Please note: the lookahead is merely a test, the contents of the parentheses `pattern:(?=...)` are not included in the result `match:30`. -Here a negative lookahead can be applied. 
+When we look for `pattern:X(?=Y)`, the regular expression engine finds `pattern:X` and then checks if there's `pattern:Y` immediately after it. If it's not so, then the potential match is skipped, and the search continues. -The syntax is: `pattern:x(?!y)`, it means "search `pattern:x`, but only if not followed by `pattern:y`". +More complex tests are possible, e.g. `pattern:X(?=Y)(?=Z)` means: + +1. Find `pattern:X`. +2. Check if `pattern:Y` is immediately after `pattern:X` (skip if isn't). +3. Check if `pattern:Z` is immediately after `pattern:X` (skip if isn't). +4. If both tests passed, then it's the match. + +In other words, such pattern means that we're looking for `pattern:X` followed by `pattern:Y` and `pattern:Z` at the same time. + +That's only possible if patterns `pattern:Y` and `pattern:Z` aren't mutually exclusive. + +For example, `pattern:\d+(?=\s)(?=.*30)` looks for `pattern:\d+` only if it's followed by a space, and there's `30` somewhere after it: + +```js run +let str = "1 turkey costs 30€"; + +alert( str.match(/\d+(?=\s)(?=.*30)/) ); // 1 +``` + +In our string that exactly matches the number `1`. + +## Negative lookahead + +Let's say that we want a quantity instead, not a price from the same string. That's a number `pattern:\d+`, NOT followed by `subject:€`. + +For that, a negative lookahead can be applied. + +The syntax is: `pattern:X(?!Y)`, it means "search `pattern:X`, but only if not followed by `pattern:Y`". ```js run let str = "2 turkeys cost 60€"; -alert( str.match(/\d+(?!€)/) ); // 2 (correctly skipped the price) +alert( str.match(/\d+(?!€)/) ); // 2 (the price is skipped) ``` ## Lookbehind -Lookahead allows to add a condition for "what goes after". +Lookahead allows to add a condition for "what follows". -Lookbehind is similar, but it looks behind. That is, it allows to match a pattern only if there's something before. +Lookbehind is similar, but it looks behind. That is, it allows to match a pattern only if there's something before it. 
The syntax is: -- Positive lookbehind: `pattern:(?<=y)x`, matches `pattern:x`, but only if it follows after `pattern:y`. -- Negative lookbehind: `pattern:(? + +```js run +let reg = /^(\d+)*$/; + +let str = "012345678901234567890123456789!"; + +// will take a very long time +alert( reg.test(str) ); +``` + +So what's wrong with the regexp? + +First, one may notice that the regexp `pattern:(\d+)*` is a little bit strange. The quantifier `pattern:*` looks extraneous. If we want a number, we can use `pattern:\d+`. + +Indeed, the regexp is artificial. But the reason why it is slow is the same as those we saw above. So let's understand it, and then the previous example will become obvious. + +What happens during the search of `pattern:^(\d+)*$` in the line `subject:123456789!` (shortened a bit for clarity), why does it take so long? + +1. First, the regexp engine tries to find a number `pattern:\d+`. The plus `pattern:+` is greedy by default, so it consumes all digits: + + ``` + \d+....... + (123456789)! + ``` + + Then it tries to apply the star quantifier, but there are no more digits, so the star doesn't give anything. + + The next in the pattern is the string end `pattern:$`, but in the text we have `subject:!`, so there's no match: + + ``` + X + \d+........$ + (123456789)! + ``` + +2. As there's no match, the greedy quantifier `pattern:+` decreases the count of repetitions, backtracks one character back. + + Now `pattern:\d+` takes all digits except the last one: + ``` + \d+....... + (12345678)9! + ``` +3. Then the engine tries to continue the search from the new position (`9`). + + The star `pattern:(\d+)*` can be applied -- it gives the number `match:9`: + + ``` + + \d+.......\d+ + (12345678)(9)! + ``` + + The engine tries to match `pattern:$` again, but fails, because it meets `subject:!`: + + ``` + X + \d+.......\d+ + (12345678)(9)! + ``` + + +4. There's no match, so the engine will continue backtracking, decreasing the number of repetitions. 
Backtracking generally works like this: the last greedy quantifier decreases the number of repetitions for as long as it can. Then the previous greedy quantifier decreases, and so on. + + All possible combinations are attempted. Here are some examples. + + The first number `pattern:\d+` has 7 digits, and then a number of 2 digits: + + ``` + X + \d+......\d+ + (1234567)(89)! + ``` + + The first number has 7 digits, and then two numbers of 1 digit each: + + ``` + X + \d+......\d+\d+ + (1234567)(8)(9)! + ``` + + The first number has 6 digits, and then a number of 3 digits: + + ``` + X + \d+.......\d+ + (123456)(789)! + ``` + + The first number has 6 digits, and then 2 numbers: + + ``` + X + \d+.....\d+ \d+ + (123456)(78)(9)! + ``` + + ...And so on. + + +There are many ways to split a set of digits `123456789` into numbers. To be precise, there are 2<sup>n</sup>-1, where `n` is the length of the set. + +For `n=20` there are about 1 million combinations, for `n=30` - a thousand times more. Trying each of them is exactly the reason why the search takes so long. + +What to do? + +Should we turn on the lazy mode? + +Unfortunately, that won't help: if we replace `pattern:\d+` with `pattern:\d+?`, the regexp will still hang. The order of combinations will change, but not their total count. + +Some regular expression engines have tricky tests and finite automata that allow to avoid going through all combinations or make it much faster, but not all engines, and not in all cases. + +## Back to words and strings + +A similar thing happens in our first example, when we look for words using the pattern `pattern:^(\w+\s?)*$` in the string `subject:An input that hangs!`. + +The reason is that a word can be represented as one `pattern:\w+` or many: + +``` +(input) +(inpu)(t) +(inp)(u)(t) +(in)(p)(ut) +... 
+``` + +For a human, it's obvious that there can be no match, because the string ends with an exclamation sign `!`, but the regular expression expects a word character `pattern:\w` or a space `pattern:\s` at the end. But the engine doesn't know that. + +It tries all combinations of how the regexp `pattern:(\w+\s?)*` can "consume" the string, including variants with spaces `pattern:(\w+\s)*` and without them `pattern:(\w+)*` (because spaces `pattern:\s?` are optional). As there are many such combinations, the search takes a lot of time. + +## How to fix? + +There are two main approaches to fixing the problem. + +The first is to lower the number of possible combinations. + +Let's rewrite the regular expression as `pattern:^(\w+\s)*\w*$` - we'll look for any number of words followed by a space `pattern:(\w+\s)*`, and then (optionally) a word `pattern:\w*`. + +This regexp is equivalent to the previous one (matches the same) and works well: + +```js run +let reg = /^(\w+\s)*\w*$/; +let str = "An input string that takes a long time or even makes this regex to hang!"; + +alert( reg.test(str) ); // false +``` + +Why did the problem disappear? + +Now the star `pattern:*` goes after `pattern:\w+\s` instead of `pattern:\w+\s?`. It became impossible to represent one word of the string with multiple successive `pattern:\w+`. The time needed to try such combinations is now saved. + +For example, the previous pattern `pattern:(\w+\s?)*` could match the word `subject:string` as two `pattern:\w+`: + +``` +\w+\w+ +string +``` + +The previous pattern, due to the optional `pattern:\s` allowed variants `pattern:\w+`, `pattern:\w+\s`, `pattern:\w+\w+` and so on. + +With the rewritten pattern `pattern:(\w+\s)*`, that's impossible: there may be `pattern:\w+\s` or `pattern:\w+\s\w+\s`, but not `pattern:\w+\w+`. So the overall combinations count is greatly decreased. + +## Preventing backtracking + +It's not always convenient to rewrite a regexp. 
And it's not always obvious how to do it. + +The alternative approach is to forbid backtracking for the quantifier. + +The regular expressions engine tries many combinations that are obviously wrong for a human. + +E.g. in the regexp `pattern:(\d+)*$` it's obvious for a human, that `pattern:+` shouldn't backtrack. If we replace one `pattern:\d+` with two separate `pattern:\d+\d+`, nothing changes: + +``` +\d+........ +(123456789)! + +\d+...\d+.... +(1234)(56789)! +``` + +And in the original example `pattern:^(\w+\s?)*$` we may want to forbid backtracking in `pattern:\w+`. That is: `pattern:\w+` should match a whole word, with the maximal possible length. There's no need to lower the repetitions count in `pattern:\w+`, try to split it into two words `pattern:\w+\w+` and so on. + +Modern regular expression engines support possessive quantifiers for that. They are like greedy ones, but don't backtrack (so they are actually simpler than regular quantifiers). + +There are also so-called "atomic groups" - a way to disable backtracking inside parentheses. + +Unfortunately, in JavaScript they are not supported. But there's another way. + +### Lookahead to the rescue! + +We can prevent backtracking using lookahead. + +The pattern to take as many repetitions of `pattern:\w` as possible without backtracking is: `pattern:(?=(\w+))\1`. + +Let's decipher it: +- Lookahead `pattern:?=` looks forward for the longest word `pattern:\w+` starting at the current position. +- The contents of parentheses with `pattern:?=...` isn't memorized by the engine, so wrap `pattern:\w+` into parentheses. Then the engine will memorize their contents. +- ...And allow us to reference it in the pattern as `pattern:\1`. + +That is: we look ahead - and if there's a word `pattern:\w+`, then match it as `pattern:\1`. + +Why? That's because the lookahead finds a word `pattern:\w+` as a whole and we capture it into the pattern with `pattern:\1`. 
So we essentially implemented a possessive plus `pattern:+` quantifier. It captures only the whole word `pattern:\w+`, not a part of it. + +For instance, in the word `subject:JavaScript` it can't match only `match:Java` and leave out `match:Script` for the rest of the pattern. + +Here's the comparison of two patterns: + +```js run +alert( "JavaScript".match(/\w+Script/)); // JavaScript +alert( "JavaScript".match(/(?=(\w+))\1Script/)); // null +``` + +1. In the first variant `pattern:\w+` first captures the whole word `subject:JavaScript` but then `pattern:+` backtracks character by character, to try to match the rest of the pattern, until it finally succeeds (when `pattern:\w+` matches `match:Java`). +2. In the second variant `pattern:(?=(\w+))` looks ahead and finds the word `subject:JavaScript`, that is included into the pattern as a whole by `pattern:\1`, so there remains no way to find `subject:Script` after it. + +We can put a more complex regular expression into `pattern:(?=(\w+))\1` instead of `pattern:\w`, when we need to forbid backtracking for `pattern:+` after it. + +```smart +There's more about the relation between possessive quantifiers and lookahead in articles [Regex: Emulate Atomic Grouping (and Possessive Quantifiers) with LookAhead](http://instanceof.me/post/52245507631/regex-emulate-atomic-grouping-with-lookahead) and [Mimicking Atomic Groups](http://blog.stevenlevithan.com/archives/mimic-atomic-groups). +``` + +Let's rewrite the first example using lookahead to prevent backtracking: + +```js run +let reg = /^((?=(\w+))\2\s?)*$/; + +alert( reg.test("A good string") ); // true + +let str = "An input string that takes a long time or even makes this regex to hang!"; + +alert( reg.test(str) ); // false, works and fast! +``` + +Here `pattern:\2` is used instead of `pattern:\1`, because there are additional outer parentheses. To avoid messing up with the numbers, we can give the parentheses a name, e.g. `pattern:(?<word>\w+)`. 
+```js run +// parentheses are named ?<word>, referenced as \k<word> +let reg = /^((?=(?<word>\w+))\k<word>\s?)*$/; + +let str = "An input string that takes a long time or even makes this regex to hang!"; + +alert( reg.test(str) ); // false + +alert( reg.test("A correct string") ); // true +``` + +The problem described in this article is called "catastrophic backtracking". + +We covered two ways how to solve it: +- Rewrite the regexp to lower the possible combinations count. +- Prevent backtracking. diff --git a/9-regular-expressions/15-regexp-infinite-backtracking-problem/article.md b/9-regular-expressions/15-regexp-infinite-backtracking-problem/article.md deleted file mode 100644 index 67f3e93c..00000000 --- a/9-regular-expressions/15-regexp-infinite-backtracking-problem/article.md +++ /dev/null @@ -1,297 +0,0 @@ -# Infinite backtracking problem - -Some regular expressions are looking simple, but can execute veeeeeery long time, and even "hang" the JavaScript engine. - -Sooner or later most developers occasionally face such behavior. - -The typical situation -- a regular expression works fine sometimes, but for certain strings it "hangs" consuming 100% of CPU. - -In a web-browser it kills the page. Not a good thing for sure. - -For server-side JavaScript it may become a vulnerability, and it uses regular expressions to process user data. Bad input will make the process hang, causing denial of service. The author personally saw and reported such vulnerabilities even for very well-known and widely used programs. - -So the problem is definitely worth to deal with. - -## Introduction - -The plan will be like this: - -1. First we see the problem how it may occur. -2. Then we simplify the situation and see why it occurs. -3. Then we fix it. - -For instance let's consider searching tags in HTML. - -We want to find all tags, with or without attributes -- like `subject:
`. We need the regexp to work reliably, because HTML comes from the internet and can be messy. - -In particular, we need it to match tags like `` -- with `<` and `>` in attributes. That's allowed by [HTML standard](https://html.spec.whatwg.org/multipage/syntax.html#syntax-attributes). - -A simple regexp like `pattern:<[^>]+>` doesn't work, because it stops at the first `>`, and we need to ignore `<>` if inside an attribute: - -```js run -// the match doesn't reach the end of the tag - wrong! -alert( ''.match(/<[^>]+>/) ); // `. - -That regexp is not perfect! It doesn't support all the details of HTML syntax, such as unquoted values, and there are other ways to improve, but let's not add complexity. It will demonstrate the problem for us. - -The regexp seems to work: - -```js run -let reg = /<\w+(\s*\w+="[^"]*"\s*)*>/g; - -let str='...... ...'; - -alert( str.match(reg) ); // , -``` - -Great! It found both the long tag `match:` and the short one `match:`. - -Now, that we've got a seemingly working solution, let's get to the infinite backtracking itself. - -## Infinite backtracking - -If you run our regexp on the input below, it may hang the browser (or another JavaScript host): - -```js run -let reg = /<\w+(\s*\w+="[^"]*"\s*)*>/g; - -let str = ``. - -Unfortunately, the regexp still hangs: - -```js run -// only search for space-delimited attributes -let reg = /<(\s*\w+=\w+\s*)*>/g; - -let str = `` in the string `subject:` at the end, so the match is impossible, but the regexp engine doesn't know about it. The search backtracks trying different combinations of `pattern:(\s*\w+=\w+\s*)`: - -``` -(a=b a=b a=b) (a=b) -(a=b a=b) (a=b a=b) -(a=b) (a=b a=b a=b) -... -``` - -As there are many combinations, it takes a lot of time. - -## How to fix? - -The backtracking checks many variants that are an obvious fail for a human. - -For instance, in the pattern `pattern:(\d+)*$` a human can easily see that `pattern:(\d+)*` does not need to backtrack `pattern:+`. 
There's no difference between one or two `\d+`: - -``` -\d+........ -(123456789)z - -\d+...\d+.... -(1234)(56789)z -``` - -Let's get back to more real-life example: `pattern:<(\s*\w+=\w+\s*)*>`. We want it to find pairs `name=value` (as many as it can). - -What we would like to do is to forbid backtracking. - -There's totally no need to decrease the number of repetitions. - -In other words, if it found three `name=value` pairs and then can't find `>` after them, then there's no need to decrease the count of repetitions. There are definitely no `>` after those two (we backtracked one `name=value` pair, it's there): - -``` -(name=value) name=value -``` - -Modern regexp engines support so-called "possessive" quantifiers for that. They are like greedy, but don't backtrack at all. Pretty simple, they capture whatever they can, and the search continues. There's also another tool called "atomic groups" that forbid backtracking inside parentheses. - -Unfortunately, but both these features are not supported by JavaScript. - -### Lookahead to the rescue - -We can forbid backtracking using lookahead. - -The pattern to take as much repetitions as possible without backtracking is: `pattern:(?=(a+))\1`. - -In other words: -- The lookahead `pattern:?=` looks for the maximal count `pattern:a+` from the current position. -- And then they are "consumed into the result" by the backreference `pattern:\1` (`pattern:\1` corresponds to the content of the second parentheses, that is `pattern:a+`). - -There will be no backtracking, because lookahead does not backtrack. If, for -example, it found 5 instances of `pattern:a+` and the further match failed, -it won't go back to the 4th instance. 
- -```smart -There's more about the relation between possessive quantifiers and lookahead in articles [Regex: Emulate Atomic Grouping (and Possessive Quantifiers) with LookAhead](http://instanceof.me/post/52245507631/regex-emulate-atomic-grouping-with-lookahead) and [Mimicking Atomic Groups](http://blog.stevenlevithan.com/archives/mimic-atomic-groups). -``` - -So this trick makes the problem disappear. - -Let's fix the regexp for a tag with attributes from the beginning of the chapter`pattern:<\w+(\s*\w+=(\w+|"[^"]*")\s*)*>`. We'll use lookahead to prevent backtracking of `name=value` pairs: - -```js run -// regexp to search name=value -let attrReg = /(\s*\w+=(\w+|"[^"]*")\s*)/ - -// use new RegExp to nicely insert its source into (?=(a+))\1 -let fixedReg = new RegExp(`<\\w+(?=(${attrReg.source}*))\\1>`, 'g'); - -let goodInput = '...... ...'; - -let badInput = `, -alert( badInput.match(fixedReg) ); // null (no results, fast!) -``` - -Great, it works! We found both a long tag `match:` and a small one `match:`, and (!) didn't hang the engine on the bad input. - -Please note the `attrReg.source` property. `RegExp` objects provide access to their source string in it. That's convenient when we want to insert one regexp into another. 
From 4232a53219f86a4eaed57cc810e2faeeeaa2c84d Mon Sep 17 00:00:00 2001 From: Ilya Kantor Date: Fri, 6 Sep 2019 16:48:59 +0300 Subject: [PATCH 5/7] WIP --- .../10-destructuring-assignment/article.md | 2 +- .../02-class-inheritance/article.md | 2 +- 2-ui/4-forms-controls/2-focus-blur/article.md | 23 +- .../01-regexp-introduction/article.md | 6 +- .../02-regexp-character-classes/article.md | 22 +- .../03-regexp-unicode/article.md | 4 +- .../07-regexp-escaping/article.md | 8 +- .../11-regexp-groups/article.md | 18 +- .../16-regexp-sticky/article.md | 142 ++++-- .../17-regexp-methods/article.md | 430 +++++++----------- 10 files changed, 315 insertions(+), 342 deletions(-) diff --git a/1-js/05-data-types/10-destructuring-assignment/article.md b/1-js/05-data-types/10-destructuring-assignment/article.md index dec2535a..c2288847 100644 --- a/1-js/05-data-types/10-destructuring-assignment/article.md +++ b/1-js/05-data-types/10-destructuring-assignment/article.md @@ -403,7 +403,7 @@ alert(item1); // Cake alert(item2); // Donut ``` -The whole `options` object except `extra` that was not mentioned, is assigned to corresponding variables: +All properties of `options` object except `extra` that is absent in the left part, are assigned to corresponding variables: ![](destructuring-complex.svg) diff --git a/1-js/09-classes/02-class-inheritance/article.md b/1-js/09-classes/02-class-inheritance/article.md index 108cc11f..6921698e 100644 --- a/1-js/09-classes/02-class-inheritance/article.md +++ b/1-js/09-classes/02-class-inheritance/article.md @@ -524,7 +524,7 @@ In the example below a non-method syntax is used for comparison. `[[HomeObject]] ```js run let animal = { - eat: function() { // should be the short syntax: eat() {...} + eat: function() { // intentionally writing like this instead of eat() {... // ... 
} }; diff --git a/2-ui/4-forms-controls/2-focus-blur/article.md b/2-ui/4-forms-controls/2-focus-blur/article.md index 695b69c9..6a605bc0 100644 --- a/2-ui/4-forms-controls/2-focus-blur/article.md +++ b/2-ui/4-forms-controls/2-focus-blur/article.md @@ -106,22 +106,27 @@ The best recipe is to be careful when using these events. If we want to track us By default many elements do not support focusing. -The list varies between browsers, but one thing is always correct: `focus/blur` support is guaranteed for elements that a visitor can interact with: `