From fcee2c01e53ef984d7756b279ce32ba52aac02a9 Mon Sep 17 00:00:00 2001 From: DL6ER Date: Tue, 2 Feb 2021 10:02:08 +0100 Subject: [PATCH] Improve regex documentation Signed-off-by: DL6ER --- docs/core/pihole-command.md | 2 +- docs/database/ftl.md | 12 +-- docs/ftldns/compile.md | 4 +- docs/ftldns/configfile.md | 2 +- docs/regex/approximate.md | 97 +++++++++++++++++++ docs/{ftldns => }/regex/overview.md | 10 +- docs/regex/pi-hole.md | 144 ++++++++++++++++++++++++++++ docs/regex/testmode.md | 19 ++++ docs/{ftldns => }/regex/tutorial.md | 23 +++-- mkdocs.yml | 13 ++- package.json | 2 +- 11 files changed, 300 insertions(+), 28 deletions(-) create mode 100644 docs/regex/approximate.md rename docs/{ftldns => }/regex/overview.md (79%) create mode 100644 docs/regex/pi-hole.md create mode 100644 docs/regex/testmode.md rename docs/{ftldns => }/regex/tutorial.md (87%) diff --git a/docs/core/pihole-command.md b/docs/core/pihole-command.md index a8e714c..ba48daa 100644 --- a/docs/core/pihole-command.md +++ b/docs/core/pihole-command.md @@ -54,7 +54,7 @@ Example Usage | [`pihole -regex '^example.com$' '.*\.example2.net'`](https://d Administrators need to be able to manually add and remove domains for various purposes, and these commands serve that purpose. -See [Regex Blocking](../ftldns/regex/overview.md) for more information about using Regex. +See [Regex Blocking](../regex/overview.md) for more information about using Regex. 
**Basic Script Process**: diff --git a/docs/database/ftl.md b/docs/database/ftl.md index d5e615a..f750c9a 100644 --- a/docs/database/ftl.md +++ b/docs/database/ftl.md @@ -113,17 +113,17 @@ ID | Query Type ID | Status | | Details --- | --- | --- | --- 0 | Unknown | ❔ | was not answered by forward destination -1 | Blocked | ❌ | Domain contained in [gravity database](../database/gravity/index.md#gravity-table-gravity) +1 | Blocked | ❌ | Domain contained in [gravity database](gravity/index.md#gravity-table-gravity) 2 | Allowed | ✅ | Forwarded 3 | Allowed | ✅ | Known, replied to from cache -4 | Blocked | ❌ | Domain matched by a [regex blacklist](../database/gravity/index.md#regex-table-regex) filter -5 | Blocked | ❌ | Domain contained in [exact blacklist](../database/gravity/index.md#blacklist-table-blacklist) +4 | Blocked | ❌ | Domain matched by a [regex blacklist](gravity/index.md#regex-table-regex) filter +5 | Blocked | ❌ | Domain contained in [exact blacklist](gravity/index.md#blacklist-table-blacklist) 6 | Blocked | ❌ | By upstream server (known blocking page IP address) 7 | Blocked | ❌ | By upstream server (`0.0.0.0` or `::`) 8 | Blocked | ❌ | By upstream server (`NXDOMAIN` with `RA` bit unset) -9 | Blocked | ❌ | Domain contained in [gravity database](../database/gravity/index.md#gravity-table-gravity)
*Blocked during deep CNAME inspection* -10 | Blocked | ❌ | Domain matched by a [regex blacklist](../database/gravity/index.md#regex-table-regex) filter
*Blocked during deep CNAME inspection* -11 | Blocked | ❌ | Domain contained in [exact blacklist](../database/gravity/index.md#blacklist-table-blacklist)
*Blocked during deep CNAME inspection* +9 | Blocked | ❌ | Domain contained in [gravity database](gravity/index.md#gravity-table-gravity)
*Blocked during deep CNAME inspection* +10 | Blocked | ❌ | Domain matched by a [regex blacklist](gravity/index.md#regex-table-regex) filter
*Blocked during deep CNAME inspection* +11 | Blocked | ❌ | Domain contained in [exact blacklist](gravity/index.md#blacklist-table-blacklist)
*Blocked during deep CNAME inspection* ### Example for interaction with the long-term query database diff --git a/docs/ftldns/compile.md b/docs/ftldns/compile.md index 93a464e..a9e03ca 100644 --- a/docs/ftldns/compile.md +++ b/docs/ftldns/compile.md @@ -8,13 +8,13 @@ Install them by running the following command in a terminal: ### Debian / Ubuntu / Raspbian ```bash -sudo apt install build-essential libgmp-dev m4 cmake libidn11-dev libreadline-dev +sudo apt install build-essential libgmp-dev m4 cmake libidn11-dev libreadline-dev xxd ``` ### Fedora ```bash -sudo dnf install gcc gmp-devel gmp-static m4 cmake libidn-devel readline-devel +sudo dnf install gcc gmp-devel gmp-static m4 cmake libidn-devel readline-devel xxd ``` ## Compile `libnettle` from source diff --git a/docs/ftldns/configfile.md b/docs/ftldns/configfile.md index 3b59853..a122fbf 100755 --- a/docs/ftldns/configfile.md +++ b/docs/ftldns/configfile.md @@ -206,7 +206,7 @@ Controls if *FTL*DNS should print extended details about regex matching into `pi Due to legacy reasons, we also support the following setting to be used for enabling the same functionality:
`REGEX_DEBUGMODE=false|true` Note that if one of them is set to `true`, the other one cannot be used to disable this setting again.
-**[More details](regex/overview.md)** +**[More details](../regex/overview.md)** #### `DEBUG_API=false|true` {#debug_api data-toc-label='Telnet'} diff --git a/docs/regex/approximate.md b/docs/regex/approximate.md new file mode 100644 index 0000000..097d906 --- /dev/null +++ b/docs/regex/approximate.md @@ -0,0 +1,97 @@ +# Approximate matching + +You may or may not know `agrep`. It is basically a "forgiving" `grep` and is, for instance, used for searching through (offline) dictionaries. It is tolerant against errors (up to a degree you specify). It may be beneficial if you want to match against domains where you don't really know the pattern. It is just an idea, we will have to see if it is actually useful. + +This is a somewhat complicated topic, so we'll approach it by examples, as it is very hard to get your head around it by just reading the specification. + +The approximate matching settings for a subpattern can be changed by appending *approx-settings* to the subpattern. Limits for the number of errors can be set and an expression for specifying and limiting the costs can be given: + +## Accepted **insertions** (`+`) + +Use `(something){+x}` to specify that the regex should still be matching when `x` characters would need to be *inserted* into the sub-expression `something`. + +Example: + +- `doubleclick.net` is matched by `^doubleclick\.(nt){+1}$` + +The missing `e` in `nt` is inserted. + +Similarly: + +- `doubleclick.net` is matched by `^(doubleclk\.nt){+3}$` + +The missing characters in the domain are inserted. The maximum number of insertions spans the entire domain as it is wrapped in the sub-expression `(...)`. + +## Accepted **deletions** (`-`) + +Use `(something){-x}` to specify that the regex should still be matching when `x` characters would need to be *deleted* from the sub-expression `something`: + +Example: + +- `doubleclick.net` is matched by `^doubleclick\.(neet){-1}$` + +The surplus `e` in `neet` is deleted. 
+ +Similarly: + +- `doubleclick.net` is matched by `^(doubleclicky\.netty){-3}$` +- `doubleclick.net` is NOT matched by `^(doubleclicky\.nettfy){-3}$` + +## Accepted **substitutions** (`#`) + +Use `(something){#x}` to specify that the regex should still be matching when `x` characters would need to be *substituted* in the sub-expression `something`: + +Example 1: + +- `oobargoobaploowap` is matched by `(foobar){#2~2}` +Hint: `goobap` is `foobar` with two substitutions `f->g` and `r->p` + +Example 2: + +- `doubleclick.net` is matched by `^doubleclick\.n(tt){#1}$` + +The incorrect `t` in `ntt` is substituted. Note that substitutions are necessary when a character needs to be replaced as the corresponding realization with one insertion and one deletion is **not identical**: + +- `doubleclick.net` is matched by `^doubleclick\.n(tt){+1-1}$` + +(`t` is removed, `e` is added), however + +- `doubleclick.nt` is ALSO matched by `^doubleclick\.n(tt){+1-1}$` + +(the `t` is just removed, nothing had to be added) but + +- `doubleclick.nt` is NOT matched by `^doubleclick\.n(tt){#1}$` + +This does not match, as substitutions always require characters to be swapped for others. + +## Combinations and total error limit (`~`) + +All rules from above can be combined, as in `{+2-5#6}`, allowing (up to!) two insertions, five deletions, and six substitutions. You can enforce an upper limit on the number of tried realizations using the tilde. Even when `{+2-5#6}` can lead to up to 13 operations being tried, this can be limited to (at most) seven tries using `{+2-5#6~7}`. + +Example: + +- `oobargoobploowap` is matched by `(foobar){+2#2~3}` + + Hint: `goobaap` is `foobar` with + - two substitutions `f->g` and `r->p`, and + - one addition `a` between `bar` (to have `baap`) + +Specifying `~2` instead of `~3` will lead to no match as three errors need to be corrected in total for a match in this example. 
+ +## Advanced topic: Cost-equation + +You can even weight the "costs" of insertions, deletions or substitutions. This is really an advanced topic and should only be touched when really needed. + +A *cost-equation* can be thought of as a mathematical equation, where `i`, `d`, and `s` stand for the number of insertions, deletions, and substitutions, respectively. The equation can have a multiplier for each of `i`, `d`, and `s`. +The multiplier is the **cost of the error**, and the number after `<` is the maximum allowed total cost of a match. Spaces and pluses can be inserted to make the equation more readable. When specifying only a cost equation, adding a space after the opening `{` is **required**. + +Example 1: `{ 2i + 1d + 2s < 5 }` + +This sets the cost of an insertion to two, a deletion to one, a substitution to two, and the maximum cost to five. + +Example 2: `{+2-5#6, 2i + 1d + 2s < 5 }` + +This sets the cost of an insertion to two, a deletion to one, a substitution to two, and the maximum cost to five. Furthermore, it allows only up to 2 insertions (coming at a total cost of 4), five deletions and up to 6 substitutions. As six substitutions would come at a cost of `6*2 = 12`, exceeding the total allowed cost of 5, they cannot all be realized. + + +{!abbreviations.md!} diff --git a/docs/ftldns/regex/overview.md b/docs/regex/overview.md similarity index 79% rename from docs/ftldns/regex/overview.md rename to docs/regex/overview.md index ae5a932..d09faa4 100644 --- a/docs/ftldns/regex/overview.md +++ b/docs/regex/overview.md @@ -1,7 +1,7 @@ -A regular expression, or RegEx for short, is a pattern that can be used for building arbitrarily complex filter rules in *FTL*DNS. -We implement the POSIX Extended Regular Expressions similar to the one used by the UNIX `egrep` (or `grep -E`) command. +A regular expression, or RegEx for short, is a pattern that **can be used for building arbitrarily complex filter** rules in *FTL*DNS. 
+We implement the POSIX Extended Regular Expressions similar to the one used by the UNIX `egrep` (or `grep -E`) command. We amend the regex engine by approximate blocking (compare to `agrep`) and other special features like matching to specific query types only. -Our implementation is light and fast as each domain is only checked once for a match (if you query `google.com`, it will be checked against your RegEx. Any subsequent query to the same domain will not be checked again until you restart `pihole-FTL`). +Our implementation is light and fast as each domain is only checked once for a match. When you query `google.com`, it will be checked against your RegEx. Any subsequent query to the same domain will not be checked again until you restart `pihole-FTL`. ## Hierarchy of regex filters in *FTL*DNS @@ -13,7 +13,7 @@ There are two locations where regex filters are important: ## How to use regular expressions for filtering domains -*FTL*DNS reads in regular expression filters from the two [`regex` database views](../../database/gravity/index.md). +*FTL*DNS reads in regular expression filters from the two [`regex` database views](../database/gravity/index.md). To tell *FTL*DNS to reload the list of regex filters, either: - Execute `pihole restartdns reload-lists` or @@ -26,7 +26,7 @@ The first command is to be preferred as it ensures that the DNS cache itself rem To ease the usage of regular expression filters in *FTL*DNS, we offer a regex debugging mode. Set -``` +``` plain DEBUG_REGEX=true ``` diff --git a/docs/regex/pi-hole.md b/docs/regex/pi-hole.md new file mode 100644 index 0000000..027ac87 --- /dev/null +++ b/docs/regex/pi-hole.md @@ -0,0 +1,144 @@ +# Pi-hole regex extensions + +## Only match specific query types + +You can amend the regular expressions by special keywords added at the end to fine-tune them to match only specific query types. 
+ +Example: + +``` plain +abc;querytype=AAAA +``` + +will block + +``` bash +dig AAAA abc +``` + +but not + +``` bash +dig A abc +``` + +This allows you to do query type based black-/whitelisting. Some user-provided examples are: + +- `.*;querytype=!A` + + A regex blacklist entry for blocking `AAAA` (in fact, everything other than `A`, call it "anti-`A`") requests for all clients assigned to the same group. This has been mentioned to be beneficial for devices like Chromecast. You may want to fine-tune this further to specific domains. + +- `.*;querytype=PTR` + + A regex whitelist entry used to permit `PTR` lookups with the above "anti-`A`" regex. + +- `.*;querytype=ANY` + + A regex blacklist entry to block `ANY` requests network-wide. + +## Invert matching + +Sometimes, it may be useful to be able to invert a regular expression altogether. Hence, we added the keyword `;invert` to achieve exactly this. + +For instance, + +``` plain +^abc$;querytype=AAAA;invert +``` + +will not block `abc` with type `AAAA` (but everything else) for the clients assigned to the same groups. This inversion is independent of the query type, e.g. + +``` plain +^abc$;invert +``` + +will **not** block `abc` but **everything else**. + +## Comments + +You can specify comments within your regex using the syntax + +``` plain +(?#some comment here) +``` + +The comment can contain any characters except for a closing parenthesis `)` (for the sole reason that it is the terminating element). The text in the comment is completely ignored by the regex parser and is used solely for readability purposes. + +``` plain +$ pihole-FTL regex-test "doubleclick.net" "(^|\.)doubleclick\.(?#TODO: We need to maybe support more than just .net here)net$" + +FTL Regex test: + Domain: "doubleclick.net" + Regex: "(^|\.)doubleclick\.(?#TODO: We need to maybe support more than just .net here)net$" +Step 1: Compiling regex filter... + Compiled regex filter in 0.167 msec +Step 2: Checking domain... 
+ Done in 0.032 msec + +MATCH +``` + +## Back-references + +A back reference is a backslash followed by a single non-zero decimal digit `d`. It matches *the same sequence* of characters matched by the `d`th parenthesized subexpression. + +Example: + +``` plain +"cat.foo.dog---cat%dog!foo" is matched by "(cat)\.(foo)\.(dog)---\1%\3!\2" +``` + +Another (more complex example is): + +``` plain +(1234|4321)\.(foo)\.(dog)--\1 +``` + +``` plain + MATCH: 1234.foo.dog--1234 + MATCH: 4321.foo.dog--4321 +NO MATCH: 1234.foo.dog--4321 +``` + +Mind that the last line gives no match as `\1` matches **exactly** the same sequence the first character group matched. And `4321` is not the same as `1234` even when both are valid replies for `(1234|4321)` Back references are not defined for POSIX EREs (for BREs they are, surprisingly enough). We add them to ERE in the BRE style. + +``` plain +$ pihole-FTL regex-test "someverylongandmaybecomplexthing.foo.dog--someverylongandmaybecomplexthing" "(someverylongandmaybecomplexthing|somelesscomplexitem)\.(foo)\.(dog)--\1" + +FTL Regex test: + Domain: "someverylongandmaybecomplexthing.foo.dog--someverylongandmaybecomplexthing" + Regex: "(someverylongandmaybecomplexthing|somelesscomplexitem)\.(foo)\.(dog)--\1" +Step 1: Compiling regex filter... + Compiled regex filter in 0.563 msec +Step 2: Checking domain... + Done in 0.031 msec + +MATCH +``` + +## More character classes for bracket expressions + +A bracket expression specifies a set of characters by enclosing a nonempty list of items in brackets. Normally anything matching any item in the list is matched. If the list begins with `^` the meaning is negated; any character matching no item in the list is matched. + +1. Multiple characters: `[abc]` matches `a`, `b`, and `c`. +2. Character ranges: `[0-9]` matches any decimal digit. +3. 
Character classes: + - `[:alnum:]` alphanumeric characters + - `[:alpha:]` alphabetic characters + - `[:blank:]` blank characters + - `[:cntrl:]` control characters + - `[:digit:]` decimal digits (0 - 9) + - `[:graph:]` all printable characters except space + - `[:lower:]` lower-case letters (FTL matches case-insensitive by default) + - `[:print:]` printable characters including space + - `[:punct:]` printable characters not space or alphanumeric + - `[:space:]` white-space characters + - `[:upper:]` upper-case letters (FTL matches case-insensitive by default) + - `[:xdigit:]` hexadecimal digits + +Furthermore, there are two shortcuts for some character classes: + +- `\d` - Digit character (equivalent to `[[:digit:]]`) +- `\D` - Non-digit character (equivalent to `[^[:digit:]]`) + +{!abbreviations.md!} diff --git a/docs/regex/testmode.md b/docs/regex/testmode.md new file mode 100644 index 0000000..f141df7 --- /dev/null +++ b/docs/regex/testmode.md @@ -0,0 +1,19 @@ +# Regex Test mode + +In order to ease regex development, we added a regex test mode to `pihole-FTL` which can be invoked like + +``` bash +pihole-FTL regex-test doubleclick.net +``` + +(test `doubleclick.net` against all regexes in the gravity database), or + +``` bash +pihole-FTL regex-test doubleclick.net "(^|\.)double" +``` + +(test `doubleclick.net` against the CLI-provided regex `(^|\.)double`). + +You do NOT need `sudo` for this; any arbitrary user should be able to run this command. The test returns `0` on match and `1` on no match and errors, hence, it may be used for scripting. 
+ +{!abbreviations.md!} diff --git a/docs/ftldns/regex/tutorial.md b/docs/regex/tutorial.md similarity index 87% rename from docs/ftldns/regex/tutorial.md rename to docs/regex/tutorial.md index d96962f..0ca66e9 100644 --- a/docs/ftldns/regex/tutorial.md +++ b/docs/regex/tutorial.md @@ -89,13 +89,20 @@ Example | Interpretation In addition to character groups, there are also some special character classes available, such as -Character class | Group equivalent | Interpretation ---- | --- | --- -`[:digit:]` | `[0-9]` | matches digits -`[:lower:]` | `[a-z]` | matched lowercase letters -`[:upper:]` | `[A-Z]` | matched uppercase letters -`[:alpha:]` | `[A-Za-z]` | matches alphabetic characters -`[:alnum:]` | `[A-Za-z0-9]` | matches alphabetic characters and digits +Character class | Group equivalent | Pi-hole specific | Interpretation +--------------- | ---------------- | ---------------- | --------------- +`[:digit:]` | `[0-9]` | No | matches digits +`[:lower:]` | `[a-z]` | No | matches lowercase letters (FTL matches case-insensitive by default) +`[:upper:]` | `[A-Z]` | No | matches uppercase letters (FTL matches case-insensitive by default) +`[:alpha:]` | `[A-Za-z]` | No | matches alphabetic characters +`[:alnum:]` | `[A-Za-z0-9]` | No | matches alphabetic characters and digits +`[:blank:]` | `[ \t]` | Yes | blank characters +`[:cntrl:]` | N/A | Yes | control characters +`[:graph:]` | N/A | Yes | all printable characters except space +`[:print:]` | N/A | Yes | printable characters including space +`[:punct:]` | N/A | Yes | printable characters not space or alphanumeric +`[:space:]` | `[ \f\n\r\t\v]` | Yes | white-space characters +`[:xdigit:]` | `[0-9a-fA-F]` | Yes | hexadecimal digits # Advanced examples @@ -107,7 +114,7 @@ After going through our quick tutorial, we provide some more advanced examples s ^[0-9][^a-z]+\.((com)|(edu))$ ``` -Blocks domains containing only numbers (no letters) and ending in `.com` or `.edu`. 
Blocks `555661.com`, and `456.edu`, but not `555g555.com` +Blocks domains containing only numbers (no letters) and ending in `.com` or `.edu`. This blocks `555661.com`, and `456.edu`, but not `555g555.com` ### Block domains without subdomains diff --git a/mkdocs.yml b/mkdocs.yml index 2c9952e..f8e4264 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -79,9 +79,6 @@ nav: - 'DNS resolver': ftldns/dns-resolver.md - 'DNS cache': ftldns/dns-cache.md - 'Blocking mode': ftldns/blockingmode.md - - 'RegEx blocking': - - "Overview": ftldns/regex/overview.md - - "Tutorial": ftldns/regex/tutorial.md - 'Privacy levels': ftldns/privacylevels.md - 'Telnet API': ftldns/telnet-api.md - 'Signals': 'ftldns/signals.md' @@ -89,6 +86,12 @@ nav: - 'Install from source': ftldns/compile.md - 'Debugging FTLDNS': ftldns/debugging.md - 'In-depth manual': ftldns/in-depth.md + - 'RegEx blocking': + - "Overview": regex/overview.md + - "Testing": regex/testmode.md + - "Tutorial": regex/tutorial.md + - "Pi-hole extensions": regex/pi-hole.md + - "Approximate matching": regex/approximate.md - 'Docker': - 'DHCP': docker/DHCP.md - 'Contributing': @@ -158,7 +161,6 @@ plugins: - redirects: redirect_maps: 'ftldns/database.md': database/index.md - 'ftldns/regex/index.md': ftldns/regex/overview.md 'main/presentations.md': index.md 'main/prerequesites.md': main/prerequisites.md 'guides/unbound.md': guides/dns/unbound.md @@ -185,3 +187,6 @@ plugins: 'guides/nginx-configuration.md': 'guides/webserver/nginx.md' 'guides/caddy-configuration.md': 'guides/webserver/caddy.md' 'guides/traefik-configuration-nodocker.md': 'guides/webserver/traefik-nodocker.md' + 'ftldns/regex/index.md': regex/overview.md + 'ftldns/regex/overview.md': regex/overview.md + 'ftldns/regex/tutorial.md': regex/tutorial.md diff --git a/package.json b/package.json index a45d561..3556f06 100644 --- a/package.json +++ b/package.json @@ -18,7 +18,7 @@ "scripts": { "build": "mkdocs build --clean", "markdownlint": "markdownlint-cli2 \"**/*.md\" 
\"!**/node_modules/**\"", - "linkinator": "linkinator site --recurse --silent --skip \"^(?!http://localhost)\"", + "linkinator": "npm run build && linkinator site --recurse --silent --skip \"^(?!http://localhost)\"", "test": "npm run markdownlint && npm run linkinator" }, "devDependencies": {