From 9935a20740d0b47a700f67e19bfaa00668b116c7 Mon Sep 17 00:00:00 2001 From: tsutsu3 Date: Tue, 20 Oct 2020 02:32:20 +0900 Subject: [PATCH 1/5] Add simple typographic replacements --- markdown_it/parser_core.py | 4 +- markdown_it/rules_core/__init__.py | 1 + markdown_it/rules_core/replacements.js | 107 ------------------------ markdown_it/rules_core/replacements.py | 100 ++++++++++++++++++++++ tests/test_api/test_main.py | 2 +- tests/test_port/fixtures/typographer.md | 13 +++ tests/test_port/test_fixtures.py | 10 +++ 7 files changed, 127 insertions(+), 110 deletions(-) delete mode 100644 markdown_it/rules_core/replacements.js create mode 100644 markdown_it/rules_core/replacements.py diff --git a/markdown_it/parser_core.py b/markdown_it/parser_core.py index c69a0510..331ed153 100644 --- a/markdown_it/parser_core.py +++ b/markdown_it/parser_core.py @@ -8,7 +8,7 @@ from .ruler import Ruler from .rules_core.state_core import StateCore -from .rules_core import normalize, block, inline +from .rules_core import normalize, block, inline, replace # TODO linkify, replacements, smartquotes _rules = [ @@ -16,7 +16,7 @@ ["block", block], ["inline", inline], # [ 'linkify', require('./rules_core/linkify') ], - # [ 'replacements', require('./rules_core/replacements') ], + ['replacements', replace], # [ 'smartquotes', require('./rules_core/smartquotes') ] ] diff --git a/markdown_it/rules_core/__init__.py b/markdown_it/rules_core/__init__.py index a5b8764e..72757f51 100644 --- a/markdown_it/rules_core/__init__.py +++ b/markdown_it/rules_core/__init__.py @@ -2,3 +2,4 @@ from .normalize import normalize # noqa: F401 from .block import block # noqa: F401 from .inline import inline # noqa: F401 +from .replacements import replace # noqa: F401 diff --git a/markdown_it/rules_core/replacements.js b/markdown_it/rules_core/replacements.js deleted file mode 100644 index 533496fa..00000000 --- a/markdown_it/rules_core/replacements.js +++ /dev/null @@ -1,107 +0,0 @@ -// Simple typographic replacements -// -// (c) (C) → © -// (tm) (TM) → ™ -// (r) (R) → ® -// +- → ± -// (p) (P) -> § -// ... → … (also ?.... → ?.., !.... → !..) -// ???????? → ???, !!!!! → !!!, `,,` → `,` -// -- → –, --- → — -// -'use strict'; - -// TODO: -// - fractionals 1/2, 1/4, 3/4 -> ½, ¼, ¾ -// - miltiplication 2 x 4 -> 2 × 4 - -var RARE_RE = /\+-|\.\.|\?\?\?\?|!!!!|,,|--/; - -// Workaround for phantomjs - need regex without /g flag, -// or root check will fail every second time -var SCOPED_ABBR_TEST_RE = /\((c|tm|r|p)\)/i; - -var SCOPED_ABBR_RE = /\((c|tm|r|p)\)/ig; -var SCOPED_ABBR = { - c: '©', - r: '®', - p: '§', - tm: '™' -}; - -function replaceFn(match, name) { - return SCOPED_ABBR[name.toLowerCase()]; -} - -function replace_scoped(inlineTokens) { - var i, token, inside_autolink = 0; - - for (i = inlineTokens.length - 1; i >= 0; i--) { - token = inlineTokens[i]; - - if (token.type === 'text' && !inside_autolink) { - token.content = token.content.replace(SCOPED_ABBR_RE, replaceFn); - } - - if (token.type === 'link_open' && token.info === 'auto') { - inside_autolink--; - } - - if (token.type === 'link_close' && token.info === 'auto') { - inside_autolink++; - } - } -} - -function replace_rare(inlineTokens) { - var i, token, inside_autolink = 0; - - for (i = inlineTokens.length - 1; i >= 0; i--) { - token = inlineTokens[i]; - - if (token.type === 'text' && !inside_autolink) { - if (RARE_RE.test(token.content)) { - token.content = token.content - .replace(/\+-/g, '±') - // .., ..., ....... -> … - // but ?..... & !..... -> ?.. & !.. 
- .replace(/\.{2,}/g, '…').replace(/([?!])…/g, '$1..') - .replace(/([?!]){4,}/g, '$1$1$1').replace(/,{2,}/g, ',') - // em-dash - .replace(/(^|[^-])---(?=[^-]|$)/mg, '$1\u2014') - // en-dash - .replace(/(^|\s)--(?=\s|$)/mg, '$1\u2013') - .replace(/(^|[^-\s])--(?=[^-\s]|$)/mg, '$1\u2013'); - } - } - - if (token.type === 'link_open' && token.info === 'auto') { - inside_autolink--; - } - - if (token.type === 'link_close' && token.info === 'auto') { - inside_autolink++; - } - } -} - - -module.exports = function replace(state) { - var blkIdx; - - if (!state.md.options.typographer) { return; } - - for (blkIdx = state.tokens.length - 1; blkIdx >= 0; blkIdx--) { - - if (state.tokens[blkIdx].type !== 'inline') { continue; } - - if (SCOPED_ABBR_TEST_RE.test(state.tokens[blkIdx].content)) { - replace_scoped(state.tokens[blkIdx].children); - } - - if (RARE_RE.test(state.tokens[blkIdx].content)) { - replace_rare(state.tokens[blkIdx].children); - } - - } -}; diff --git a/markdown_it/rules_core/replacements.py b/markdown_it/rules_core/replacements.py new file mode 100644 index 00000000..6956f660 --- /dev/null +++ b/markdown_it/rules_core/replacements.py @@ -0,0 +1,100 @@ +"""Simple typographic replacements + +(c) (C) → © +(tm) (TM) → ™ +(r) (R) → ® ++- → ± +(p) (P) -> § +... → … (also ?.... → ?.., !.... → !..) +???????? → ???, !!!!! → !!!, `,,` → `,` +-- → –, --- → — +""" +import logging +import re +from typing import List + +from .state_core import StateCore +from ..token import Token + +LOGGER = logging.getLogger(__name__) + +# TODO: +# - fractionals 1/2, 1/4, 3/4 -> ½, ¼, ¾ +# - miltiplication 2 x 4 -> 2 × 4 + +RARE_RE = r"\+-|\.\.|\?\?\?\?|!!!!|,,|--" + +# Workaround for phantomjs - need regex without /g flag, +# or root check will fail every second time +# SCOPED_ABBR_TEST_RE = r"\((c|tm|r|p)\)" + +SCOPED_ABBR_RE = r"\((c|tm|r|p)\)" + +SCOPED_ABBR = { + "c": "©", + "r": "®", + "p": "§", + "tm": "™" +} + + +def replaceFn(match: re.Match): + return SCOPED_ABBR[match.group(1).lower()] + + +def replace_scoped(inlineTokens: List[Token]): + inside_autolink = 0 + + for token in inlineTokens: + if token.type == "text" and not inside_autolink: + token.content = re.sub(SCOPED_ABBR_RE, replaceFn, token.content, flags=re.IGNORECASE) + + if token.type == "link_open" and token.info == "auto": + inside_autolink -= 1 + + if token.type == "link_close" and token.info == "auto": + inside_autolink += 1 + + +def replace_rare(inlineTokens: List[Token]): + inside_autolink = 0 + + for token in inlineTokens: + if token.type == "text" and not inside_autolink: + if re.search(RARE_RE, token.content): + token.content = re.sub(r"\+-", "±", token.content) + # .., ..., ....... -> … + # but ?..... & !..... -> ?.. & !.. 
+ token.content = re.sub(r"\.{2,}", "…", token.content) + token.content = re.sub(r"([?!])…", "\\1..", token.content) + token.content = re.sub(r"([?!]){4,}", "\\1\\1\\1", token.content) + token.content = re.sub(r",{2,}", ",", token.content) + # em-dash + token.content = re.sub(r"(^|[^-])---(?=[^-]|$)", + "\\1\u2014", token.content, flags=re.MULTILINE) + # en-dash + token.content = re.sub(r"(^|\s)--(?=\s|$)", "\\1\u2013", + token.content, flags=re.MULTILINE) + token.content = re.sub(r"(^|[^-\s])--(?=[^-\s]|$)", + "\\1\u2013", token.content, flags=re.MULTILINE) + + if token.type == "link_open" and token.info == "auto": + inside_autolink -= 1 + + if token.type == "link_close" and token.info == "auto": + inside_autolink += 1 + + +def replace(state: StateCore): + if not state.md.options.typographer: + return + + for token in state.tokens: + if token.type != "inline": + continue + + if re.search(SCOPED_ABBR_RE, token.content, flags=re.IGNORECASE): + replace_scoped(token.children) + + if re.search(RARE_RE, token.content): + replace_rare(token.children) diff --git a/tests/test_api/test_main.py b/tests/test_api/test_main.py index 575a246e..3eebdf60 100644 --- a/tests/test_api/test_main.py +++ b/tests/test_api/test_main.py @@ -7,7 +7,7 @@ def test_get_rules(): md = MarkdownIt("zero") # print(md.get_all_rules()) assert md.get_all_rules() == { - "core": ["normalize", "block", "inline"], + "core": ["normalize", "block", "inline", "replacements"], "block": [ "table", "code", diff --git a/tests/test_port/fixtures/typographer.md b/tests/test_port/fixtures/typographer.md index e9e2bfee..3ae8fd86 100644 --- a/tests/test_port/fixtures/typographer.md +++ b/tests/test_port/fixtures/typographer.md @@ -4,6 +4,19 @@
 .
 <p>(bad)</p>
 .
+copyright (Lower)
+.
+(c)
+.
+<p>©</p>
+.
+
+copyright (Upper)
+.
+(C)
+.
+<p>©</p>
+. copyright . diff --git a/tests/test_port/test_fixtures.py b/tests/test_port/test_fixtures.py index 1bc74220..11c93b71 100644 --- a/tests/test_port/test_fixtures.py +++ b/tests/test_port/test_fixtures.py @@ -8,6 +8,16 @@ FIXTURE_PATH = Path(__file__).parent.joinpath("fixtures") +@pytest.mark.parametrize( + "line,title,input,expected", read_fixture_file(FIXTURE_PATH.joinpath("typographer.md")) +) +def test_typographer(line, title, input, expected): + md = MarkdownIt().enable("replacements") + md.options["typographer"] = True + text = md.render(input) + assert text.rstrip() == expected.rstrip() + + @pytest.mark.parametrize( "line,title,input,expected", read_fixture_file(FIXTURE_PATH.joinpath("tables.md")) ) From bb9b840dcea3c1e219ddefcd69dbfe580c77e362 Mon Sep 17 00:00:00 2001 From: tsutsu3 Date: Tue, 20 Oct 2020 15:10:31 +0900 Subject: [PATCH 2/5] Fix format --- markdown_it/parser_core.py | 2 +- markdown_it/rules_core/replacements.py | 36 +++++++++++++++----------- tests/test_port/test_fixtures.py | 3 ++- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/markdown_it/parser_core.py b/markdown_it/parser_core.py index 331ed153..98ce7aa2 100644 --- a/markdown_it/parser_core.py +++ b/markdown_it/parser_core.py @@ -16,7 +16,7 @@ ["block", block], ["inline", inline], # [ 'linkify', require('./rules_core/linkify') ], - ['replacements', replace], + ["replacements", replace], # [ 'smartquotes', require('./rules_core/smartquotes') ] ] diff --git a/markdown_it/rules_core/replacements.py b/markdown_it/rules_core/replacements.py index 6956f660..ef88805c 100644 --- a/markdown_it/rules_core/replacements.py +++ b/markdown_it/rules_core/replacements.py @@ -11,7 +11,7 @@ """ import logging import re -from typing import List +from typing import List, Match from .state_core import StateCore from ..token import Token @@ -30,15 +30,10 @@ SCOPED_ABBR_RE = r"\((c|tm|r|p)\)" -SCOPED_ABBR = { - "c": "©", - "r": "®", - "p": "§", - "tm": "™" -} +SCOPED_ABBR = {"c": "©", "r": "®", "p": "§", "tm": "™"} -def replaceFn(match: re.Match): +def replaceFn(match: Match): return SCOPED_ABBR[match.group(1).lower()] @@ -47,7 +42,9 @@ def replace_scoped(inlineTokens: List[Token]): for token in inlineTokens: if token.type == "text" and not inside_autolink: - token.content = re.sub(SCOPED_ABBR_RE, replaceFn, token.content, flags=re.IGNORECASE) + token.content = re.sub( + SCOPED_ABBR_RE, replaceFn, token.content, flags=re.IGNORECASE + ) if token.type == "link_open" and token.info == "auto": inside_autolink -= 1 @@ -70,13 +67,22 @@ def replace_rare(inlineTokens: List[Token]): token.content = re.sub(r"([?!]){4,}", "\\1\\1\\1", token.content) token.content = re.sub(r",{2,}", ",", token.content) # em-dash - token.content = re.sub(r"(^|[^-])---(?=[^-]|$)", - "\\1\u2014", token.content, flags=re.MULTILINE) + token.content = re.sub( + r"(^|[^-])---(?=[^-]|$)", + "\\1\u2014", + token.content, + flags=re.MULTILINE, + ) # en-dash - token.content = re.sub(r"(^|\s)--(?=\s|$)", "\\1\u2013", - token.content, flags=re.MULTILINE) - token.content = re.sub(r"(^|[^-\s])--(?=[^-\s]|$)", - "\\1\u2013", token.content, flags=re.MULTILINE) + token.content = re.sub( + r"(^|\s)--(?=\s|$)", "\\1\u2013", token.content, flags=re.MULTILINE + ) + token.content = re.sub( + r"(^|[^-\s])--(?=[^-\s]|$)", + "\\1\u2013", + token.content, + flags=re.MULTILINE, + ) if token.type == "link_open" and token.info == "auto": inside_autolink -= 1 diff --git a/tests/test_port/test_fixtures.py b/tests/test_port/test_fixtures.py index 11c93b71..68c09fb7 100644 --- 
a/tests/test_port/test_fixtures.py +++ b/tests/test_port/test_fixtures.py @@ -9,7 +9,8 @@ @pytest.mark.parametrize( - "line,title,input,expected", read_fixture_file(FIXTURE_PATH.joinpath("typographer.md")) + "line,title,input,expected", + read_fixture_file(FIXTURE_PATH.joinpath("typographer.md")), ) def test_typographer(line, title, input, expected): md = MarkdownIt().enable("replacements") From 523278b3028b7e8bc0e7a89f8f35563e3a528ca8 Mon Sep 17 00:00:00 2001 From: tsutsu3 Date: Tue, 20 Oct 2020 17:58:42 +0900 Subject: [PATCH 3/5] Change to pre-compile when using regex --- markdown_it/rules_core/replacements.py | 68 +++++++++++++++---------- tests/test_port/fixtures/typographer.md | 8 +++ 2 files changed, 48 insertions(+), 28 deletions(-) diff --git a/markdown_it/rules_core/replacements.py b/markdown_it/rules_core/replacements.py index ef88805c..dc5ead3b 100644 --- a/markdown_it/rules_core/replacements.py +++ b/markdown_it/rules_core/replacements.py @@ -22,13 +22,30 @@ # - fractionals 1/2, 1/4, 3/4 -> ½, ¼, ¾ # - miltiplication 2 x 4 -> 2 × 4 -RARE_RE = r"\+-|\.\.|\?\?\?\?|!!!!|,,|--" +RARE_RE = re.compile(r"\+-|\.\.|\?\?\?\?|!!!!|,,|--") # Workaround for phantomjs - need regex without /g flag, # or root check will fail every second time # SCOPED_ABBR_TEST_RE = r"\((c|tm|r|p)\)" -SCOPED_ABBR_RE = r"\((c|tm|r|p)\)" +SCOPED_ABBR_RE = re.compile(r"\((c|tm|r|p)\)", flags=re.IGNORECASE) + +PLUS_MINUS_RE = re.compile(r"\+-") + +ELLIPSIS_RE = re.compile(r"\.{2,}") + +ELLIPSIS_QUESTION_EXCLAMATION_RE = re.compile(r"([?!])…") + +QUESTION_EXCLAMATION_RE = re.compile(r"([?!]){4,}") + +COMMA_RE = re.compile(r",{2,}") + +EM_DASH_RE = re.compile(r"(^|[^-])---(?=[^-]|$)", flags=re.MULTILINE) + +EN_DASH_RE = re.compile(r"(^|\s)--(?=\s|$)", flags=re.MULTILINE) + +EN_DASH_INDENT_RE = re.compile(r"(^|[^-\s])--(?=[^-\s]|$)", flags=re.MULTILINE) + SCOPED_ABBR = {"c": "©", "r": "®", "p": "§", "tm": "™"} @@ -42,9 +59,7 @@ def replace_scoped(inlineTokens: List[Token]): for token in inlineTokens: if token.type == "text" and not inside_autolink: - token.content = re.sub( - SCOPED_ABBR_RE, replaceFn, token.content, flags=re.IGNORECASE - ) + token.content = SCOPED_ABBR_RE.sub(replaceFn, token.content) if token.type == "link_open" and token.info == "auto": inside_autolink -= 1 @@ -58,31 +73,28 @@ def replace_rare(inlineTokens: List[Token]): for token in inlineTokens: if token.type == "text" and not inside_autolink: - if re.search(RARE_RE, token.content): - token.content = re.sub(r"\+-", "±", token.content) + if RARE_RE.search(token.content): + # +- -> ± + token.content = PLUS_MINUS_RE.sub("±", token.content) + # .., ..., ....... -> … + token.content = ELLIPSIS_RE.sub("…", token.content) + # but ?..... & !..... -> ?.. & !.. 
- token.content = re.sub(r"\.{2,}", "…", token.content) - token.content = re.sub(r"([?!])…", "\\1..", token.content) - token.content = re.sub(r"([?!]){4,}", "\\1\\1\\1", token.content) - token.content = re.sub(r",{2,}", ",", token.content) - # em-dash - token.content = re.sub( - r"(^|[^-])---(?=[^-]|$)", - "\\1\u2014", - token.content, - flags=re.MULTILINE, + token.content = ELLIPSIS_QUESTION_EXCLAMATION_RE.sub( + "\\1..", token.content ) + token.content = QUESTION_EXCLAMATION_RE.sub("\\1\\1\\1", token.content) + + # ,, ,,, ,,,, -> , + token.content = COMMA_RE.sub(",", token.content) + + # em-dash + token.content = EM_DASH_RE.sub("\\1\u2014", token.content) + # en-dash - token.content = re.sub( - r"(^|\s)--(?=\s|$)", "\\1\u2013", token.content, flags=re.MULTILINE - ) - token.content = re.sub( - r"(^|[^-\s])--(?=[^-\s]|$)", - "\\1\u2013", - token.content, - flags=re.MULTILINE, - ) + token.content = EN_DASH_RE.sub("\\1\u2013", token.content) + token.content = EN_DASH_INDENT_RE.sub("\\1\u2013", token.content) if token.type == "link_open" and token.info == "auto": inside_autolink -= 1 @@ -99,8 +111,8 @@ def replace(state: StateCore): if token.type != "inline": continue - if re.search(SCOPED_ABBR_RE, token.content, flags=re.IGNORECASE): + if SCOPED_ABBR_RE.search(token.content): replace_scoped(token.children) - if re.search(RARE_RE, token.content): + if RARE_RE.search(token.content): replace_rare(token.children) diff --git a/tests/test_port/fixtures/typographer.md b/tests/test_port/fixtures/typographer.md index 3ae8fd86..39154ed0 100644 --- a/tests/test_port/fixtures/typographer.md +++ b/tests/test_port/fixtures/typographer.md @@ -74,6 +74,14 @@ dupes . +dupes-ellipsis +. +!... ?... ,... !!!!!!.... ????.... ,,... +. +
+<p>!.. ?.. ,… !!!.. ???.. ,…</p>
+. + + dashes . ---markdownit --- super--- From e38bbbdeb265b3bf4f8efe4f48afe901c17077af Mon Sep 17 00:00:00 2001 From: tsutsu3 Date: Tue, 20 Oct 2020 18:48:11 +0900 Subject: [PATCH 4/5] Fix sphinx warnings --- docs/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 9488b636..142a3a6e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -45,6 +45,8 @@ # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] +nitpick_ignore = [("py:class", "Match")] + # -- Options for HTML output ------------------------------------------------- From b07d778df2a27172faea10a246c77635089eb919 Mon Sep 17 00:00:00 2001 From: tsutsu3 Date: Tue, 20 Oct 2020 23:02:10 +0900 Subject: [PATCH 5/5] Fix sphinx --- markdown_it/rules_core/replacements.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/markdown_it/rules_core/replacements.py b/markdown_it/rules_core/replacements.py index dc5ead3b..86e3c3c4 100644 --- a/markdown_it/rules_core/replacements.py +++ b/markdown_it/rules_core/replacements.py @@ -1,13 +1,18 @@ """Simple typographic replacements -(c) (C) → © -(tm) (TM) → ™ -(r) (R) → ® -+- → ± -(p) (P) -> § -... → … (also ?.... → ?.., !.... → !..) -???????? → ???, !!!!! → !!!, `,,` → `,` --- → –, --- → — +* ``(c)``, ``(C)`` → © +* ``(tm)``, ``(TM)`` → ™ +* ``(r)``, ``(R)`` → ® +* ``(p)``, ``(P)`` → § +* ``+-`` → ± +* ``...`` → … +* ``?....`` → ?.. +* ``!....`` → !.. +* ``????????`` → ??? +* ``!!!!!`` → !!! +* ``,,,`` → , +* ``--`` → &ndash +* ``---`` → &mdash """ import logging import re @@ -50,7 +55,7 @@ SCOPED_ABBR = {"c": "©", "r": "®", "p": "§", "tm": "™"} -def replaceFn(match: Match): +def replaceFn(match: Match[str]): return SCOPED_ABBR[match.group(1).lower()]
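
Taken together, the series adds a `replacements` core rule gated behind the `typographer` option and then pre-compiles its regexes. Below is a minimal usage sketch, mirroring the `test_typographer` test and the fixtures added above; the rule name, option name, and expected outputs are taken from the patches, and it assumes a markdown-it-py checkout that includes this series:

    from markdown_it import MarkdownIt

    # Enable the new core rule and switch on the typographer option,
    # as test_typographer does in PATCH 1/5.
    md = MarkdownIt().enable("replacements")
    md.options["typographer"] = True

    # Scoped abbreviations: (c)/(C), (tm), (r), (p) are rewritten in text tokens.
    print(md.render("(c) (C)"))         # <p>© ©</p>        (per the "copyright" fixtures)
    # Rare sequences: ellipses and repeated ?/!/, are normalized.
    print(md.render("!... ?... ,..."))  # <p>!.. ?.. ,…</p>  (per the "dupes-ellipsis" fixture)

Note that both `replace_scoped` and `replace_rare` track an `inside_autolink` counter, so text inside autolinks is left untouched by these substitutions.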