Skip to content

Commit 6b13c56

Browse files
authored
Fixed blog readtime calculation to ignore non-content text (#7370)
1 parent 4f8081c commit 6b13c56

File tree

2 files changed

+64
-6
lines changed

2 files changed

+64
-6
lines changed

Diff for: material/plugins/blog/readtime/parser.py

+32-3
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@
2020

2121
from html.parser import HTMLParser
2222

23+
# TODO: Refactor the `void` set into a common module and import it from there
24+
# and not from the search plugin.
25+
from material.plugins.search.plugin import void
26+
2327
# -----------------------------------------------------------------------------
2428
# Classes
2529
# -----------------------------------------------------------------------------
@@ -31,15 +35,40 @@ class ReadtimeParser(HTMLParser):
3135
def __init__(self):
    # Set up the underlying HTML parser, asking it to convert character
    # references before text is handed to `handle_data`
    super().__init__(convert_charrefs = True)

    # Number of images and chunks of text collected so far
    self.images = 0
    self.text = []

    # Stack of tags that are currently open, innermost last
    self.context = []

    # Tags whose text contents are excluded from the collected text
    self.skip = {
        "object",                  # Objects
        "script",                  # Scripts
        "style",                   # Styles
        "svg",                     # SVGs
    }
3752

38-
# Collect images
53+
# Called at the start of every HTML tag
3954
def handle_starttag(self, tag, attrs):
    # Every image counts towards the image total
    if tag == "img":
        self.images += 1

    # Self-closing (void) tags are never added to the context
    if tag in void:
        return

    # Any other tag becomes the innermost entry of the current context
    self.context.append(tag)
63+
64+
# Called for the text contents of each tag
4465
def handle_data(self, data):
    # Collect `data` unless one of the currently open tags is a tag to
    # skip. `isdisjoint` stops at the first shared tag and, in contrast to
    # `intersection`, does not build a temporary set for every text node.
    if self.skip.isdisjoint(self.context):
        self.text.append(data)
69+
70+
# Called at the end of every HTML tag
71+
def handle_endtag(self, tag):
    # Close the innermost open element named `tag`, together with anything
    # that was left open inside of it. Comparing against the top of the
    # stack only would leave a skipped tag (e.g. `svg`) in the context
    # forever as soon as one of its children is missing an end tag, which
    # would silently drop all text that follows. End tags that match no
    # open element are ignored.
    context = self.context
    for index in range(len(context) - 1, -1, -1):
        if context[index] == tag:
            # Remove tag and its unclosed descendants from context
            del context[index:]
            break

Diff for: src/plugins/blog/readtime/parser.py

+32-3
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@
2020

2121
from html.parser import HTMLParser
2222

23+
# TODO: Refactor the `void` set into a common module and import it from there
24+
# and not from the search plugin.
25+
from material.plugins.search.plugin import void
26+
2327
# -----------------------------------------------------------------------------
2428
# Classes
2529
# -----------------------------------------------------------------------------
@@ -31,15 +35,40 @@ class ReadtimeParser(HTMLParser):
3135
def __init__(self):
    # Set up the underlying HTML parser, asking it to convert character
    # references before text is handed to `handle_data`
    super().__init__(convert_charrefs = True)

    # Number of images and chunks of text collected so far
    self.images = 0
    self.text = []

    # Stack of tags that are currently open, innermost last
    self.context = []

    # Tags whose text contents are excluded from the collected text
    self.skip = {
        "object",                  # Objects
        "script",                  # Scripts
        "style",                   # Styles
        "svg",                     # SVGs
    }
3752

38-
# Collect images
53+
# Called at the start of every HTML tag
3954
def handle_starttag(self, tag, attrs):
    # Every image counts towards the image total
    if tag == "img":
        self.images += 1

    # Self-closing (void) tags are never added to the context
    if tag in void:
        return

    # Any other tag becomes the innermost entry of the current context
    self.context.append(tag)
63+
64+
# Called for the text contents of each tag
4465
def handle_data(self, data):
    # Collect `data` unless one of the currently open tags is a tag to
    # skip. `isdisjoint` stops at the first shared tag and, in contrast to
    # `intersection`, does not build a temporary set for every text node.
    if self.skip.isdisjoint(self.context):
        self.text.append(data)
69+
70+
# Called at the end of every HTML tag
71+
def handle_endtag(self, tag):
    # Close the innermost open element named `tag`, together with anything
    # that was left open inside of it. Comparing against the top of the
    # stack only would leave a skipped tag (e.g. `svg`) in the context
    # forever as soon as one of its children is missing an end tag, which
    # would silently drop all text that follows. End tags that match no
    # open element are ignored.
    context = self.context
    for index in range(len(context) - 1, -1, -1):
        if context[index] == tag:
            # Remove tag and its unclosed descendants from context
            del context[index:]
            break

0 commit comments

Comments
 (0)