Fixed blog readtime calculation to ignore non-content text (#7370)

sisp · web-flow · commit 6b13c560f5bf · 2024-07-16T15:49:13.000+02:00
diff --git a/material/plugins/blog/readtime/parser.py b/material/plugins/blog/readtime/parser.py
@@ -20,6 +20,10 @@
 
 from html.parser import HTMLParser
 
+# TODO: Refactor the `void` set into a common module and import it from there
+# and not from the search plugin.
+from material.plugins.search.plugin import void
+
 # -----------------------------------------------------------------------------
 # Classes
 # -----------------------------------------------------------------------------
@@ -31,15 +35,40 @@ class ReadtimeParser(HTMLParser):
     def __init__(self):
         super().__init__(convert_charrefs = True)
 
+        # Tags to skip
+        self.skip = set([
+            "object",                  # Objects
+            "script",                  # Scripts
+            "style",                   # Styles
+            "svg"                      # SVGs
+        ])
+
+        # Stack of currently open (non-void) tags
+        self.context = []
+
         # Keep track of text and images
         self.text   = []
         self.images = 0
 
-    # Collect images
+    # Called at the start of every HTML tag
     def handle_starttag(self, tag, attrs):
+        # Collect images
         if tag == "img":
             self.images += 1
 
-    # Collect text
+        # Skip self-closing (void) tags, as they cannot enclose any text
+        if tag not in void:
+            # Add tag to context
+            self.context.append(tag)
+
+    # Called for the text contents of each tag
     def handle_data(self, data):
-        self.text.append(data)
+        # Collect text unless any enclosing tag is in the skip set
+        if not self.skip.intersection(self.context):
+            self.text.append(data)
+
+    # Called at the end of every HTML tag
+    def handle_endtag(self, tag):
+        if self.context and self.context[-1] == tag:
+            # Remove tag from context
+            self.context.pop()
diff --git a/src/plugins/blog/readtime/parser.py b/src/plugins/blog/readtime/parser.py
@@ -20,6 +20,10 @@
 
 from html.parser import HTMLParser
 
+# TODO: Refactor the `void` set into a common module and import it from there
+# and not from the search plugin.
+from material.plugins.search.plugin import void
+
 # -----------------------------------------------------------------------------
 # Classes
 # -----------------------------------------------------------------------------
@@ -31,15 +35,40 @@ class ReadtimeParser(HTMLParser):
     def __init__(self):
         super().__init__(convert_charrefs = True)
 
+        # Tags to skip
+        self.skip = set([
+            "object",                  # Objects
+            "script",                  # Scripts
+            "style",                   # Styles
+            "svg"                      # SVGs
+        ])
+
+        # Stack of currently open (non-void) tags
+        self.context = []
+
         # Keep track of text and images
         self.text   = []
         self.images = 0
 
-    # Collect images
+    # Called at the start of every HTML tag
     def handle_starttag(self, tag, attrs):
+        # Collect images
         if tag == "img":
             self.images += 1
 
-    # Collect text
+        # Skip self-closing (void) tags, as they cannot enclose any text
+        if tag not in void:
+            # Add tag to context
+            self.context.append(tag)
+
+    # Called for the text contents of each tag
     def handle_data(self, data):
-        self.text.append(data)
+        # Collect text unless any enclosing tag is in the skip set
+        if not self.skip.intersection(self.context):
+            self.text.append(data)
+
+    # Called at the end of every HTML tag
+    def handle_endtag(self, tag):
+        if self.context and self.context[-1] == tag:
+            # Remove tag from context
+            self.context.pop()