FlyingFathead
diff --git a/‎README.md
Lines changed: 1 addition & 0 deletions b/‎README.md
Lines changed: 1 addition & 0 deletions
diff --git a/‎api_key.py
100755100644 b/‎api_key.py
100755100644
diff --git a/‎bot_commands.py
100755100644 b/‎bot_commands.py
100755100644
diff --git a/‎bot_token.py
100755100644 b/‎bot_token.py
100755100644
diff --git a/‎main.py
Lines changed: 9 additions & 56 deletions b/‎main.py
Lines changed: 9 additions & 56 deletions
diff --git a/‎modules.py
Lines changed: 72 additions & 0 deletions b/‎modules.py
Lines changed: 72 additions & 0 deletions
diff --git a/‎utils.py
100755100644 b/‎utils.py
100755100644
@@ -31,6 +31,7 @@ Other requirements:
 - Use `configmerger.py` to migrate old configuration files to a newer version's `config.ini`. To do this, copy your existing config to a file (e.g. `myconfig.txt`) that contains the lines you want to keep for the newer version, then run `python configmerger.py config.ini myconfig.txt`; all your existing config lines will be migrated to the new one. This works in most cases, but be careful and double-check for migration issues with a tool such as `diff`!
 
 # Changelog
+- v0.39 - better parsing of code blocks, HTML and other markup; further modularized (see `modules.py`)
 - v0.38 - keep better record of daily token usage, streamlined (**note**: you will need to clear out your existing `token_usage.json`, the file structure has changed from the previous version)
 - v0.37 - better enforcing of voice msg limits
 - v0.36 - bot command fixes and adjustments
 
@@ -5,10 +5,9 @@
 # https://github.com/FlyingFathead/TelegramBot-OpenAI-API
 #
 # version of this program
-version_number = "0.38"
+version_number = "0.39"
 
-# test modules
-# import aiohttp  # For asynchronous HTTP requests
+# experimental modules
 import requests
 
 # main modules
@@ -41,6 +40,7 @@
 from api_key import get_api_key
 import bot_commands
 import utils
+from modules import count_tokens, read_total_token_usage, write_total_token_usage, markdown_to_html
 
 # Call the startup message function
 utils.print_startup_message(version_number)
@@ -155,34 +155,16 @@ def check_global_rate_limit(self):
 
     # count token usage
     def count_tokens(self, text):
-        return len(tokenizer.encode(text))
-
+        return count_tokens(text, tokenizer)
+    
     # read and write token usage
     # detect date changes and reset token counter accordingly
     def read_total_token_usage(self):
-        try:
-            with open(self.token_usage_file, 'r') as file:
-                data = json.load(file)
-            current_date = datetime.datetime.utcnow().strftime('%Y-%m-%d')
-            # Return the usage for the current date, or 0 if not present
-            return data.get(current_date, 0)
-        except (FileNotFoundError, json.JSONDecodeError):
-            # If the file doesn't exist or is invalid, return 0
-            return 0
+        return read_total_token_usage(self.token_usage_file)
 
     # write latest token count data
     def write_total_token_usage(self, usage):
-        try:
-            with open(self.token_usage_file, 'r') as file:
-                data = json.load(file)
-        except (FileNotFoundError, json.JSONDecodeError):
-            data = {}  # Initialize a new dictionary if the file doesn't exist or is invalid
-
-        current_date = datetime.datetime.utcnow().strftime('%Y-%m-%d')
-        data[current_date] = usage  # Update the current date's usage
-
-        with open(self.token_usage_file, 'w') as file:
-            json.dump(data, file)
+        write_total_token_usage(self.token_usage_file, usage)
 
     # logging functionality
     def log_message(self, message_type, user_id, message):
@@ -246,34 +228,6 @@ def retrieve_chat_history(self):
     def split_large_messages(self, message, max_length=4096):
         return [message[i:i+max_length] for i in range(0, len(message), max_length)]
 
-    # convert markdowns to html
-    def markdown_to_html(self, text):
-        # Escape HTML special characters
-        text = (text.replace('&', '&amp;')
-                    .replace('<', '&lt;')
-                    .replace('>', '&gt;')
-                    .replace('"', '&quot;'))
-
-        # Convert markdown code blocks to HTML <pre> tags
-        text = re.sub(r'```(.*?)```', r'<pre>\1</pre>', text, flags=re.DOTALL)
-
-        # Convert markdown inline code to HTML <code> tags
-        text = re.sub(r'`(.*?)`', r'<code>\1</code>', text)
-
-        # Convert bold text using markdown syntax to HTML <b> tags
-        text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)
-
-        # Convert italic text using markdown syntax to HTML <i> tags
-        # The regex here is looking for a standalone asterisk or underscore that could denote italics
-        # It's also making sure that it doesn't capture bold syntax by checking that an asterisk or underscore is not followed or preceded by another asterisk or underscore
-        text = re.sub(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)', r'<i>\1</i>', text)
-        text = re.sub(r'(?<!_)_(?!_)(.+?)(?<!_)_(?!_)', r'<i>\1</i>', text)
-
-        # Convert [text](url) to clickable links
-        text = re.sub(r'\[(.*?)\]\((https?://\S+)\)', r'<a href="\2">\1</a>', text)
-
-        return text
-
 # ~~~~~~~~~~~~~~~~~~~~~
 # voice message handler
 # ~~~~~~~~~~~~~~~~~~~~~
@@ -524,9 +478,8 @@ async def handle_message(self, update: Update, context: CallbackContext) -> None
                     context.chat_data['chat_history'] = chat_history
 
                     print("Reply message before escaping:", bot_reply, flush=True)
-                    # escaped_reply = escape_markdown(bot_reply, version=2)
-                    # escaped_reply = escape_markdown_v2(bot_reply)
-                    escaped_reply = self.markdown_to_html(bot_reply)
+
+                    escaped_reply = markdown_to_html(bot_reply)
                     print("Reply message after escaping:", escaped_reply, flush=True)
 
                     # Log the bot's response
 
@@ -0,0 +1,72 @@
+# modules.py
+import json
+import datetime
+from transformers import GPT2Tokenizer
+import re
+
+# count tokens
+def count_tokens(text, tokenizer):
+    """Return how many tokens `tokenizer` produces for `text`.
+
+    `tokenizer` is any object whose `encode(text)` returns a sized
+    sequence; the result is the length of that sequence.
+    """
+    token_ids = tokenizer.encode(text)
+    return len(token_ids)
+
+# read total token usage
+def read_total_token_usage(token_usage_file):
+    """Return the token usage recorded for the current UTC date.
+
+    `token_usage_file` is the path of a JSON file holding an object that
+    maps 'YYYY-MM-DD' date strings to usage values.
+
+    Returns 0 when the file is missing, is not valid JSON, does not hold a
+    JSON object, or has no entry for today's UTC date.
+    """
+    try:
+        with open(token_usage_file, 'r') as file:
+            data = json.load(file)
+    except (FileNotFoundError, json.JSONDecodeError):
+        # If the file doesn't exist or is invalid, return 0
+        return 0
+
+    # Valid JSON that is not an object (e.g. a bare number or a list) has no
+    # per-date entries; treat it like an invalid file instead of crashing
+    if not isinstance(data, dict):
+        return 0
+
+    # Aware UTC "now"; yields the same date string as the deprecated utcnow()
+    current_date = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%d')
+    # Return the usage for the current date, or 0 if not present
+    return data.get(current_date, 0)
+
+# write latest token count data
+def write_total_token_usage(token_usage_file, usage):
+    """Store `usage` as the token usage for the current UTC date.
+
+    The JSON object in `token_usage_file` (date string -> usage) is loaded,
+    today's entry is set to `usage`, and the whole object is written back.
+    Entries for other dates are kept. A missing file, invalid JSON, or JSON
+    that is not an object is replaced by a fresh record.
+    """
+    try:
+        with open(token_usage_file, 'r') as file:
+            data = json.load(file)
+    except (FileNotFoundError, json.JSONDecodeError):
+        data = {}  # Initialize a new dictionary if the file doesn't exist or is invalid
+
+    # Valid JSON that is not an object (e.g. a bare number or a list) cannot
+    # take a per-date entry; start over instead of raising TypeError
+    if not isinstance(data, dict):
+        data = {}
+
+    # Aware UTC "now"; yields the same date string as the deprecated utcnow()
+    current_date = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%d')
+    data[current_date] = usage  # Update the current date's usage
+
+    with open(token_usage_file, 'w') as file:
+        json.dump(data, file)
+
+# convert markdowns to html
+def escape_html(text):
+    """Return `text` with '&', '<', '>' and '"' replaced by HTML entities."""
+    # '&' has to go first so the entities added afterwards are not re-escaped
+    replacements = (
+        ('&', '&amp;'),
+        ('<', '&lt;'),
+        ('>', '&gt;'),
+        ('"', '&quot;'),
+    )
+    for raw, entity in replacements:
+        text = text.replace(raw, entity)
+    return text
+
+def markdown_to_html(text):
+    """Convert a subset of Markdown in `text` to HTML markup.
+
+    The text is split on triple-backtick fences. Fenced code becomes
+    `<pre><code class="LANG">...</code></pre>`, where LANG is the word that
+    directly follows the opening fence and is itself followed by whitespace
+    (empty when there is none). Everything else is HTML-escaped and then
+    inline code, `**bold**`, `*italic*` / `_italic_` and `[text](http...)`
+    links are turned into tags.
+
+    Returns the reassembled HTML string.
+    """
+    # The capturing group makes re.split keep the fenced blocks in the result
+    parts = re.split(r'(```.*?```)', text, flags=re.DOTALL)
+    for i, part in enumerate(parts):
+        if part.startswith('```'):
+            # Anchored match: consume the opening fence plus, when present,
+            # the language hint and the single whitespace character after it.
+            # Without whitespace after the fence only the backticks are
+            # consumed, so "```foo```" is code "foo" with no language.
+            fence = re.match(r'```(?:(\w+)?\s)?', part)
+            language = fence.group(1) or ''
+            # Drop the closing fence and any trailing whitespace
+            code_content = part[fence.end():].rstrip('`').rstrip()
+            # Code has to be escaped as well, otherwise '<', '>' and '&'
+            # inside it would be emitted as raw markup
+            code_content = escape_html(code_content)
+            # NOTE(review): Telegram's HTML style documents the class as
+            # "language-<name>" - confirm whether a bare name is honoured
+            parts[i] = f'<pre><code class="{language}">{code_content}</code></pre>'
+            continue
+
+        # Non-code text: escape first, then add tags; the order of the
+        # substitutions below matters (bold before italic)
+        part = escape_html(part)
+        part = re.sub(r'`(.*?)`', r'<code>\1</code>', part)
+        part = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', part)
+        # Single '*' or '_' pairs that are not part of a doubled marker
+        part = re.sub(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)', r'<i>\1</i>', part)
+        part = re.sub(r'(?<!_)_(?!_)(.+?)(?<!_)_(?!_)', r'<i>\1</i>', part)
+        part = re.sub(r'\[(.*?)\]\((https?://\S+)\)', r'<a href="\2">\1</a>', part)
+        parts[i] = part
+
+    # Reassemble the parts into the final HTML
+    return ''.join(parts)