Skip to content

Commit 5af199a

Browse files
committed
v0.39 - better parsing for codeblocks, html and other markups, modularized more; see `modules.py`
1 parent 31cb074 commit 5af199a

File tree

7 files changed

+82
-56
lines changed

7 files changed

+82
-56
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ Other requirements:
3131
- Use `configmerger.py` to update old configuration files into a newer version's `config.ini`. You can do this by copying your existing config into, e.g., a file named `myconfig.txt` and including in it the lines you want to keep for the newer version. Then, just run `python configmerger.py config.ini myconfig.txt` and all your existing config lines will be migrated to the new one. Works in most cases, but remember to be careful and double-check any migration issues with, e.g., `diff`!
3232

3333
# Changelog
34+
- v0.39 - better parsing for codeblocks, html and other markups, modularized more; see `modules.py`
3435
- v0.38 - keep better record of daily token usage, streamlined (**note**: you will need to clear out your existing `token_usage.json`, the file structure has changed from the previous version)
3536
- v0.37 - better enforcing of voice msg limits
3637
- v0.36 - bot command fixes and adjustments

api_key.py

100755100644
File mode changed.

bot_commands.py

100755100644
File mode changed.

bot_token.py

100755100644
File mode changed.

main.py

Lines changed: 9 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,9 @@
55
# https://github.com/FlyingFathead/TelegramBot-OpenAI-API
66
#
77
# version of this program
8-
version_number = "0.38"
8+
version_number = "0.39"
99

10-
# test modules
11-
# import aiohttp # For asynchronous HTTP requests
10+
# experimental modules
1211
import requests
1312

1413
# main modules
@@ -41,6 +40,7 @@
4140
from api_key import get_api_key
4241
import bot_commands
4342
import utils
43+
from modules import count_tokens, read_total_token_usage, write_total_token_usage, markdown_to_html
4444

4545
# Call the startup message function
4646
utils.print_startup_message(version_number)
@@ -155,34 +155,16 @@ def check_global_rate_limit(self):
155155

156156
# count token usage
157157
def count_tokens(self, text):
158-
return len(tokenizer.encode(text))
159-
158+
return count_tokens(text, tokenizer)
159+
160160
# read and write token usage
161161
# detect date changes and reset token counter accordingly
162162
def read_total_token_usage(self):
163-
try:
164-
with open(self.token_usage_file, 'r') as file:
165-
data = json.load(file)
166-
current_date = datetime.datetime.utcnow().strftime('%Y-%m-%d')
167-
# Return the usage for the current date, or 0 if not present
168-
return data.get(current_date, 0)
169-
except (FileNotFoundError, json.JSONDecodeError):
170-
# If the file doesn't exist or is invalid, return 0
171-
return 0
163+
return read_total_token_usage(self.token_usage_file)
172164

173165
# write latest token count data
174166
def write_total_token_usage(self, usage):
175-
try:
176-
with open(self.token_usage_file, 'r') as file:
177-
data = json.load(file)
178-
except (FileNotFoundError, json.JSONDecodeError):
179-
data = {} # Initialize a new dictionary if the file doesn't exist or is invalid
180-
181-
current_date = datetime.datetime.utcnow().strftime('%Y-%m-%d')
182-
data[current_date] = usage # Update the current date's usage
183-
184-
with open(self.token_usage_file, 'w') as file:
185-
json.dump(data, file)
167+
write_total_token_usage(self.token_usage_file, usage)
186168

187169
# logging functionality
188170
def log_message(self, message_type, user_id, message):
@@ -246,34 +228,6 @@ def retrieve_chat_history(self):
246228
def split_large_messages(self, message, max_length=4096):
247229
return [message[i:i+max_length] for i in range(0, len(message), max_length)]
248230

249-
# convert markdowns to html
250-
def markdown_to_html(self, text):
251-
# Escape HTML special characters
252-
text = (text.replace('&', '&')
253-
.replace('<', '&lt;')
254-
.replace('>', '&gt;')
255-
.replace('"', '&quot;'))
256-
257-
# Convert markdown code blocks to HTML <pre> tags
258-
text = re.sub(r'```(.*?)```', r'<pre>\1</pre>', text, flags=re.DOTALL)
259-
260-
# Convert markdown inline code to HTML <code> tags
261-
text = re.sub(r'`(.*?)`', r'<code>\1</code>', text)
262-
263-
# Convert bold text using markdown syntax to HTML <b> tags
264-
text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)
265-
266-
# Convert italic text using markdown syntax to HTML <i> tags
267-
# The regex here is looking for a standalone asterisk or underscore that could denote italics
268-
# It's also making sure that it doesn't capture bold syntax by checking that an asterisk or underscore is not followed or preceded by another asterisk or underscore
269-
text = re.sub(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)', r'<i>\1</i>', text)
270-
text = re.sub(r'(?<!_)_(?!_)(.+?)(?<!_)_(?!_)', r'<i>\1</i>', text)
271-
272-
# Convert [text](url) to clickable links
273-
text = re.sub(r'\[(.*?)\]\((https?://\S+)\)', r'<a href="\2">\1</a>', text)
274-
275-
return text
276-
277231
# ~~~~~~~~~~~~~~~~~~~~~
278232
# voice message handler
279233
# ~~~~~~~~~~~~~~~~~~~~~
@@ -524,9 +478,8 @@ async def handle_message(self, update: Update, context: CallbackContext) -> None
524478
context.chat_data['chat_history'] = chat_history
525479

526480
print("Reply message before escaping:", bot_reply, flush=True)
527-
# escaped_reply = escape_markdown(bot_reply, version=2)
528-
# escaped_reply = escape_markdown_v2(bot_reply)
529-
escaped_reply = self.markdown_to_html(bot_reply)
481+
482+
escaped_reply = markdown_to_html(bot_reply)
530483
print("Reply message after escaping:", escaped_reply, flush=True)
531484

532485
# Log the bot's response

modules.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# modules.py
2+
import json
3+
import datetime
4+
from transformers import GPT2Tokenizer
5+
import re
6+
7+
# count tokens
8+
def count_tokens(text, tokenizer):
    """Return the number of tokens `tokenizer` produces for `text`.

    Args:
        text: The string to measure.
        tokenizer: Any object exposing ``encode(text)`` that returns a
            sized sequence of tokens.

    Returns:
        The length of the encoded token sequence.
    """
    token_ids = tokenizer.encode(text)
    return len(token_ids)
10+
11+
# read total token usage
12+
def read_total_token_usage(token_usage_file):
13+
try:
14+
with open(token_usage_file, 'r') as file:
15+
data = json.load(file)
16+
current_date = datetime.datetime.utcnow().strftime('%Y-%m-%d')
17+
# Return the usage for the current date, or 0 if not present
18+
return data.get(current_date, 0)
19+
except (FileNotFoundError, json.JSONDecodeError):
20+
# If the file doesn't exist or is invalid, return 0
21+
return 0
22+
23+
# write latest token count data
24+
def write_total_token_usage(token_usage_file, usage):
25+
try:
26+
with open(token_usage_file, 'r') as file:
27+
data = json.load(file)
28+
except (FileNotFoundError, json.JSONDecodeError):
29+
data = {} # Initialize a new dictionary if the file doesn't exist or is invalid
30+
31+
current_date = datetime.datetime.utcnow().strftime('%Y-%m-%d')
32+
data[current_date] = usage # Update the current date's usage
33+
34+
with open(token_usage_file, 'w') as file:
35+
json.dump(data, file)
36+
37+
# convert markdowns to html
38+
def escape_html(text):
39+
# Escape HTML special characters
40+
return (text.replace('&', '&amp;')
41+
.replace('<', '&lt;')
42+
.replace('>', '&gt;')
43+
.replace('"', '&quot;'))
44+
45+
def markdown_to_html(text):
46+
# Split the text into code blocks and other parts
47+
parts = re.split(r'(```.*?```)', text, flags=re.DOTALL)
48+
for i, part in enumerate(parts):
49+
# Only process non-code blocks
50+
if not part.startswith('```'):
51+
part = escape_html(part)
52+
part = re.sub(r'`(.*?)`', r'<code>\1</code>', part)
53+
part = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', part)
54+
part = re.sub(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)', r'<i>\1</i>', part)
55+
part = re.sub(r'(?<!_)_(?!_)(.+?)(?<!_)_(?!_)', r'<i>\1</i>', part)
56+
part = re.sub(r'\[(.*?)\]\((https?://\S+)\)', r'<a href="\2">\1</a>', part)
57+
parts[i] = part
58+
else:
59+
# For code blocks, extract the language hint (if any)
60+
language_match = re.match(r'```(\w+)\s', part)
61+
language = language_match.group(1) if language_match else ''
62+
# Remove the language hint and backticks from the actual code
63+
code_content = re.sub(r'```(\w+)?\s', '', part, count=1)
64+
code_content = code_content.rstrip('`').rstrip()
65+
# Wrap the code with <pre> and <code>
66+
parts[i] = f'<pre><code class="{language}">{code_content}</code></pre>'
67+
68+
# Reassemble the parts into the final HTML, removing extra newlines after code blocks
69+
# text = ''.join(parts).replace('</pre>\n\n', '</pre>\n')
70+
# return text
71+
# Reassemble the parts into the final HTML
72+
return ''.join(parts)

utils.py

100755100644
File mode changed.

0 commit comments

Comments
 (0)