Skip to content

Commit 6cac15d

Browse files
authored
Fix source.match performance without specifying term string (#186)
Performance problem of `source.match(regexp)` was recently fixed by specifying terminator string. However, I think maintaining appropriate terminator string for a regexp is hard. I propose solving this performance issue by increasing bytes to read in each iteration.
1 parent 033d190 commit 6cac15d

File tree

2 files changed

+25
-23
lines changed

2 files changed

+25
-23
lines changed

lib/rexml/parsers/baseparser.rb

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -124,14 +124,6 @@ class BaseParser
124124
}
125125

126126
module Private
127-
# Terminal requires two or more letters.
128-
INSTRUCTION_TERM = "?>"
129-
COMMENT_TERM = "-->"
130-
CDATA_TERM = "]]>"
131-
DOCTYPE_TERM = "]>"
132-
# Read to the end of DOCTYPE because there is no proper ENTITY termination
133-
ENTITY_TERM = DOCTYPE_TERM
134-
135127
INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
136128
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
137129
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
@@ -253,7 +245,7 @@ def pull_event
253245
return process_instruction(start_position)
254246
elsif @source.match("<!", true)
255247
if @source.match("--", true)
256-
md = @source.match(/(.*?)-->/um, true, term: Private::COMMENT_TERM)
248+
md = @source.match(/(.*?)-->/um, true)
257249
if md.nil?
258250
raise REXML::ParseException.new("Unclosed comment", @source)
259251
end
@@ -320,7 +312,7 @@ def pull_event
320312
raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
321313
return [ :elementdecl, "<!ELEMENT" + md[1] ]
322314
elsif @source.match("ENTITY", true)
323-
match_data = @source.match(Private::ENTITYDECL_PATTERN, true, term: Private::ENTITY_TERM)
315+
match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
324316
unless match_data
325317
raise REXML::ParseException.new("Malformed entity declaration", @source)
326318
end
@@ -389,14 +381,14 @@ def pull_event
389381
raise REXML::ParseException.new(message, @source)
390382
end
391383
return [:notationdecl, name, *id]
392-
elsif md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
384+
elsif md = @source.match(/--(.*?)-->/um, true)
393385
case md[1]
394386
when /--/, /-\z/
395387
raise REXML::ParseException.new("Malformed comment", @source)
396388
end
397389
return [ :comment, md[1] ] if md
398390
end
399-
elsif match = @source.match(/(%.*?;)\s*/um, true, term: Private::DOCTYPE_TERM)
391+
elsif match = @source.match(/(%.*?;)\s*/um, true)
400392
return [ :externalentity, match[1] ]
401393
elsif @source.match(/\]\s*>/um, true)
402394
@document_status = :after_doctype
@@ -436,15 +428,15 @@ def pull_event
436428
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
437429
raise REXML::ParseException.new("Malformed node", @source) unless md
438430
if md[0][0] == ?-
439-
md = @source.match(/--(.*?)-->/um, true, term: Private::COMMENT_TERM)
431+
md = @source.match(/--(.*?)-->/um, true)
440432

441433
if md.nil? || /--|-\z/.match?(md[1])
442434
raise REXML::ParseException.new("Malformed comment", @source)
443435
end
444436

445437
return [ :comment, md[1] ]
446438
else
447-
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true, term: Private::CDATA_TERM)
439+
md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
448440
return [ :cdata, md[1] ] if md
449441
end
450442
raise REXML::ParseException.new( "Declarations can only occur "+
@@ -673,7 +665,7 @@ def parse_id_invalid_details(accept_external_id:,
673665
end
674666

675667
def process_instruction(start_position)
676-
match_data = @source.match(Private::INSTRUCTION_END, true, term: Private::INSTRUCTION_TERM)
668+
match_data = @source.match(Private::INSTRUCTION_END, true)
677669
unless match_data
678670
message = "Invalid processing instruction node"
679671
@source.position = start_position

lib/rexml/source.rb

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def read_until(term)
117117
def ensure_buffer
118118
end
119119

120-
def match(pattern, cons=false, term: nil)
120+
def match(pattern, cons=false)
121121
if cons
122122
@scanner.scan(pattern).nil? ? nil : @scanner
123123
else
@@ -204,10 +204,20 @@ def initialize(arg, block_size=500, encoding=nil)
204204
end
205205
end
206206

207-
def read(term = nil)
207+
def read(term = nil, min_bytes = 1)
208208
term = encode(term) if term
209209
begin
210-
@scanner << readline(term)
210+
str = readline(term)
211+
@scanner << str
212+
read_bytes = str.bytesize
213+
begin
214+
while read_bytes < min_bytes
215+
str = readline(term)
216+
@scanner << str
217+
read_bytes += str.bytesize
218+
end
219+
rescue IOError
220+
end
211221
true
212222
rescue Exception, NameError
213223
@source = nil
@@ -237,10 +247,9 @@ def ensure_buffer
237247
read if @scanner.eos? && @source
238248
end
239249

240-
# Note: When specifying a string for 'pattern', it must not include '>' except in the following formats:
241-
# - ">"
242-
# - "XXX>" (X is any string excluding '>')
243-
def match( pattern, cons=false, term: nil )
250+
def match( pattern, cons=false )
251+
# To avoid performance issue, we need to increase bytes to read per scan
252+
min_bytes = 1
244253
while true
245254
if cons
246255
md = @scanner.scan(pattern)
@@ -250,7 +259,8 @@ def match( pattern, cons=false, term: nil )
250259
break if md
251260
return nil if pattern.is_a?(String)
252261
return nil if @source.nil?
253-
return nil unless read(term)
262+
return nil unless read(nil, min_bytes)
263+
min_bytes *= 2
254264
end
255265

256266
md.nil? ? nil : @scanner

0 commit comments

Comments
 (0)