1
+ /*
2
+ * HTML Parser By John Resig (ejohn.org)
3
+ * Original code by Erik Arvidsson, Mozilla Public License
4
+ * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
5
+ *
6
+ * // Use like so:
7
+ * htmlParser(htmlString, {
8
+ * start: function(tag, attrs, unary) {},
9
+ * end: function(tag) {},
10
+ * chars: function(text) {},
11
+ * comment: function(text) {}
12
+ * });
13
+ *
14
+ * // or to get an XML string:
15
+ * HTMLtoXML(htmlString);
16
+ *
17
+ * // or to get an XML DOM Document
18
+ * HTMLtoDOM(htmlString);
19
+ *
20
+ * // or to inject into an existing document/DOM node
21
+ * HTMLtoDOM(htmlString, document);
22
+ * HTMLtoDOM(htmlString, document.body);
23
+ *
24
+ */
25
+
26
+ ( function ( ) {
27
+
28
// Regular expressions for parsing tags and attributes.
// Whitespace inside a regex literal is significant, so the patterns are
// written without any padding between tokens.

// Start tag, anchored at the beginning of the remaining input:
//   group 1 = tag name
//   group 2 = raw attribute text (may be empty)
//   group 3 = "/" when the tag is written as self-closing, "" otherwise
var startTag = /^<(\w+)((?:\s+\w+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,
	// End tag: group 1 = tag name; anything between the name and ">" is ignored.
	endTag = /^<\/(\w+)[^>]*>/,
	// One attribute (global, meant to be applied repeatedly to group 2 above):
	//   group 1 = attribute name
	//   group 2 / 3 / 4 = value when double-quoted / single-quoted / unquoted
	attr = /(\w+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;
32
+
33
// Lookup tables keyed by tag or attribute name. Each is consulted with a plain
// property read (e.g. empty[tagName]) by the parser.

// Empty elements (never have content or an end tag) - HTML 4.01, plus "embed"
var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");

// Block elements - HTML 4.01
var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");

// Inline elements - HTML 4.01
var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");

// Elements that you can, intentionally, leave open (and which close
// themselves): opening one while the same element is the innermost open
// element implicitly closes the earlier one.
// The list entry is "option" (the element name), not "options".
var closeSelf = makeMap("colgroup,dd,dt,li,option,p,td,tfoot,th,thead,tr");

// Boolean attributes whose value is filled in with their own name when
// written without one, e.g. disabled -> disabled="disabled"
var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");

// Special elements (can contain anything; their content is not parsed as markup)
var special = makeMap("script,style");
51
+
52
/**
 * Tokenize an HTML string and report what is found through callbacks.
 *
 * The input is consumed from the front: comments, end tags and start tags are
 * recognised at the head of the remaining string; anything else up to the next
 * "<" is reported as text. A stack of open tag names is kept so that unclosed
 * elements can be closed implicitly.
 *
 * @param {string} html     Markup to parse.
 * @param {Object} handler  Callbacks; every one is optional:
 *   start(tagName, attrs, unary) - attrs is an array of
 *                                  {name, value, escaped} objects,
 *   end(tagName),
 *   chars(text),
 *   comment(text).
 * @throws {string} "Parse Error: <remaining input>" when a pass over the
 *   input consumes nothing (e.g. a "<" that starts no recognisable token, or
 *   a script/style element that is never closed).
 */
var htmlParser = this.htmlParser = function (html, handler) {
	var index, chars, match, closeRe, stack = [], last = html;

	// Name of the innermost open element, or undefined when nothing is open.
	stack.last = function () {
		return this[this.length - 1];
	};

	while (html) {
		// Assume text until one of the markup branches below consumes input.
		chars = true;

		// Make sure we're not in a script or style element
		if (!stack.last() || !special[stack.last()]) {

			// Comment
			// (prefix tests use substring/charAt so they do not scan the
			// whole remaining document on every iteration)
			if (html.substring(0, 4) === "<!--") {
				index = html.indexOf("-->");

				if (index >= 0) {
					if (handler.comment)
						handler.comment(html.substring(4, index));
					html = html.substring(index + 3);
					chars = false;
				}

			// End tag
			} else if (html.substring(0, 2) === "</") {
				match = html.match(endTag);

				if (match) {
					html = html.substring(match[0].length);
					parseEndTag(match[0], match[1]);
					chars = false;
				}

			// Start tag
			} else if (html.charAt(0) === "<") {
				match = html.match(startTag);

				if (match) {
					html = html.substring(match[0].length);
					parseStartTag(match[0], match[1], match[2], match[3]);
					chars = false;
				}
			}

			if (chars) {
				index = html.indexOf("<");

				var text = index < 0 ? html : html.substring(0, index);
				html = index < 0 ? "" : html.substring(index);

				if (handler.chars)
					handler.chars(text);
			}

		} else {
			// Inside <script>/<style>: everything up to the FIRST matching
			// close tag is raw text. The pattern is anchored and lazy, and
			// uses [\s\S] because "." does not match line terminators.
			closeRe = new RegExp("^([\\s\\S]*?)</" + stack.last() + "[^>]*>", "i");

			html = html.replace(closeRe, function (all, text) {
				// Unwrap comment / CDATA guards around the content.
				text = text.replace(/<!--([\s\S]*?)-->/g, "$1")
					.replace(/<!\[CDATA\[([\s\S]*?)]]>/g, "$1");

				if (handler.chars)
					handler.chars(text);

				return "";
			});

			parseEndTag("", stack.last());
		}

		// No progress was made in this pass: give up instead of looping forever.
		if (html == last)
			throw "Parse Error: " + html;
		last = html;
	}

	// Clean up any remaining tags
	parseEndTag();

	/**
	 * Handle a start tag.
	 * @param {string} tag      Full matched tag text (unused).
	 * @param {string} tagName  Element name.
	 * @param {string} rest     Raw attribute text.
	 * @param {string} unary    "/" when written as self-closing.
	 */
	function parseStartTag(tag, tagName, rest, unary) {
		// A block element implicitly closes any inline elements still open.
		if (block[tagName]) {
			while (stack.last() && inline[stack.last()]) {
				parseEndTag("", stack.last());
			}
		}

		// e.g. a second <li> closes the previous, still open <li>.
		if (closeSelf[tagName] && stack.last() == tagName) {
			parseEndTag("", tagName);
		}

		unary = empty[tagName] || !!unary;

		if (!unary)
			stack.push(tagName);

		if (handler.start) {
			var attrs = [];

			// replace() is used only to visit every attribute match; its
			// return value is discarded.
			rest.replace(attr, function (all, name, doubleQuoted, singleQuoted, unquoted) {
				var value = doubleQuoted ? doubleQuoted :
					singleQuoted ? singleQuoted :
					unquoted ? unquoted :
					fillAttrs[name] ? name : "";

				attrs.push({
					name: name,
					value: value,
					// value with unescaped double quotes backslash-escaped
					escaped: value.replace(/(^|[^\\])"/g, '$1\\\"')
				});
			});

			handler.start(tagName, attrs, unary);
		}
	}

	/**
	 * Handle an end tag. Called with no tagName, it closes every element
	 * that is still open.
	 * @param {string} [tag]      Full matched tag text (unused).
	 * @param {string} [tagName]  Element name to close.
	 */
	function parseEndTag(tag, tagName) {
		var pos, i;

		if (!tagName) {
			// If no tag name is provided, clean shop
			pos = 0;
		} else {
			// Find the closest opened tag of the same type
			for (pos = stack.length - 1; pos >= 0; pos--)
				if (stack[pos] == tagName)
					break;
		}

		// pos is -1 when the end tag matches nothing that is open; it is
		// then ignored.
		if (pos >= 0) {
			// Close all the open elements, up the stack
			for (i = stack.length - 1; i >= pos; i--)
				if (handler.end)
					handler.end(stack[i]);

			// Remove the open elements from the stack
			stack.length = pos;
		}
	}
};
187
+
188
/**
 * Convert an HTML string into a well-nested XML-style string by running it
 * through htmlParser and re-serialising every event.
 *
 * Attribute values are always double-quoted; a double quote inside a value
 * is written as &quot; because XML has no backslash escaping. Text and
 * comment content is copied through unchanged.
 *
 * @param {string} html  Markup to convert.
 * @returns {string} The re-serialised markup.
 */
this.HTMLtoXML = function (html) {
	var results = "";

	htmlParser(html, {
		start: function (tag, attrs, unary) {
			results += "<" + tag;

			for (var i = 0; i < attrs.length; i++)
				results += " " + attrs[i].name + '="' +
					attrs[i].value.replace(/"/g, "&quot;") + '"';

			results += (unary ? "/" : "") + ">";
		},
		end: function (tag) {
			results += "</" + tag + ">";
		},
		chars: function (text) {
			results += text;
		},
		comment: function (text) {
			results += "<!--" + text + "-->";
		}
	});

	return results;
};
213
+
214
/**
 * Parse an HTML string into DOM nodes.
 *
 * @param {string} html   Markup to parse.
 * @param {Object} [doc]  A document or a node. When a node is given its owner
 *   document is used. When omitted, a new document is created with whichever
 *   DOM implementation is available.
 * @returns {Object} The document the nodes were added to. Parsed content is
 *   appended under the document's body element, except that html/head/body/
 *   title tags in the input map onto the document's existing elements and
 *   link/base elements are placed in the head.
 * @throws {Error} When no document is supplied and none can be created.
 */
this.HTMLtoDOM = function (html, doc) {
	var hasOwn = Object.prototype.hasOwnProperty;

	// There can be only one of these elements
	var one = makeMap("html,head,body,title");

	// Enforce a structure for the document
	var structure = {
		link: "head",
		base: "head"
	};

	if (!doc) {
		if (typeof DOMDocument != "undefined")
			doc = new DOMDocument();
		else if (typeof document != "undefined" && document.implementation && document.implementation.createDocument)
			doc = document.implementation.createDocument("", "", null);
		// Test for the constructor that is actually used on the next line.
		else if (typeof ActiveXObject != "undefined")
			doc = new ActiveXObject("Msxml.DOMDocument");

	} else {
		doc = doc.ownerDocument ||
			doc.getOwnerDocument && doc.getOwnerDocument() ||
			doc;
	}

	if (!doc)
		throw new Error("HTMLtoDOM: no DOM implementation available to create a document");

	var elems = [],
		documentElement = doc.documentElement ||
			doc.getDocumentElement && doc.getDocumentElement();

	// If we're dealing with an empty document then we
	// need to pre-populate it with the HTML document structure
	if (!documentElement && doc.createElement) {
		var rootElem = doc.createElement("html");
		var headElem = doc.createElement("head");
		headElem.appendChild(doc.createElement("title"));
		rootElem.appendChild(headElem);
		rootElem.appendChild(doc.createElement("body"));
		doc.appendChild(rootElem);
	}

	// Find all the unique elements. Entries stay `true` when the document
	// has no getElementsByTagName, and become undefined when the document
	// has no such element.
	if (doc.getElementsByTagName)
		for (var key in one)
			if (hasOwn.call(one, key))
				one[key] = doc.getElementsByTagName(key)[0];

	// If we're working with a document, inject contents into
	// the body element
	var rootNode = one.body;
	var curParentNode = rootNode;

	htmlParser(html, {
		start: function (tagName, attrs, unary) {
			// If it's a pre-built element, then we can ignore
			// its construction
			var prebuilt = hasOwn.call(one, tagName) ? one[tagName] : null;
			if (prebuilt) {
				curParentNode = prebuilt;
				// The parser will report an end() for this tag too, so keep
				// elems in step with the parser's own stack.
				if (!unary)
					elems.push(prebuilt);
				return;
			}

			var elem = doc.createElement(tagName);

			for (var a = 0; a < attrs.length; a++)
				elem.setAttribute(attrs[a].name, attrs[a].value);

			// Elements with a fixed home (link/base -> head) go there when
			// that element exists; everything else goes under the current
			// parent.
			var forcedParent = hasOwn.call(structure, tagName) ? one[structure[tagName]] : null;

			if (forcedParent && forcedParent.appendChild)
				forcedParent.appendChild(elem);

			else if (curParentNode && curParentNode.appendChild)
				curParentNode.appendChild(elem);

			if (!unary) {
				elems.push(elem);
				curParentNode = elem;
			}
		},
		end: function (tag) {
			// Guard the pop: shrinking an empty array's length throws.
			if (elems.length)
				elems.length -= 1;

			// Init the new parentNode, falling back to the body once
			// every element has been closed.
			curParentNode = elems.length ? elems[elems.length - 1] : rootNode;
		},
		chars: function (text) {
			if (curParentNode && curParentNode.appendChild)
				curParentNode.appendChild(doc.createTextNode(text));
		},
		comment: function (text) {
			// Comments are currently dropped; no comment node is created.
		}
	});

	return doc;
};
302
+
303
/**
 * Build a lookup table from a comma-separated list of names.
 *
 * The table has no prototype, so only the listed names are ever truthy:
 * a lookup such as map["constructor"] or map["toString"] yields undefined
 * instead of an inherited Object.prototype member.
 *
 * @param {string} str  Comma-separated names, e.g. "script,style".
 * @returns {Object} Object mapping every listed name to true.
 */
function makeMap(str) {
	var obj = Object.create(null), items = str.split(",");
	for (var i = 0; i < items.length; i++)
		obj[items[i]] = true;
	return obj;
}
309
+ } ) ( ) ;
0 commit comments