Skip to content

Commit b3dd749

Browse files
authored
Merge pull request #2932 from plotly/text-entities
improve pseudo-html entity conversion
2 parents 19f3800 + a943db2 commit b3dd749

File tree

4 files changed

+156
-96
lines changed

4 files changed

+156
-96
lines changed

src/constants/string_mappings.js

-41
This file was deleted.

src/lib/html2unicode.js

+1-23
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
'use strict';
1111

1212
var toSuperScript = require('superscript-text');
13-
var stringMappings = require('../constants/string_mappings');
13+
var fixEntities = require('./svg_text_utils').convertEntities;
1414

1515
function fixSuperScript(x) {
1616
var idx = 0;
@@ -33,28 +33,6 @@ function stripTags(x) {
3333
return x.replace(/\<.*\>/g, '');
3434
}
3535

36-
function fixEntities(x) {
37-
var entityToUnicode = stringMappings.entityToUnicode;
38-
var idx = 0;
39-
40-
while((idx = x.indexOf('&', idx)) >= 0) {
41-
var nidx = x.indexOf(';', idx);
42-
if(nidx < idx) {
43-
idx += 1;
44-
continue;
45-
}
46-
47-
var entity = entityToUnicode[x.slice(idx + 1, nidx)];
48-
if(entity) {
49-
x = x.slice(0, idx) + entity + x.slice(nidx + 1);
50-
} else {
51-
x = x.slice(0, idx) + x.slice(nidx + 1);
52-
}
53-
}
54-
55-
return x;
56-
}
57-
5836
function convertHTMLToUnicode(html) {
5937
return '' +
6038
fixEntities(

src/lib/svg_text_utils.js

+80-29
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ var d3 = require('d3');
1515

1616
var Lib = require('../lib');
1717
var xmlnsNamespaces = require('../constants/xmlns_namespaces');
18-
var stringMappings = require('../constants/string_mappings');
1918
var LINE_SPACING = require('../constants/alignment').LINE_SPACING;
2019

2120
// text converter
@@ -223,13 +222,6 @@ var PROTOCOLS = ['http:', 'https:', 'mailto:', '', undefined, ':'];
223222

224223
var STRIP_TAGS = new RegExp('</?(' + Object.keys(TAG_STYLES).join('|') + ')( [^>]*)?/?>', 'g');
225224

226-
var ENTITY_TO_UNICODE = Object.keys(stringMappings.entityToUnicode).map(function(k) {
227-
return {
228-
regExp: new RegExp('&' + k + ';', 'g'),
229-
sub: stringMappings.entityToUnicode[k]
230-
};
231-
});
232-
233225
var NEWLINES = /(\r\n?|\n)/g;
234226

235227
var SPLIT_TAGS = /(<[^<>]*>)/;
@@ -254,6 +246,14 @@ var BR_TAG = /<br(\s+.*)?>/i;
254246
*
255247
* Because we hack in other attributes with style (sub & sup), drop any trailing
256248
* semicolon in user-supplied styles so we can consistently append the tag-dependent style
249+
*
250+
* These are for tag attributes; Chrome anyway will convert entities in
251+
* attribute values, but not in attribute names
252+
* you can test this by for example:
253+
* > p = document.createElement('p')
254+
* > p.innerHTML = '<span styl&#x65;="font-color:r&#x65;d;">Hi</span>'
255+
* > p.innerHTML
256+
* <- '<span styl&#x65;="font-color:red;">Hi</span>'
257257
*/
258258
var STYLEMATCH = /(^|[\s"'])style\s*=\s*("([^"]*);?"|'([^']*);?')/i;
259259
var HREFMATCH = /(^|[\s"'])href\s*=\s*("([^"]*)"|'([^']*)')/i;
@@ -265,7 +265,8 @@ var POPUPMATCH = /(^|[\s"'])popup\s*=\s*("([\w=,]*)"|'([\w=,]*)')/i;
265265
function getQuotedMatch(_str, re) {
266266
if(!_str) return null;
267267
var match = _str.match(re);
268-
return match && (match[3] || match[4]);
268+
var result = match && (match[3] || match[4]);
269+
return result && convertEntities(result);
269270
}
270271

271272
var COLORMATCH = /(^|;)\s*color:/;
@@ -276,19 +277,70 @@ exports.plainText = function(_str) {
276277
return (_str || '').replace(STRIP_TAGS, ' ');
277278
};
278279

279-
function replaceFromMapObject(_str, list) {
280-
if(!_str) return '';
280+
/*
281+
* N.B. HTML entities are listed without the leading '&' and trailing ';'
282+
* https://www.freeformatter.com/html-entities.html
283+
*
284+
* FWIW if we wanted to support the full set, it has 2261 entries:
285+
* https://www.w3.org/TR/html5/entities.json
286+
* though I notice that some of these are duplicates and/or are missing ";"
287+
* eg: "&amp;", "&amp", "&AMP;", and "&AMP" all map to "&"
288+
* We no longer need to include numeric entities here, these are now handled
289+
* by String.fromCodePoint/fromCharCode
290+
*
291+
* Anyway the only ones that are really important to allow are the HTML special
292+
* chars <, >, and &, because these ones can trigger special processing if not
293+
* replaced by the corresponding entity.
294+
*/
295+
var entityToUnicode = {
296+
mu: 'μ',
297+
amp: '&',
298+
lt: '<',
299+
gt: '>',
300+
nbsp: ' ',
301+
times: '×',
302+
plusmn: '±',
303+
deg: '°'
304+
};
281305

282-
for(var i = 0; i < list.length; i++) {
283-
var item = list[i];
284-
_str = _str.replace(item.regExp, item.sub);
285-
}
306+
// NOTE: in general entities can contain uppercase too (so [a-zA-Z]) but all the
307+
// ones we support use only lowercase. If we ever change that, update the regex.
308+
var ENTITY_MATCH = /&(#\d+|#x[\da-fA-F]+|[a-z]+);/g;
309+
/**
 * Replace the HTML entities matched by ENTITY_MATCH with the characters
 * they stand for.
 *
 * Numeric entities (decimal `&#956;` or hex `&#x3bc;`) are decoded through
 * fromCodePoint; named entities are looked up in entityToUnicode.
 * As in regular HTML, anything we cannot decode is left in place as raw text.
 *
 * @param {string} _str - text possibly containing entities
 * @return {string} the text with all decodable entities converted;
 *   '' when _str is falsy
 */
function convertEntities(_str) {
    // the previous implementation returned '' for missing input
    // rather than throwing, keep that contract
    if(!_str) return '';

    return _str.replace(ENTITY_MATCH, function(fullMatch, innerMatch) {
        var outChar;
        if(innerMatch.charAt(0) === '#') {
            // cannot use String.fromCodePoint in IE
            outChar = fromCodePoint(
                innerMatch.charAt(1) === 'x' ?
                    parseInt(innerMatch.substr(2), 16) :
                    parseInt(innerMatch.substr(1), 10)
            );
        }
        // own properties only: a bare lookup would also find inherited
        // members, so '&constructor;' would be replaced by the source text
        // of the Object function
        else if(Object.prototype.hasOwnProperty.call(entityToUnicode, innerMatch)) {
            outChar = entityToUnicode[innerMatch];
        }

        // as in regular HTML, if we didn't decode the entity just
        // leave the raw text in place.
        return outChar || fullMatch;
    });
}
289-
290-
function convertEntities(_str) {
291-
return replaceFromMapObject(_str, ENTITY_TO_UNICODE);
327+
exports.convertEntities = convertEntities;
328+
329+
/**
 * Convert a Unicode code point to a string, working around the lack of
 * String.fromCodePoint in IE.
 *
 * @param {number} code - the code point to convert
 * @return {string|undefined} the corresponding character (a surrogate pair
 *   above 0xFFFF), or undefined when `code` is not an integer
 *   in the range 0..0x10FFFF
 */
function fromCodePoint(code) {
    // Don't allow overflow. In Chrome this turns into � but I feel like it's
    // more useful to just not convert it at all.
    // Also reject NaN, negative and fractional values: the native function
    // throws a RangeError for those while the fallback below would silently
    // produce an unrelated character, so bail out the same way in both cases.
    if(!(code >= 0 && code <= 0x10FFFF) || Math.floor(code) !== code) return;

    var stringFromCodePoint = String.fromCodePoint;
    if(stringFromCodePoint) return stringFromCodePoint(code);

    // IE doesn't have String.fromCodePoint
    // see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/fromCodePoint
    var stringFromCharCode = String.fromCharCode;
    if(code <= 0xFFFF) return stringFromCharCode(code);

    // build the UTF-16 surrogate pair by hand:
    // 0xD7C0 is 0xD800 - (0x10000 >> 10), so this is the high surrogate
    return stringFromCharCode(
        (code >> 10) + 0xD7C0,
        (code % 0x400) + 0xDC00
    );
}
293345

294346
/*
@@ -302,15 +354,14 @@ function convertEntities(_str) {
302354
* somewhat differently if it does, so just keep track of this when it happens.
303355
*/
304356
function buildSVGText(containerNode, str) {
305-
str = convertEntities(str)
306-
/*
307-
* Normalize behavior between IE and others wrt newlines and whitespace:pre
308-
* this combination makes IE barf https://github.com/plotly/plotly.js/issues/746
309-
* Chrome and FF display \n, \r, or \r\n as a space in this mode.
310-
* I feel like at some point we turned these into <br> but currently we don't so
311-
* I'm just going to cement what we do now in Chrome and FF
312-
*/
313-
.replace(NEWLINES, ' ');
357+
/*
358+
* Normalize behavior between IE and others wrt newlines and whitespace:pre
359+
* this combination makes IE barf https://github.com/plotly/plotly.js/issues/746
360+
* Chrome and FF display \n, \r, or \r\n as a space in this mode.
361+
* I feel like at some point we turned these into <br> but currently we don't so
362+
* I'm just going to cement what we do now in Chrome and FF
363+
*/
364+
str = str.replace(NEWLINES, ' ');
314365

315366
var hasLink = false;
316367

@@ -435,7 +486,7 @@ function buildSVGText(containerNode, str) {
435486
newLine();
436487
}
437488
else if(tagStyle === undefined) {
438-
addTextNode(currentNode, parti);
489+
addTextNode(currentNode, convertEntities(parti));
439490
}
440491
else {
441492
// tag - open or close

test/jasmine/tests/svg_text_utils_test.js

+75-3
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,16 @@ describe('svg+text utils', function() {
88

99
describe('convertToTspans', function() {
1010

11+
var stringFromCodePoint;
12+
13+
beforeAll(function() {
14+
stringFromCodePoint = String.fromCodePoint;
15+
});
16+
17+
afterEach(function() {
18+
String.fromCodePoint = stringFromCodePoint;
19+
});
20+
1121
function mockTextSVGElement(txt) {
1222
return d3.select('body')
1323
.append('svg')
@@ -300,16 +310,78 @@ describe('svg+text utils', function() {
300310
'100 &times; 20 &plusmn; 0.5 &deg;'
301311
);
302312

303-
expect(node.text()).toEqual('100μ & < 10 > 0  100 × 20 ± 0.5 °');
313+
expect(node.text()).toBe('100μ & < 10 > 0  100 × 20 ± 0.5 °');
304314
});
305315

306316
it('decodes some HTML entities in text (number case)', function() {
307317
var node = mockTextSVGElement(
308-
'100&#956; &#28; &#60; 10 &#62; 0 &#160;' +
318+
'100&#956; &#38; &#60; 10 &#62; 0 &#160;' +
309319
'100 &#215; 20 &#177; 0.5 &#176;'
310320
);
311321

312-
expect(node.text()).toEqual('100μ & < 10 > 0  100 × 20 ± 0.5 °');
322+
expect(node.text()).toBe('100μ & < 10 > 0  100 × 20 ± 0.5 °');
323+
});
324+
325+
it('decodes arbitrary decimal and hex number entities', function() {
326+
var i = 0;
327+
for(var n = 33; n < 0x10FFFF; n = Math.round(n * 1.03)) {
328+
var node = mockTextSVGElement(
329+
'&#x' + n.toString(16) +
330+
'; = &#' + n.toString() +
331+
'; = &#x' + n.toString(16).toUpperCase() + ';'
332+
);
333+
var char = String.fromCodePoint(n);
334+
expect(node.text()).toBe(char + ' = ' + char + ' = ' + char, n);
335+
i++;
336+
}
337+
// not really necessary to assert this, but we tested 355 characters,
338+
// weighted toward the low end but continuing all the way to the
339+
// end of the unicode definition
340+
expect(i).toBe(355);
341+
});
342+
343+
it('decodes arbitrary decimal and hex number entities (IE case)', function() {
344+
// IE does not have String.fromCodePoint
345+
String.fromCodePoint = undefined;
346+
expect(String.fromCodePoint).toBeUndefined();
347+
348+
var i = 0;
349+
for(var n = 33; n < 0x10FFFF; n = Math.round(n * 1.03)) {
350+
var node = mockTextSVGElement(
351+
'&#x' + n.toString(16) +
352+
'; = &#' + n.toString() +
353+
'; = &#x' + n.toString(16).toUpperCase() + ';'
354+
);
355+
var char = stringFromCodePoint(n);
356+
expect(node.text()).toBe(char + ' = ' + char + ' = ' + char, n);
357+
i++;
358+
}
359+
// not really necessary to assert this, but we tested 355 characters,
360+
// weighted toward the low end but continuing all the way to the
361+
// end of the unicode definition
362+
expect(i).toBe(355);
363+
});
364+
365+
it('does not decode entities prematurely', function() {
366+
var testCases = [
367+
'&lt;b>not bold</b&gt;',
368+
'<b&gt;not bold</b&gt;',
369+
'&lt;b>not bold&lt;/b>',
370+
'<b&gt;not bold&lt;/b>',
371+
'&lt;b&gt;not bold&lt;/b&gt;'
372+
];
373+
testCases.forEach(function(testCase) {
374+
var node = mockTextSVGElement(testCase);
375+
376+
expect(node.html()).toBe(
377+
'&lt;b&gt;not bold&lt;/b&gt;', testCase
378+
);
379+
});
380+
381+
var controlNode = mockTextSVGElement('<b>bold</b>');
382+
expect(controlNode.html()).toBe(
383+
'<tspan style="font-weight:bold">bold</tspan>'
384+
);
313385
});
314386

315387
it('supports superscript by itself', function() {

0 commit comments

Comments
 (0)