Press n or j to go to the next uncovered block, b, p or k for the previous block.
import entities from './entities.js';

/**
 * Replacement table for numeric character references in the range 128-159.
 * Index `i` holds the code point emitted for code point `128 + i`
 * (for example 128 -> 8364, the euro sign). A few entries map to themselves.
 */
const windows_1252 = [
	8364, 129, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 141, 381, 143, 144, 8216,
	8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 157, 382, 376
];

/**
 * Builds the regular-expression source fragment that matches one named entity.
 *
 * @param {string} entity_name
 * @param {boolean} is_attribute_value
 * @returns {string}
 */
function reg_exp_entity(entity_name, is_attribute_value) {
	// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
	// Inside an attribute value, a named reference that does not end with `;` must not be
	// decoded when it is immediately followed by `=` or by a letter or digit. The fragment
	// therefore requires a word boundary after the name and forbids a following `=`.
	if (is_attribute_value && !entity_name.endsWith(';')) {
		return `${entity_name}\\b(?!=)`;
	}

	return entity_name;
}

/**
 * Builds the global regular expression that matches every character reference
 * (numeric or named) this module knows how to decode.
 *
 * @param {boolean} is_attribute_value
 * @returns {RegExp}
 */
function get_entity_pattern(is_attribute_value) {
	// Numeric references: `#` followed by decimal digits, or by `x`/`X` and hex digits.
	// Both cases of the hex prefix are accepted, and the trailing `;` is optional.
	const reg_exp_num = '#(?:[xX][a-fA-F\\d]+|\\d+)(?:;)?';
	const reg_exp_entities = Object.keys(entities).map(
		/** @param {any} entity_name */ (entity_name) => reg_exp_entity(entity_name, is_attribute_value)
	);

	return new RegExp(`&(${reg_exp_num}|${reg_exp_entities.join('|')})`, 'g');
}

// Both patterns are compiled once at module load and reused for every call.
const entity_pattern_content = get_entity_pattern(false);
const entity_pattern_attr_value = get_entity_pattern(true);

/**
 * Replaces the character references found in `html` with the characters they stand for.
 * References that cannot be resolved to a non-zero code point are left untouched.
 *
 * @param {string} html
 * @param {boolean} is_attribute_value Whether `html` is the text of an attribute value,
 *   which selects the stricter matching rules for named references without a trailing `;`
 * @returns {string}
 */
export function decode_character_references(html, is_attribute_value) {
	const entity_pattern = is_attribute_value ? entity_pattern_attr_value : entity_pattern_content;

	return html.replace(
		entity_pattern,
		/**
		 * @param {any} match
		 * @param {keyof typeof entities} entity
		 */
		(match, entity) => {
			let code;

			if (entity[0] !== '#') {
				// Named entity: look the code point up in the entity table
				code = entities[entity];
			} else if (entity[1] === 'x' || entity[1] === 'X') {
				// Hexadecimal numeric reference; parseInt ignores the optional trailing `;`
				code = parseInt(entity.substring(2), 16);
			} else {
				// Decimal numeric reference
				code = parseInt(entity.substring(1), 10);
			}

			// Unknown name, unparsable number or code point 0: keep the original text
			if (!code) {
				return match;
			}

			return String.fromCodePoint(validate_code(code));
		}
	);
}

const NUL = 0;

// some code points are verboten. If we were inserting HTML, the browser would replace the illegal
// code points with alternatives in some cases - since we're bypassing that mechanism, we need
// to replace them ourselves
//
// Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters

/**
 * Maps a referenced code point to the code point that should actually be emitted.
 *
 * @param {number} code
 * @returns {number} The code point to emit, or `NUL` when `code` is a surrogate half
 *   or lies beyond the last Unicode code point
 */
function validate_code(code) {
	// line feed becomes generic whitespace
	if (code === 10) {
		return 32;
	}

	// ASCII range. (Why someone would use HTML entities for ASCII characters I don't know, but...)
	if (code < 128) {
		return code;
	}

	// code points 128-159 are dealt with leniently by browsers, but they're incorrect. We need
	// to correct the mistake or we'll end up with missing € signs and so on
	if (code <= 159) {
		return windows_1252[code - 128];
	}

	// basic multilingual plane, below the surrogate range
	if (code < 0xd800) {
		return code;
	}

	// UTF-16 surrogate halves (0xD800 - 0xDFFF) are not valid on their own
	if (code <= 0xdfff) {
		return NUL;
	}

	// rest of the basic multilingual plane and every supplementary plane,
	// up to and including the last Unicode code point
	if (code <= 0x10ffff) {
		return code;
	}

	// beyond the Unicode range (this also covers Infinity from oversized digit runs)
	return NUL;
}