Temporarily encode <a href ...>...</a>
into something else, remove all other tags then restore the <a>
tags:
// Swap every <a ...> / </a> for NUL / SOH sentinels, strip all remaining tags,
// then turn the sentinels back into anchor tags.
// - Every pattern carries the `g` flag so ALL occurrences are handled, not just the first.
// - `(\s[^>]*)?` limits the match to real <a> tags (not <abbr>, <article>, ...) and
//   lets attributes span several lines.
// - SOH is written as \x01 because the legacy octal escape '\1' is a syntax error
//   in strict mode and in ES modules.
string.
replace(/<a(\s[^>]*)?>/gi,'\0$1\0').
replace(/<\/a\s*>/gi,'\x01').
replace(/<[^>]*>/g,'').
replace(/\0([^\0]*)\0/g,'<a$1>').
replace(/\x01/g,'</a>');
In the code above I use the NUL and SOH characters (ASCII 0x00 and 0x01) as replacements for <a>
tags simply because it is highly unlikely that they would appear in strings. Feel free to replace them with any other character or sequence of characters that would not appear in your string.
From additional comments it appears you're operating in a browser, in which case the browser has already parsed the HTML for you into a nice DOM tree. Use DOM methods to walk the tree and process it the way you want:
/**
 * Escape the characters that are significant in HTML text and in
 * double-quoted attribute values.
 *
 * @param {string} text - Raw (already decoded) text.
 * @returns {string} The text with &, <, > and " replaced by entities.
 */
function escapeHTMLText (text) {
  return String(text).
    replace(/&/g, '&amp;').
    replace(/</g, '&lt;').
    replace(/>/g, '&gt;').
    replace(/"/g, '&quot;');
}

/**
 * Serialize a DOM subtree to "simple" HTML: only text and <a href="..."> links
 * are kept; every other tag is dropped while its text content is preserved.
 * The contents of SCRIPT and STYLE elements are skipped entirely.
 *
 * The tag of domNode itself is never emitted, only its descendants are.
 *
 * @param {Node|null} domNode - Element or text node to serialize.
 * @returns {string} HTML string containing escaped text and <a> tags only;
 *   the empty string for null/undefined or any other node type.
 */
function simpleHTML (domNode) {
  var ret = "";
  // getElementById() and friends return null when nothing matches.
  if (!domNode) {
    return ret;
  }
  if (domNode.nodeType === Node.ELEMENT_NODE) {
    var children = domNode.childNodes;
    for (var i = 0; i < children.length; i++) {
      var child = children[i];
      // Script and stylesheet sources are not document text.
      if (child.nodeName === 'SCRIPT' || child.nodeName === 'STYLE') {
        continue;
      }
      if (child.nodeName === 'A') {
        // The href is escaped so a quote or ampersand in the URL cannot
        // break out of the attribute value.
        ret += '<a href="' + escapeHTMLText(child.href) + '">' +
          simpleHTML(child) +
          '</a>';
      }
      else {
        ret += simpleHTML(child);
      }
    }
  }
  else if (domNode.nodeType === Node.TEXT_NODE) {
    // nodeValue is decoded text; re-encode it so that e.g. "&lt;b&gt;" in the
    // source does not come back as a live <b> tag in the output.
    ret += escapeHTMLText(domNode.nodeValue);
  }
  return ret;
}
// Example 1: simplify the whole document body.
var simpleDocument = simpleHTML(document.body);
// Example 2: simplify only the subtree of the element whose id is 'some_div'.
var simpleDiv = simpleHTML(document.getElementById('some_div'));
// Example 3: simplify an HTML string. The string is assigned to the innerHTML
// of a freshly created DIV (never inserted into the document here) so the
// browser parses it, and that DIV is then passed to simpleHTML.
// NOTE(review): original_string and simple_string are not declared in this
// snippet — presumably supplied/declared by the surrounding code; confirm.
var temp = document.createElement('DIV');
temp.innerHTML = original_string;
simple_string = simpleHTML(temp);
Post a Comment for "Extract Text And Links From HTML Using Regular Expressions"