首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >Javascript: REGEX将所有相对Urls更改为绝对Urls

Javascript: REGEX将所有相对Urls更改为绝对Urls
EN

Stack Overflow用户
提问于 2011-09-25 16:54:18
回答 5查看 19.2K关注 0票数 12

我目前正在创建一个Node.js网络爬虫(web scraper)/代理,但我在解析源码脚本部分中的相对Urls时遇到了问题,我认为REGEX可以做到这一点。不过我还不知道该如何实现。

有什么我可以做的吗?

此外,我对一种更简单的方式也持开放态度,因为我对其他代理如何解析网站感到困惑。我认为大多数都只是美化的网站抓取器,可以读取网站的源码,并将所有链接/表单转发回代理。

EN

回答 5

Stack Overflow用户

回答已采纳

发布于 2011-09-25 17:45:20

高级HTML字符串替换函数

OP请注意(因为他要求提供这样一个函数):将base_url更改为您的代理的base URL,即可获得所需的结果。

下面将显示两个函数(使用指南包含在代码中)。为了完全理解函数的行为,请确保您没有跳过此答案的任何部分。

  • rel_to_abs(url) - 此函数返回绝对URL。当传入带有普遍受信任协议的绝对URL时,它将立即返回此URL。否则,根据base_url和函数参数生成绝对URL。能正确解析相对URL(../、./、.、//)。
  • replace_all_rel_by_abs(html) - 此函数将解析在HTML中具有重要意义的所有URL出现,例如CSS url()、链接和外部资源。有关已解析实例的完整列表,请参阅代码。有关清理来自外部源的HTML字符串(以便嵌入到document中)的调整实现,请参见相关答案。
  • 测试用例(在答案的底部):要测试函数的有效性,只需将bookmarklet粘贴到地址栏。

rel_to_abs - 解析相对URL

代码语言:javascript
复制
/**
 * Resolve a (possibly relative) URL against the current document location.
 *
 * Reads location.href, location.protocol and location.host.
 *
 * @param {String} url  URL as found in the markup
 * @return {String} url itself when it already uses a whitelisted scheme,
 *                  "" when it is empty or whitespace-only, otherwise an
 *                  absolute URL
 */
function rel_to_abs(url){
    /* Only accept commonly trusted protocols. Of the data: scheme only image
     * payloads are accepted. Exotic flavours (escaped slash, html-entitied
     * characters) are not supported to keep the function fast.
     * The ":" belongs to the scheme alternatives only: a data URI continues
     * with e.g. "base64," after the ";", never with a colon. */
    if(/^(?:(?:https?|file|ftps?|mailto|javascript):|data:image\/[^;]{2,9};)/i.test(url))
        return url; //Url is already absolute

    if(url.substring(0,2) == "//")
        return location.protocol + url; //Protocol-relative
    if(url.charAt(0) == "/")
        return location.protocol + "//" + location.host + url; //Root-relative
    if(/^\s*$/.test(url))
        return ""; //Empty = Return nothing

    /* The query string and fragment of the current page are not part of its
     * path and must not take part in the resolution */
    var page = location.href.replace(/[?#].*$/, "");

    /* Split the reference into its path and its own "?query#fragment" */
    var cut = url.search(/[?#]/);
    var path = cut < 0 ? url : url.substring(0, cut);
    var suffix = cut < 0 ? "" : url.substring(cut);
    var resolved;

    if(path == ""){
        /* "?query" and "#fragment" references point at the current page;
         * a bare fragment keeps the query string of the page */
        resolved = (url.charAt(0) == "#" ? location.href.replace(/#.*$/, "") : page) + suffix;
    } else {
        var schemeEnd = page.indexOf("://");
        var pathStart = page.indexOf("/", schemeEnd < 0 ? 0 : schemeEnd + 3);
        var root = pathStart < 0 ? page : page.substring(0, pathStart);
        /* Directory of the current page, always starting and ending with "/" */
        var dir = pathStart < 0 ? "/" : page.substring(pathStart, page.lastIndexOf("/") + 1);
        var segments = (dir + path).split("/");
        var kept = [];
        for(var i=0; i<segments.length; i++){
            var isLast = i == segments.length - 1;
            if(segments[i] == ".."){
                /* kept[0] is the empty segment before the leading "/":
                 * never climb above the root */
                if(kept.length > 1) kept.pop();
                if(isLast) kept.push(""); //Keep the trailing slash
            } else if(segments[i] == "."){
                if(isLast) kept.push("");
            } else kept.push(segments[i]);
        }
        resolved = root + kept.join("/") + suffix;
    }

    /* Escape certain characters to prevent XSS */
    return resolved.replace(/"/g,"%22").replace(/'/g,"%27")
                   .replace(/</g,"%3C").replace(/>/g,"%3E");
}

案例/示例:

  • http://foo.bar:已经是绝对URL,因此立即返回。
  • /doo:相对于根目录,返回当前根目录+所提供的相对URL。
  • ./meh:相对于当前目录。
  • ../booh:相对于父目录。

该函数将相对路径转换为以../开头的形式,并执行搜索和替换(将http://domain/sub/anything-but-a-slash/../me替换为http://domain/sub/me)。

replace_all_rel_by_abs - 转换所有相关的URL

脚本实例(<script>、事件处理程序)中的URL不会被替换,因为几乎不可能创建一个快速且安全的过滤器来解析JavaScript。

这个脚本里面有一些注释。正则表达式是动态创建的,因为单个RE可以具有3000个字符的大小。<meta http-equiv=refresh content=.. >可以通过各种方式进行混淆,因此RE的大小。

代码语言:javascript
复制
/*
  @name replace_all_rel_by_abs
  @description  Rewrites URL-carrying parts of a HTML string (meta refresh,
                href, src, object data, applet codebase, param movie value,
                url(...) in <style> blocks and style attributes) by passing
                each matched URL through rel_to_abs.
  @param String html  HTML source to process
  @return String      The processed HTML source
  */
function replace_all_rel_by_abs(html){
    /*HTML/XML Attribute may not be prefixed by these characters (common 
       attribute chars.  This list is not complete, but will be sufficient
       for this function (see http://www.w3.org/TR/REC-xml/#NT-NameChar).
       The class is prepended to every attribute name in cr() and cri(), so
       one character in front of the name is always part of the match. */
    var att = "[^-a-z0-9:._]";

    /* Terminator of a numeric entity: a ";" or a position that is not
       followed by a digit */
    var entityEnd = "(?:;|(?!\\d))";
    /* Maps a literal character to a RE source that matches the character
       itself or its numeric entities. ae() also stores its results here. */
    var ents = {" ":"(?:\\s|&nbsp;?|&#0*32"+entityEnd+"|&#x0*20"+entityEnd+")",
                "(":"(?:\\(|&#0*40"+entityEnd+"|&#x0*28"+entityEnd+")",
                ")":"(?:\\)|&#0*41"+entityEnd+"|&#x0*29"+entityEnd+")",
                ".":"(?:\\.|&#0*46"+entityEnd+"|&#x0*2e"+entityEnd+")"};
                /* Placeholders to filter obfuscations */
    /* Per-character cache of the RE sources built by ae() */
    var charMap = {};
    var s = ents[" "]+"*"; //Short-hand for common use: optional whitespace
    var any = "(?:[^>\"']*(?:\"[^\"]*\"|'[^']*'))*?[^>]*";
    /* ^ Important: Must be pre- and postfixed by < and >.
     *   This RE should match anything within a tag!  */

    /*
      @name ae
      @description  Converts a given string in a sequence of the original
                      input and the HTML entity. For every character an
                      alternation of the lower-case character, its decimal
                      entity and its hexadecimal entity is produced, plus the
                      entities of the upper-case character when it differs.
                      The literal character is inserted without RegExp
                      escaping; only the characters predefined in ents are
                      escaped.
      @param String string  String to convert
      @return String        RE source; also cached in ents[string]
      */
    function ae(string){
        var all_chars_lowercase = string.toLowerCase();
        if(ents[string]) return ents[string]; /* Predefined or cached */
        var all_chars_uppercase = string.toUpperCase();
        var RE_res = "";
        for(var i=0; i<string.length; i++){
            var char_lowercase = all_chars_lowercase.charAt(i);
            if(charMap[char_lowercase]){
                /* Pattern for this character has been built before */
                RE_res += charMap[char_lowercase];
                continue;
            }
            var char_uppercase = all_chars_uppercase.charAt(i);
            var RE_sub = [char_lowercase];
            RE_sub.push("&#0*" + char_lowercase.charCodeAt(0) + entityEnd);
            RE_sub.push("&#x0*" + char_lowercase.charCodeAt(0).toString(16) + entityEnd);
            if(char_lowercase != char_uppercase){
                /* Note: RE ignorecase flag has already been activated */
                RE_sub.push("&#0*" + char_uppercase.charCodeAt(0) + entityEnd);   
                RE_sub.push("&#x0*" + char_uppercase.charCodeAt(0).toString(16) + entityEnd);
            }
            RE_sub = "(?:" + RE_sub.join("|") + ")";
            RE_res += (charMap[char_lowercase] = RE_sub);
        }
        return(ents[string] = RE_res);
    }

    /*
      @name by
      @description  2nd argument for replace(). Keeps the prefix (group1) and
                      the suffix (group3) and passes the URL (group2) through
                      rel_to_abs.
      */
    function by(match, group1, group2, group3){
        /* Note that this function can also be used to remove links:
         * return group1 + "javascript://" + group3; */
        return group1 + rel_to_abs(group2) + group3;
    }
    /*
      @name by2
      @description  2nd argument for replace(). Parses relevant HTML entities:
                      entity forms of "/" and "." inside the URL are turned
                      into the literal characters before rel_to_abs is called.
      */
    var slashRE = new RegExp(ae("/"), 'g');
    var dotRE = new RegExp(ae("."), 'g');
    function by2(match, group1, group2, group3){
        /*Note that this function can also be used to remove links:
         * return group1 + "javascript://" + group3; */
        group2 = group2.replace(slashRE, "/").replace(dotRE, ".");
        return group1 + rel_to_abs(group2) + group3;
    }
    /*
      @name cr
      @description            Selects a HTML element and performs a
                                search-and-replace on attributes. The result
                                is written back to the html variable of the
                                enclosing function.
      @param String selector  HTML substring to match (RE source or RegExp)
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String marker    Optional RegExp-escaped; marks the prefix
      @param String delimiter Optional RegExp escaped; non-quote delimiters
      @param String end       Optional RegExp-escaped; forces the match to end
                              before an occurence of <end>
     */
    function cr(selector, attribute, marker, delimiter, end){
        if(typeof selector == "string") selector = new RegExp(selector, "gi");
        attribute = att + attribute;
        marker = typeof marker == "string" ? marker : "\\s*=\\s*";
        delimiter = typeof delimiter == "string" ? delimiter : "";
        /* With <end> the value group becomes lazy and <end> is captured as
           the third group; without it the third group is empty */
        end = typeof end == "string" ? "?)("+end : ")(";
        /* re1: double-quoted value, re2: single-quoted value, re3: unquoted */
        var re1 = new RegExp('('+attribute+marker+'")([^"'+delimiter+']+'+end+')', 'gi');
        var re2 = new RegExp("("+attribute+marker+"')([^'"+delimiter+"]+"+end+")", 'gi');
        var re3 = new RegExp('('+attribute+marker+')([^"\'][^\\s>'+delimiter+']*'+end+')', 'gi');
        html = html.replace(selector, function(match){
            return match.replace(re1, by).replace(re2, by).replace(re3, by);
        });
    }
    /* 
      @name cri
      @description            Selects an attribute of a HTML element, and
                                performs a search-and-replace on certain values.
                                The result is written back to the html variable
                                of the enclosing function.
      @param String selector  HTML element to match (RE source or RegExp)
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String front     RegExp-escaped; attribute value, prefix to match
      @param String flags     Optional RegExp flags, default "gi" (any
                                non-string value selects the default)
      @param String delimiter Optional RegExp-escaped; non-quote delimiters
      @param String end       Optional RegExp-escaped; forces the match to end
                                before an occurence of <end>
     */
    function cri(selector, attribute, front, flags, delimiter, end){
        if(typeof selector == "string") selector = new RegExp(selector, "gi");
        attribute = att + attribute;
        flags = typeof flags == "string" ? flags : "gi";
        /* re1/re2 capture the content of a double-/single-quoted attribute
           value, excluding the closing quote */
        var re1 = new RegExp('('+attribute+'\\s*=\\s*")([^"]*)', 'gi');
        var re2 = new RegExp("("+attribute+"\\s*=\\s*')([^']+)", 'gi');
        /* NOTE(review): at1/at2 require a quote character after the URL, but
           front is not followed by the matching opening quote and the value
           captured by re1 (re2) never contains a double (single) quote. These
           two patterns therefore look unable to match well-formed values such
           as content="0; url=x" or style="background:url('x')" - confirm. */
        var at1 = new RegExp('('+front+')([^"]+)(")', flags);
        var at2 = new RegExp("("+front+")([^']+)(')", flags);
        if(typeof delimiter == "string"){
            end = typeof end == "string" ? end : "";
            /* at3: unquoted value after <front>, ended by <delimiter>/<end> */
            var at3 = new RegExp("("+front+")([^\"'][^"+delimiter+"]*" + (end?"?)("+end+")":")()"), flags);
            var handleAttr = function(match, g1, g2){return g1+g2.replace(at1, by2).replace(at2, by2).replace(at3, by2)};
        } else {
            var handleAttr = function(match, g1, g2){return g1+g2.replace(at1, by2).replace(at2, by2)};
    }
        html = html.replace(selector, function(match){
             return match.replace(re1, handleAttr).replace(re2, handleAttr);
        });
    }

    /* <meta http-equiv=refresh content="  ; url= " > */
    cri("<meta"+any+att+"http-equiv\\s*=\\s*(?:\""+ae("refresh")+"\""+any+">|'"+ae("refresh")+"'"+any+">|"+ae("refresh")+"(?:"+ae(" ")+any+">|>))", "content", ae("url")+s+ae("=")+s, "i");

    cr("<"+any+att+"href\\s*="+any+">", "href"); /* Linked elements */
    cr("<"+any+att+"src\\s*="+any+">", "src"); /* Embedded elements */

    cr("<object"+any+att+"data\\s*="+any+">", "data"); /* <object data= > */
    cr("<applet"+any+att+"codebase\\s*="+any+">", "codebase"); /* <applet codebase= > */

    /* <param name=movie value= >*/
    cr("<param"+any+att+"name\\s*=\\s*(?:\""+ae("movie")+"\""+any+">|'"+ae("movie")+"'"+any+">|"+ae("movie")+"(?:"+ae(" ")+any+">|>))", "value");

    /* Inside <style> blocks "url" plays the role of the attribute name and
       the opening parenthesis the role of the "=" marker */
    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "url", "\\s*\\(\\s*", "", "\\s*\\)"); /* <style> */
    cri("<"+any+att+"style\\s*="+any+">", "style", ae("url")+s+ae("(")+s, 0, s+ae(")"), ae(")")); /*< style=" url(...) " > */
    return html;
}

以下是私有函数的简短摘要:

  • rel_to_abs(url) - 将相对/未知URL转换为绝对URL
  • replace_all_rel_by_abs(html) - 将HTML字符串中出现的所有相关URL替换为绝对URL。

1. ae - Any Entity - 返回一个用于处理HTML实体的RE模式。

2. by - replace by(替换为)- 这个短函数调用实际的URL替换(rel_to_abs)。这个函数可能会被调用数百次,甚至上千次。注意不要在此函数中添加速度较慢的算法(自定义时)。

3. cr - Create Replace -创建并执行搜索和替换。

例如:href="..." (在任何超文本标记语言标签内)。

4. cri - Create Replace Inline - 创建并执行搜索和替换。

示例:在HTML标记内的所有style属性中的url(..)

测试用例

打开任意页面,将以下bookmarklet粘贴到地址栏中:

代码语言:javascript
复制
javascript:void(function(){var loader=document.createElement("script");loader.src="http://rob.lekensteyn.nl/rel_to_abs.js";document.body.appendChild(loader);}());

注入的代码包含上面定义的两个函数,外加如下所示的测试用例。备注:测试用例不修改页面的HTML,而是在文本区域中显示解析结果(可选)。

代码语言:javascript
复制
/* Benchmark: convert the markup of the current document, report the elapsed
   time and, on confirmation, show the converted markup in a full-page
   textarea. The page itself is not modified by the conversion. */
var t = new Date().getTime();
var result = replace_all_rel_by_abs(document.documentElement.innerHTML);
if (confirm((new Date().getTime() - t) + " milliseconds to execute\n\nPut results in new textarea?")) {
    var txt = document.createElement("textarea");
    txt.style.cssText = "position:fixed;top:0;left:0;width:100%;height:99%";
    /* Double-click removes the textarea again */
    txt.ondblclick = function () {
        this.parentNode.removeChild(this);
    };
    txt.value = result;
    document.body.appendChild(txt);
}

另请参阅:

票数 44
EN

Stack Overflow用户

发布于 2012-07-12 19:12:41

将urls从相对urls转换为绝对urls的可靠方法是使用内置的url module

示例:

代码语言:javascript
复制
// url.resolve() belongs to Node's legacy URL API. The WHATWG URL class (a
// global in Node.js, also exported by the built-in `url` module) is the
// documented replacement: it resolves a relative reference against a base.
var resolved = new URL("../baz/qux.html", "http://www.example.org/foo/bar/").href;

>> gives 'http://www.example.org/foo/baz/qux.html' 
票数 2
EN

Stack Overflow用户

发布于 2014-11-05 21:53:56

这是当前线程中的Rob W answer "Advanced HTML string replacement functions",加上我为了让JSLint高兴而进行的一些代码重构。

我应该把它贴出来作为答案的评论,但我没有足够的名誉点。

代码语言:javascript
复制
/*jslint browser: true */
/*jslint regexp: true */
/*jslint unparam: true*/
/*jshint strict: false */

/**
 * convertRelToAbsUrl
 *
 * Resolves a (possibly relative) URL against the current document location.
 * Reads location.href, location.protocol and location.host.
 *
 * https://stackoverflow.com/a/7544757/1983903
 * 
 * @param  {String} url URL as found in the markup
 * @return {String} url itself when it already uses a whitelisted scheme,
 *                  '' when it is empty or whitespace-only, otherwise an
 *                  absolute URL
 */
function convertRelToAbsUrl(url) {
    var page = null,
        cut = null,
        path = null,
        suffix = null,
        schemeEnd = null,
        pathStart = null,
        root = null,
        dir = null,
        segments = null,
        kept = [],
        isLast = false,
        resolved = null,
        i = 0;

    // Only commonly trusted schemes count as absolute. The ':' belongs to the
    // scheme alternatives only: a data URI continues with e.g. 'base64,'
    // after the ';', never with a colon.
    if (/^(?:(?:https?|file|ftps?|mailto|javascript):|data:image\/[^;]{2,9};)/i.test(url)) {
        return url; // url is already absolute
    }
    if (url.substring(0, 2) === '//') {
        return location.protocol + url; // protocol-relative
    }
    if (url.charAt(0) === '/') {
        return location.protocol + '//' + location.host + url; // root-relative
    }
    if (/^\s*$/.test(url)) {
        return ''; // empty = return nothing
    }

    // The query string and fragment of the current page are not part of its
    // path and must not take part in the resolution.
    page = location.href.replace(/[?#].*$/, '');

    // Split the reference into its path and its own '?query#fragment'.
    cut = url.search(/[?#]/);
    path = cut < 0 ? url : url.substring(0, cut);
    suffix = cut < 0 ? '' : url.substring(cut);

    if (path === '') {
        // '?query' and '#fragment' references point at the current page;
        // a bare fragment keeps the query string of the page.
        resolved = (url.charAt(0) === '#' ? location.href.replace(/#.*$/, '') : page) + suffix;
    } else {
        schemeEnd = page.indexOf('://');
        pathStart = page.indexOf('/', schemeEnd < 0 ? 0 : schemeEnd + 3);
        root = pathStart < 0 ? page : page.substring(0, pathStart);
        // Directory of the current page, always starting and ending with '/'.
        dir = pathStart < 0 ? '/' : page.substring(pathStart, page.lastIndexOf('/') + 1);
        segments = (dir + path).split('/');

        for (i = 0; i < segments.length; i += 1) {
            isLast = i === segments.length - 1;
            if (segments[i] === '..') {
                // kept[0] is the empty segment before the leading '/':
                // never climb above the root.
                if (kept.length > 1) {
                    kept.pop();
                }
                if (isLast) {
                    kept.push(''); // keep the trailing slash
                }
            } else if (segments[i] === '.') {
                if (isLast) {
                    kept.push('');
                }
            } else {
                kept.push(segments[i]);
            }
        }
        resolved = root + kept.join('/') + suffix;
    }

    // Escape certain characters to prevent XSS.
    return resolved.replace(/"/g, '%22').replace(/'/g, '%27')
        .replace(/</g, '%3C').replace(/>/g, '%3E');
}

/**
 * convertAllRelativeToAbsoluteUrls
 *
 * Rewrites URL-carrying parts of a HTML string (meta refresh, href, src,
 * object data, applet codebase, param movie value, url(...) in <style>
 * blocks and style attributes) by passing each matched URL through
 * convertRelToAbsUrl.
 *
 * When the function is invoked with a receiver that provides a
 * convertRelToAbsUrl method (e.g. the global object of a sloppy-mode browser
 * script, or an explicit .call(obj, html)) that method is used; otherwise
 * the convertRelToAbsUrl function in scope is called directly, so the
 * function also works in strict mode and in modules.
 *
 * https://stackoverflow.com/a/7544757/1983903
 * 
 * @param  {String} html
 * @return {String} updated html
 */
function convertAllRelativeToAbsoluteUrls(html) {
    var me = this,
        // an attribute name may not be prefixed by one of these characters
        att = '[^-a-z0-9:._]',
        // terminator of a numeric entity: ';' or "not followed by a digit"
        entityEnd = '(?:;|(?!\\d))',
        // literal character -> RE source matching it or its entities;
        // ae() also caches its results here
        ents = {
            ' ' : '(?:\\s|&nbsp;?|&#0*32' + entityEnd + '|&#x0*20' + entityEnd + ')',
            '(' : '(?:\\(|&#0*40' + entityEnd + '|&#x0*28' + entityEnd + ')',
            ')' : '(?:\\)|&#0*41' + entityEnd + '|&#x0*29' + entityEnd + ')',
            '.' : '(?:\\.|&#0*46' + entityEnd + '|&#x0*2e' + entityEnd + ')'
        },
        charMap = {}, // per-character cache of ae()
        s = ents[' '] + '*', // short-hand for common use
        // must be pre- and postfixed by < and >; matches anything within a tag
        any = '(?:[^>\"\']*(?:\"[^\"]*\"|\'[^\']*\'))*?[^>]*',
        slashRE = null,
        dotRE = null;

    // Resolves one URL. `this` is undefined in strict-mode code and ES
    // modules, and the receiver of a plain call need not carry the method
    // (e.g. CommonJS), so fall back to the function in scope.
    function toAbs(url) {
        if (me && typeof me.convertRelToAbsUrl === 'function') {
            return me.convertRelToAbsUrl(url);
        }
        return convertRelToAbsUrl(url);
    }

    // Converts a string into a RE source that matches every character either
    // literally or as a decimal/hexadecimal entity (both letter cases).
    // The literal character is inserted without RegExp escaping.
    function ae(string) {
        var allCharsLowerCase = string.toLowerCase(),
            allCharsUpperCase = string.toUpperCase(),
            reRes = '',
            charLowerCase = null,
            charUpperCase = null,
            reSub = null,
            i = null;

        if (ents[string]) {
            return ents[string];
        }

        for (i = 0; i < string.length; i++) {
            charLowerCase = allCharsLowerCase.charAt(i);
            if (charMap[charLowerCase]) {
                reRes += charMap[charLowerCase];
                continue;
            }
            charUpperCase = allCharsUpperCase.charAt(i);
            reSub = [charLowerCase];
            reSub.push('&#0*' + charLowerCase.charCodeAt(0) + entityEnd);
            reSub.push('&#x0*' + charLowerCase.charCodeAt(0).toString(16) + entityEnd);

            if (charLowerCase !== charUpperCase) {
                reSub.push('&#0*' + charUpperCase.charCodeAt(0) + entityEnd);
                reSub.push('&#x0*' + charUpperCase.charCodeAt(0).toString(16) + entityEnd);
            }
            reSub = '(?:' + reSub.join('|') + ')';
            reRes += (charMap[charLowerCase] = reSub);
        }
        return (ents[string] = reRes);
    }

    // replace() callback: keeps prefix and suffix, resolves the URL in between
    function by(match, group1, group2, group3) {
        return group1 + toAbs(group2) + group3;
    }

    slashRE = new RegExp(ae('/'), 'g');
    dotRE = new RegExp(ae('.'), 'g');

    // replace() callback: like by(), but first turns entity forms of '/' and
    // '.' inside the URL into the literal characters
    function by2(match, group1, group2, group3) {
        group2 = group2.replace(slashRE, '/').replace(dotRE, '.');
        return group1 + toAbs(group2) + group3;
    }

    // Selects HTML substrings with `selector` and rewrites the value that
    // follows `attribute` + `marker`; the result is written back to html.
    function cr(selector, attribute, marker, delimiter, end) {
        var re1 = null,
            re2 = null,
            re3 = null;

        if (typeof selector === 'string') {
            selector = new RegExp(selector, 'gi');
        }

        attribute = att + attribute;
        marker = typeof marker === 'string' ? marker : '\\s*=\\s*';
        delimiter = typeof delimiter === 'string' ? delimiter : '';
        end = typeof end === 'string' ? '?)(' + end : ')(';

        // re1: double-quoted value, re2: single-quoted value, re3: unquoted
        re1 = new RegExp('(' + attribute + marker + '")([^"' + delimiter + ']+' + end + ')', 'gi');
        re2 = new RegExp('(' + attribute + marker + '\')([^\'' + delimiter + ']+' + end + ')', 'gi');
        re3 = new RegExp('(' + attribute + marker + ')([^"\'][^\\s>' + delimiter + ']*' + end + ')', 'gi');

        html = html.replace(selector, function (match) {
            return match.replace(re1, by).replace(re2, by).replace(re3, by);
        });
    }

    // Selects an attribute of a HTML element and rewrites the URLs that
    // follow `front` inside its value; the result is written back to html.
    function cri(selector, attribute, front, flags, delimiter, end) {
        var re1 = null,
            re2 = null,
            at1 = null,
            at2 = null,
            at3 = null,
            handleAttr = null;

        if (typeof selector === 'string') {
            selector = new RegExp(selector, 'gi');
        }

        attribute = att + attribute;
        flags = typeof flags === 'string' ? flags : 'gi';
        re1 = new RegExp('(' + attribute + '\\s*=\\s*")([^"]*)', 'gi');
        re2 = new RegExp("(" + attribute + "\\s*=\\s*')([^']+)", 'gi');
        at1 = new RegExp('(' + front + ')([^"]+)(")', flags);
        at2 = new RegExp("(" + front + ")([^']+)(')", flags);

        if (typeof delimiter === 'string') {
            end = typeof end === 'string' ? end : '';
            at3 = new RegExp('(' + front + ')([^\"\'][^' + delimiter + ']*' + (end ? '?)(' + end + ')' : ')()'), flags);
            handleAttr = function (match, g1, g2) {
                return g1 + g2.replace(at1, by2).replace(at2, by2).replace(at3, by2);
            };
        } else {
            handleAttr = function (match, g1, g2) {
                return g1 + g2.replace(at1, by2).replace(at2, by2);
            };
        }
        html = html.replace(selector, function (match) {
            return match.replace(re1, handleAttr).replace(re2, handleAttr);
        });
    }

    /* <meta http-equiv=refresh content="  ; url= " > */
    cri('<meta' + any + att + 'http-equiv\\s*=\\s*(?:\"' + ae('refresh')
        + '\"' + any + '>|\'' + ae('refresh') + '\'' + any + '>|' + ae('refresh')
        + '(?:' + ae(' ') + any + '>|>))', 'content', ae('url') + s + ae('=') + s, 'i');

    cr('<' + any + att + 'href\\s*=' + any + '>', 'href'); /* Linked elements */
    cr('<' + any + att + 'src\\s*=' + any + '>', 'src'); /* Embedded elements */

    cr('<object' + any + att + 'data\\s*=' + any + '>', 'data'); /* <object data= > */
    cr('<applet' + any + att + 'codebase\\s*=' + any + '>', 'codebase'); /* <applet codebase= > */

    /* <param name=movie value= >*/
    cr('<param' + any + att + 'name\\s*=\\s*(?:\"' + ae('movie') + '\"' + any + '>|\''
        + ae('movie') + '\'' + any + '>|' + ae('movie') + '(?:' + ae(' ') + any + '>|>))', 'value');

    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi,
        'url', '\\s*\\(\\s*', '', '\\s*\\)'); /* <style> */
    cri('<' + any + att + 'style\\s*=' + any + '>', 'style',
        ae('url') + s + ae('(') + s, 0, s + ae(')'), ae(')')); /*< style=" url(...) " > */

    return html;
}

票数 1
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/7544550

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档