首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >结果与IRB不同

结果与IRB不同
EN

Stack Overflow用户
提问于 2013-11-01 20:55:02
回答 2查看 121关注 0票数 1

我费了很大周折才得到这个字符串:

代码语言:javascript
复制
"<html>\n<head>\n<script language=\"JavaScript\">  \n\n        ////////////////////////////////////////////////////////////////  \n        // This [base64 encoder and decoder] was written by Tyler Akins and has been placed in the  \n        // public domain.  It would be nice if you left this header intact.  \n        // Base64 code from Tyler Akins -- http://rumkin.com  \n        var keyStr = \"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=\";  \n\n        function encode64(input) {  \n           var output = \"\";  \n           var chr1, chr2, chr3;  \n           var enc1, enc2, enc3, enc4;  \n           var i = 0;  \n\n           do {  \n              chr1 = input.charCodeAt(i++);  \n              chr2 = input.charCodeAt(i++);  \n              chr3 = input.charCodeAt(i++);  \n\n              enc1 = chr1 >> 2;  \n              enc2 = ((chr1 & 3) << 4) | (chr2 >> 4);  \n              enc3 = ((chr2 & 15) << 2) | (chr3 >> 6);  \n              enc4 = chr3 & 63;  \n\n              if (isNaN(chr2)) {  \n                 enc3 = enc4 = 64;  \n              } else if (isNaN(chr3)) {  \n                 enc4 = 64;  \n              }  \n\n              output = output + keyStr.charAt(enc1) + keyStr.charAt(enc2) +  \n                 keyStr.charAt(enc3) + keyStr.charAt(enc4);  \n           } while (i < input.length);  \n\n           return output;  \n        }  \n        function decode64(input) {  \n           var output = \"\";  \n           var chr1, chr2, chr3;  \n           var enc1, enc2, enc3, enc4;  \n           var i = 0;  \n\n           // remove all characters that are not A-Z, a-z, 0-9, +, /, or =  \n           input = input.replace(/[^A-Za-z0-9\\+\\/\\=]/g, \"\");  \n\n           do {  \n              enc1 = keyStr.indexOf(input.charAt(i++));  \n              enc2 = keyStr.indexOf(input.charAt(i++));  \n              enc3 = keyStr.indexOf(input.charAt(i++));  \n              enc4 = keyStr.indexOf(input.charAt(i++));  \n\n              chr1 = 
(enc1 << 2) | (enc2 >> 4);  \n              chr2 = ((enc2 & 15) << 4) | (enc3 >> 2);  \n              chr3 = ((enc3 & 3) << 6) | enc4;  \n\n              output = output + String.fromCharCode(chr1);  \n\n              if (enc3 != 64) {  \n                 output = output + String.fromCharCode(chr2);  \n              }  \n              if (enc4 != 64) {  \n                 output = output + String.fromCharCode(chr3);  \n              }  \n           } while (i < input.length);  \n\n           return output;  \n        }  \n\n        // end of Tyler Akins' code  \n        ////////////////////////////////////////////////////////////////  \n  function escapePluses(s) {  \n       return s.replace(/\\+/g, \"%2B\");  \n  }  \n  function getFragment(thisuri) {  \n      var pound = thisuri.indexOf(\"#\");  \n      if (pound == -1) {  \n          return null;  \n      } else {  \n          return thisuri.substr(pound + 1);  \n      }  \n  }  \n  function saveFragment() {  \n      var fragment = getFragment(document.URL);  \n      if (fragment != null) {  \n          var pre_marker  = \"&aka_frag=\";  \n          var g_req = decode64(document.relay.pubcookie_g_req.value);  \n          var header_end = g_req.indexOf(pre_marker) + pre_marker.length;  \n          var req_head = g_req.substr(0,header_end);  \n          var req_foot = g_req.substr(header_end);  \n         if ((req_foot.length > 0) && (req_foot.charAt(0) != '&')) {  \n              req_foot = req_foot.substr(req_foot.indexOf(\"&\"));  \n          }  \n         var new_req = req_head + escapePluses(encode64(fragment)) + req_foot;  \n          document.relay.pubcookie_g_req.value = encode64(new_req);  \n      }  \n  }  \n\n  function doStuff() {  \n      saveFragment();  \n      document.relay.submit();  \n  }  \n\n//  setTimeout('doStuff()', 1000);  \n</script></head>\n<body onLoad=\"doStuff()\">\n<form method=post action=\"https://weblogin.server.com/\" name=relay>\n<input type=hidden name=pubcookie_g_req 
value=\"b25lPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdHdvPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdGhyZWU9MSZmb3VyPWE1YSZmaXZlPUdFVCZzaXg9c2lhbS1wcm8ucWEuYWthbWFpLmNvbSZzZXZlbj1MMk52Ym1acFozQmhjbk5sWDNCdmNuUnomZWlnaHQ9JmFrYV9mcmFnPSZob3N0bmFtZT1zaWFtLXByby5xYS5ha2FtYWkuY29tJm5pbmU9MSZmaWxlPSZyZWZlcmVyPShudWxsKSZzZXNzX3JlPTAmcHJlX3Nlc3NfdG9rPS0xNTE4MTQyNjAwJmZsYWc9MA==\">\n<input type=hidden name=post_stuff value=\"\">\n<input type=hidden name=relay_url value=\"https://siam-pro.qa.server.com/PubCookie.reply\">\n<noscript>\n<p align=center>You do not have Javascript turned on,   please click the button to continue.\n<p align=center>\n<input type=submit name=go value=Continue>\n</noscript>\n</form>\n</html>\n"

然后,我想用下面这个正则表达式来匹配该字符串:

代码语言:javascript
复制
<form [^>]*action=(?:\\*"([^"]*)\\*"|([^" >]*))[^>]* name=relay>(.*?)<\/form>

这个正则表达式在其他地方可以按预期工作,但在IRB (1.9.3)中,我得到了以下结果:

代码语言:javascript
复制
1.9.3p448 :147 > data =~/<form [^>]*action=(?:\\*"([^"]*)\\*"|([^" >]*))[^>]* name=relay>(.*?)<\/form>/
=> nil 

我在这里做错什么了?

EN

回答 2

Stack Overflow用户

回答已采纳

发布于 2013-11-01 21:11:49

您需要在这里使用多行正则表达式--使用m修饰符启用多行匹配。在Ruby中,默认情况下 . 不匹配换行符,而<form>与</form>之间的内容跨越多行,因此 (.*?) 无法匹配;加上m修饰符后, . 也能匹配换行符:

代码语言:javascript
复制
/<form [^>]*action=(?:\\*"([^"]*)\\*"|([^" >]*))[^>]* name=relay>(.*?)<\/form>/m
票数 0
EN

Stack Overflow用户

发布于 2013-11-01 21:36:09

正则表达式和HTML/XML并不是好搭档。一旦HTML发生变化,您的模式很可能就会失效。使用解析器可以显著降低代码出错的可能性。例如,很容易预料到标签中各属性的顺序可能发生变化:

代码语言:javascript
复制
<form method="post" action="https://weblogin.server.com/" name="relay">

这个表单可能会变成下列任意一种形式:

代码语言:javascript
复制
<form method="post" action="https://weblogin.server.com/" name="relay" >...</form>
<form method="post" action="https://weblogin.server.com/" name="relay1" >...</form>
<form name="relay" method="post" action="https://weblogin.server.com/">...</form>
<form name="relay" method="post" action="https://weblogin.server.com/">...</form >

如果发生其中任何一种情况,该正则表达式模式就会立即失效。

解析器不会关心这些更改。

代码语言:javascript
复制
require 'nokogiri'

html = "<html>\n<head>\n<script language=\"JavaScript\">  \n\n        ////////////////////////////////////////////////////////////////  \n        // This [base64 encoder and decoder] was written by Tyler Akins and has been placed in the  \n        // public domain.  It would be nice if you left this header intact.  \n        // Base64 code from Tyler Akins -- http://rumkin.com  \n        var keyStr = \"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=\";  \n\n        function encode64(input) {  \n           var output = \"\";  \n           var chr1, chr2, chr3;  \n           var enc1, enc2, enc3, enc4;  \n           var i = 0;  \n\n           do {  \n              chr1 = input.charCodeAt(i++);  \n              chr2 = input.charCodeAt(i++);  \n              chr3 = input.charCodeAt(i++);  \n\n              enc1 = chr1 >> 2;  \n              enc2 = ((chr1 & 3) << 4) | (chr2 >> 4);  \n              enc3 = ((chr2 & 15) << 2) | (chr3 >> 6);  \n              enc4 = chr3 & 63;  \n\n              if (isNaN(chr2)) {  \n                 enc3 = enc4 = 64;  \n              } else if (isNaN(chr3)) {  \n                 enc4 = 64;  \n              }  \n\n              output = output + keyStr.charAt(enc1) + keyStr.charAt(enc2) +  \n                 keyStr.charAt(enc3) + keyStr.charAt(enc4);  \n           } while (i < input.length);  \n\n           return output;  \n        }  \n        function decode64(input) {  \n           var output = \"\";  \n           var chr1, chr2, chr3;  \n           var enc1, enc2, enc3, enc4;  \n           var i = 0;  \n\n           // remove all characters that are not A-Z, a-z, 0-9, +, /, or =  \n           input = input.replace(/[^A-Za-z0-9\\+\\/\\=]/g, \"\");  \n\n           do {  \n              enc1 = keyStr.indexOf(input.charAt(i++));  \n              enc2 = keyStr.indexOf(input.charAt(i++));  \n              enc3 = keyStr.indexOf(input.charAt(i++));  \n              enc4 = keyStr.indexOf(input.charAt(i++));  \n\n              
chr1 = (enc1 << 2) | (enc2 >> 4);  \n              chr2 = ((enc2 & 15) << 4) | (enc3 >> 2);  \n              chr3 = ((enc3 & 3) << 6) | enc4;  \n\n              output = output + String.fromCharCode(chr1);  \n\n              if (enc3 != 64) {  \n                 output = output + String.fromCharCode(chr2);  \n              }  \n              if (enc4 != 64) {  \n                 output = output + String.fromCharCode(chr3);  \n              }  \n           } while (i < input.length);  \n\n           return output;  \n        }  \n\n        // end of Tyler Akins' code  \n        ////////////////////////////////////////////////////////////////  \n  function escapePluses(s) {  \n       return s.replace(/\\+/g, \"%2B\");  \n  }  \n  function getFragment(thisuri) {  \n      var pound = thisuri.indexOf(\"#\");  \n      if (pound == -1) {  \n          return null;  \n      } else {  \n          return thisuri.substr(pound + 1);  \n      }  \n  }  \n  function saveFragment() {  \n      var fragment = getFragment(document.URL);  \n      if (fragment != null) {  \n          var pre_marker  = \"&aka_frag=\";  \n          var g_req = decode64(document.relay.pubcookie_g_req.value);  \n          var header_end = g_req.indexOf(pre_marker) + pre_marker.length;  \n          var req_head = g_req.substr(0,header_end);  \n          var req_foot = g_req.substr(header_end);  \n         if ((req_foot.length > 0) && (req_foot.charAt(0) != '&')) {  \n              req_foot = req_foot.substr(req_foot.indexOf(\"&\"));  \n          }  \n         var new_req = req_head + escapePluses(encode64(fragment)) + req_foot;  \n          document.relay.pubcookie_g_req.value = encode64(new_req);  \n      }  \n  }  \n\n  function doStuff() {  \n      saveFragment();  \n      document.relay.submit();  \n  }  \n\n//  setTimeout('doStuff()', 1000);  \n</script></head>\n<body onLoad=\"doStuff()\">\n<form method=post action=\"https://weblogin.server.com/\" name=relay>\n<input type=hidden name=pubcookie_g_req 
value=\"b25lPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdHdvPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdGhyZWU9MSZmb3VyPWE1YSZmaXZlPUdFVCZzaXg9c2lhbS1wcm8ucWEuYWthbWFpLmNvbSZzZXZlbj1MMk52Ym1acFozQmhjbk5sWDNCdmNuUnomZWlnaHQ9JmFrYV9mcmFnPSZob3N0bmFtZT1zaWFtLXByby5xYS5ha2FtYWkuY29tJm5pbmU9MSZmaWxlPSZyZWZlcmVyPShudWxsKSZzZXNzX3JlPTAmcHJlX3Nlc3NfdG9rPS0xNTE4MTQyNjAwJmZsYWc9MA==\">\n<input type=hidden name=post_stuff value=\"\">\n<input type=hidden name=relay_url value=\"https://siam-pro.qa.server.com/PubCookie.reply\">\n<noscript>\n<p align=center>You do not have Javascript turned on,   please click the button to continue.\n<p align=center>\n<input type=submit name=go value=Continue>\n</noscript>\n</form>\n</html>\n"

doc = Nokogiri::HTML(html)

form = doc.at('form')
puts form.to_html
# >> <form method="post" action="https://weblogin.server.com/" name="relay">
# >> <input type="hidden" name="pubcookie_g_req" value="b25lPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdHdvPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdGhyZWU9MSZmb3VyPWE1YSZmaXZlPUdFVCZzaXg9c2lhbS1wcm8ucWEuYWthbWFpLmNvbSZzZXZlbj1MMk52Ym1acFozQmhjbk5sWDNCdmNuUnomZWlnaHQ9JmFrYV9mcmFnPSZob3N0bmFtZT1zaWFtLXByby5xYS5ha2FtYWkuY29tJm5pbmU9MSZmaWxlPSZyZWZlcmVyPShudWxsKSZzZXNzX3JlPTAmcHJlX3Nlc3NfdG9rPS0xNTE4MTQyNjAwJmZsYWc9MA=="><input type="hidden" name="post_stuff" value=""><input type="hidden" name="relay_url" value="https://siam-pro.qa.server.com/PubCookie.reply"><noscript>
# >> <p align="center">You do not have Javascript turned on,   please click the button to continue.
# >> </p>
# >> <p align="center">
# >> <input type="submit" name="go" value="Continue"></p>
# >> </noscript>
# >> </form>

form['action'] # => "https://weblogin.server.com/"
input = form.at('input')
input['name'] # => "pubcookie_g_req"
input['value'] # => "b25lPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdHdvPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdGhyZWU9MSZmb3VyPWE1YSZmaXZlPUdFVCZzaXg9c2lhbS1wcm8ucWEuYWthbWFpLmNvbSZzZXZlbj1MMk52Ym1acFozQmhjbk5sWDNCdmNuUnomZWlnaHQ9JmFrYV9mcmFnPSZob3N0bmFtZT1zaWFtLXByby5xYS5ha2FtYWkuY29tJm5pbmU9MSZmaWxlPSZyZWZlcmVyPShudWxsKSZzZXNzX3JlPTAmcHJlX3Nlc3NfdG9rPS0xNTE4MTQyNjAwJmZsYWc9MA=="

Nokogiri是Ruby中最受欢迎的XML/HTML解析器。它速度快,使用方便,而且就我的使用经验来看非常健壮。

票数 3
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/19735262

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档