首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >网络抓取

网络抓取
EN

Stack Overflow用户
提问于 2017-09-29 05:38:02
回答 1查看 58关注 0票数 1
代码语言:csharp
复制
 // Click handler for the search button (question's original, non-working version).
 // Steps, as written:
 //   1. clears listBox1;
 //   2. requests a Google search for the trimmed text of textBox2;
 //   3. parses the returned HTML with HtmlAgilityPack and adds matching result links to listBox1;
 //   4. for every <a href> element, downloads a page and appends the text of its <p>...</p>
 //      matches to website.txt.
 private void button2_Click(object sender, EventArgs e)
    {
        listBox1.Items.Clear();
        StringBuilder sb = new StringBuilder();
        byte[] ResultsBuffer = new byte[8192];
        string SearchResults = "http://google.com/search?q=" + textBox2.Text.Trim();// txtKeyWords? As I understand it, this is the value typed into the text box
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(SearchResults);
        HttpWebResponse response = (HttpWebResponse)request.GetResponse();

        Stream resStream = response.GetResponseStream();
        string tempString = null;
        int count = 0;
        // Read the response body in chunks of up to 8192 bytes until Read reports 0 bytes.
        // Each chunk is decoded as ASCII before being appended to sb.
        do
        {
            count = resStream.Read(ResultsBuffer, 0, ResultsBuffer.Length);
            if (count != 0)
            {
                tempString = Encoding.ASCII.GetString(ResultsBuffer, 0, count);
                sb.Append(tempString);
            }
        }

        while (count > 0);
        string sbb = sb.ToString();

        HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
        html.OptionOutputAsXml = true;
        html.LoadHtml(sbb);
        HtmlNode doc = html.DocumentNode;
        // The writer is opened in append mode and closed only after the whole link loop has run.
        StreamWriter sw = File.AppendText("website.txt");
        // NOTE(review): SelectNodes presumably returns null when no node matches - confirm against
        // the HtmlAgilityPack docs; the foreach below does not guard against that.
        foreach (HtmlNode link in doc.SelectNodes("//a[@href]"))
        {
            HtmlAttribute att = link.Attributes["href"];
            string hrefValue = link.GetAttributeValue("href", string.Empty);
            // Keep only hrefs that do not mention "GOOGLE" (compared in upper case),
            // contain "/url?q=" and contain "HTTP://" (compared in upper case).
            if (!hrefValue.ToString().ToUpper().Contains("GOOGLE") && hrefValue.ToString().Contains("/url?q=") && hrefValue.ToString().ToUpper().Contains("HTTP://"))
            {
                // Cut the href at the first '&' and strip the "/url?q=" prefix to get the target URL.
                int index = hrefValue.IndexOf("&");
                if (index > 0)
                {
                    hrefValue = hrefValue.Substring(0, index);
                    listBox1.Items.Add(hrefValue.Replace("/url?q=", ""));
                }

            }
            // NOTE(review): everything from here to the end of the loop body sits outside the
            // filter above, so it runs once for every <a href> element. It also downloads
            // SearchResults (the search URL itself) rather than the link taken from hrefValue.
            List<string> values = new List<string>();

            string SourceCode = worker.GetSourceCode(SearchResults);

            // Captures the text between each attribute-less <p> and the next </p>; Singleline lets '.' span newlines.
            MatchCollection data = Regex.Matches(SourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);

            foreach (Match m in data)
            {

                string value = m.Groups[1].Value;
                // Replace a handful of HTML entities and drop <strong> tags.
                // NOTE(review): "Ouml;z" has no leading '&' and includes a 'z' - presumably meant "&Ouml;"; confirm.
                value = value.Replace("&rsquo;", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("&ouml;", "ö").Replace("&uuml;", "ü").Replace("&ccedil;", "ç");
                values.Add(value);

                sw.Write(value);
            }
        }
        sw.Close(); ;
    }



       /// <summary>
       /// Downloads the resource at <paramref name="url"/> and returns its body decoded as UTF-8 text.
       /// </summary>
       /// <param name="url">Absolute URL to request.</param>
       /// <returns>The complete response body as a string.</returns>
       /// <exception cref="System.Net.WebException">The request failed or the server returned an error status.</exception>
       public static string GetSourceCode(string url)
       {
           HttpWebRequest reg = (HttpWebRequest)WebRequest.Create(url);

           // The using blocks release the connection and the reader even when reading throws;
           // the previous explicit Close() calls were skipped on any exception.
           using (HttpWebResponse resp = (HttpWebResponse)reg.GetResponse())
           using (StreamReader sr = new StreamReader(resp.GetResponseStream(), System.Text.Encoding.UTF8))
           {
               return sr.ReadToEnd();
           }
       }

大家好。我正在开发一个用于网页抓取的 Windows 窗体应用程序。我会在窗体中输入一些关键词,程序会自动在 Google 中搜索这些关键词,把找到的链接显示在列表框中,并把每个链接页面的内容(页面中的文字)记录到文本文件中。显示链接这部分工作正常,但是程序没有把链接的内容写入文本文件。

我在调试模式下试过,结果发现程序没有进入下面这个代码块。

代码语言:csharp
复制
// Excerpt of the loop the debugger never enters: for each regex match in `data`, take capture
// group 1, replace a few HTML entities / strip <strong> tags, remember the text in `values`
// and write it through `sw`. (`data`, `values` and `sw` are declared outside this excerpt.)
foreach(Match m in data)
        {

            string value = m.Groups[1].Value;
            // NOTE(review): "Ouml;z" has no leading '&' and includes a 'z' - presumably meant "&Ouml;"; confirm.
            value = value.Replace("&rsquo;", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("&ouml;", "ö").Replace("&uuml;", "ü").Replace("&ccedil;", "ç");
            values.Add(value);

            sw.Write(value);
        }

我分别单独测试了显示链接的代码块和记录链接内容的代码块,两者都工作得很好。但是当我试图把它们组合在一起时,代码就无法正常工作了。程序没有报任何错误,但就是不起作用。请帮帮我。

EN

回答 1

Stack Overflow用户

回答已采纳

发布于 2017-09-29 07:04:23

代码语言:csharp
复制
    /// <summary>
    /// Downloads the page whose address is typed in <c>textBox1</c> and appends the text of every
    /// <c>&lt;p&gt;...&lt;/p&gt;</c> match (attribute-less <c>&lt;p&gt;</c> tags only) to website.txt.
    /// </summary>
    /// <param name="sender">Control that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void Clicked(object sender, EventArgs e)
    {
        string url = textBox1.Text;
        string SourceCode = worker.GetSourceCode(url);

        // Singleline lets '.' span newlines so multi-line paragraphs are captured as one match.
        MatchCollection data = Regex.Matches(SourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);
        if (data.Count == 0)
        {
            // As before, the file is not touched when the page contains no paragraph.
            return;
        }

        // Open the file once for the whole batch instead of once per match; the using block
        // also closes it when a write throws.
        using (StreamWriter sw = File.AppendText("website.txt"))
        {
            foreach (Match m in data)
            {
                // "&Ouml;" replaces the former "Ouml;z" literal, which left the '&' behind and ate a 'z'.
                string value = m.Groups[1].Value
                    .Replace("&rsquo;", "'")
                    .Replace("<strong>", "")
                    .Replace("</strong>", "")
                    .Replace("&Ouml;", "Ö")
                    .Replace("&ouml;", "ö")
                    .Replace("&uuml;", "ü")
                    .Replace("&ccedil;", "ç");

                sw.Write(value);
            }
        }
    }

    /// <summary>
    /// Searches Google for the text in <c>textBox2</c>, lists every external result link in
    /// <c>listBox1</c>, and passes each link to <c>GetData</c> so its paragraph text is appended
    /// to website.txt.
    /// </summary>
    /// <param name="sender">Control that raised the event (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void button2_Click(object sender, EventArgs e)
    {
        listBox1.Items.Clear();

        // Escape the keywords so characters such as '&', '#' or '+' cannot break the query string.
        string SearchResults = "http://google.com/search?q=" + Uri.EscapeDataString(textBox2.Text.Trim());

        // Use the same worker.GetSourceCode helper that GetData relies on instead of a hand-rolled
        // read loop: the manual loop decoded the body as ASCII (losing every non-ASCII character)
        // and never closed the response or its stream.
        string sbb = worker.GetSourceCode(SearchResults);

        HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
        html.OptionOutputAsXml = true;
        html.LoadHtml(sbb);

        // SelectNodes yields null, not an empty collection, when no node matches the XPath.
        HtmlNodeCollection links = html.DocumentNode.SelectNodes("//a[@href]");
        if (links == null)
        {
            return;
        }

        foreach (HtmlNode link in links)
        {
            string target = ExtractResultUrl(link.GetAttributeValue("href", string.Empty));
            if (target == null)
            {
                continue;
            }

            listBox1.Items.Add(target);
            GetData(target);
        }
    }

    /// <summary>
    /// Extracts the destination URL from a Google redirect href of the form
    /// <c>/url?q=TARGET&amp;...</c>.
    /// </summary>
    /// <param name="href">Raw value of an anchor's href attribute.</param>
    /// <returns>
    /// The percent-decoded target URL, or <c>null</c> when the href mentions "google", has no
    /// "http://" in it, has no "/url?q=" marker, or has no '&amp;' after the target.
    /// </returns>
    private static string ExtractResultUrl(string href)
    {
        const string Marker = "/url?q=";

        bool mentionsGoogle = href.IndexOf("google", StringComparison.OrdinalIgnoreCase) >= 0;
        bool hasHttp = href.IndexOf("http://", StringComparison.OrdinalIgnoreCase) >= 0;
        int markerAt = href.IndexOf(Marker, StringComparison.Ordinal);
        if (mentionsGoogle || !hasHttp || markerAt < 0)
        {
            return null;
        }

        // The target runs from just after the marker to the first '&'; links without a
        // trailing parameter were skipped before and still are.
        int start = markerAt + Marker.Length;
        int end = href.IndexOf('&', start);
        if (end < 0)
        {
            return null;
        }

        // The q parameter is percent-encoded; decode it so the listed URL can be requested as-is.
        return Uri.UnescapeDataString(href.Substring(start, end - start));
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and appends the text of every
    /// <c>&lt;p&gt;...&lt;/p&gt;</c> match (attribute-less <c>&lt;p&gt;</c> tags only) to website.txt.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    private void GetData(string url)
    {
        // Download first: opening the file before the request left the writer open (and the
        // file locked) whenever the download threw.
        string SourceCode = worker.GetSourceCode(url);

        // Singleline lets '.' span newlines so multi-line paragraphs are captured as one match.
        MatchCollection data = Regex.Matches(SourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);

        // The using block closes the writer even when a write throws.
        using (StreamWriter sw = File.AppendText("website.txt"))
        {
            foreach (Match m in data)
            {
                // "&Ouml;" replaces the former "Ouml;z" literal, which left the '&' behind and ate a 'z'.
                string value = m.Groups[1].Value
                    .Replace("&rsquo;", "'")
                    .Replace("<strong>", "")
                    .Replace("</strong>", "")
                    .Replace("&Ouml;", "Ö")
                    .Replace("&ouml;", "ö")
                    .Replace("&uuml;", "ü")
                    .Replace("&ccedil;", "ç");

                sw.Write(value);
            }
        }
    }

    // Empty handler: by its name and signature it is meant for listBox1's SelectedIndexChanged
    // event (the wiring is not visible here). It performs no work.
    // NOTE(review): presumably referenced from designer-generated code - confirm before deleting.
    private void listBox1_SelectedIndexChanged(object sender, EventArgs e)
    {

    }

    // Empty handler: by its name and signature it is meant for label3's Click event
    // (the wiring is not visible here). It performs no work.
    // NOTE(review): presumably referenced from designer-generated code - confirm before deleting.
    private void label3_Click(object sender, EventArgs e)
    {

    }

    // Empty handler: by its name and signature it is meant for label2's Click event
    // (the wiring is not visible here). It performs no work.
    // NOTE(review): presumably referenced from designer-generated code - confirm before deleting.
    private void label2_Click(object sender, EventArgs e)
    {

    }


}

}

我终于让程序成功运行了,上面就是答案。我的回答中还遗留了一些问题,它们都与正则表达式有关:各个网站的 HTML 代码没有统一的标准,所以需要调整正则表达式来适配。等我完成项目后,我会分享全部代码。

票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/46482400

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档