文章/答案/技术大牛

发布

问网络抓取
EN

Stack Overflow用户

提问于 2017-09-29 05:38:02

回答 1查看 58关注 0票数 1

 // Click handler: clears listBox1, downloads the Google search page for the text in textBox2,
 // adds the extracted result links to listBox1 and appends paragraph text to website.txt.
 private void button2_Click(object sender, EventArgs e)
    {
        listBox1.Items.Clear();
        StringBuilder sb = new StringBuilder();
        byte[] ResultsBuffer = new byte[8192];
        string SearchResults = "http://google.com/search?q=" + textBox2.Text.Trim();// txtKeyWords? As I understand it, the value typed into the text box
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(SearchResults);
        HttpWebResponse response = (HttpWebResponse)request.GetResponse();

        // Read the response body in chunks of up to 8192 bytes, decoding each chunk as ASCII.
        // NOTE(review): response and resStream are never closed or disposed in this method.
        Stream resStream = response.GetResponseStream();
        string tempString = null;
        int count = 0;
        do
        {
            count = resStream.Read(ResultsBuffer, 0, ResultsBuffer.Length);
            if (count != 0)
            {
                tempString = Encoding.ASCII.GetString(ResultsBuffer, 0, count);
                sb.Append(tempString);
            }
        }

        while (count > 0);
        string sbb = sb.ToString();

        // Parse the downloaded HTML and walk every anchor that has an href attribute.
        HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
        html.OptionOutputAsXml = true;
        html.LoadHtml(sbb);
        HtmlNode doc = html.DocumentNode;
        StreamWriter sw = File.AppendText("website.txt");
        // NOTE(review): presumably SelectNodes returns null when nothing matches, which would
        // make this foreach throw - confirm against the HtmlAgilityPack documentation.
        foreach (HtmlNode link in doc.SelectNodes("//a[@href]"))
        {
            // att is assigned here but never read afterwards.
            HtmlAttribute att = link.Attributes["href"];
            string hrefValue = link.GetAttributeValue("href", string.Empty);
            // Keep only hrefs that do not contain "google", do contain "/url?q=" and contain "http://".
            if (!hrefValue.ToString().ToUpper().Contains("GOOGLE") && hrefValue.ToString().Contains("/url?q=") && hrefValue.ToString().ToUpper().Contains("HTTP://"))
            {
                // Cut the href at the first '&' and strip the "/url?q=" prefix before listing it.
                int index = hrefValue.IndexOf("&");
                if (index > 0)
                {
                    hrefValue = hrefValue.Substring(0, index);
                    listBox1.Items.Add(hrefValue.Replace("/url?q=", ""));
                }

            }
            // NOTE(review): everything from here to the end of the loop body still runs once per
            // anchor (it is inside the foreach over links), and it downloads SearchResults - the
            // search URL built at the top of the method - rather than the hrefValue just extracted.
            List<string> values = new List<string>();

            string SourceCode = worker.GetSourceCode(SearchResults);

            // Capture the inner text of every bare <p>...</p> pair; Singleline lets '.' span newlines.
            MatchCollection data = Regex.Matches(SourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);

            foreach (Match m in data)
            {

                // Strip <strong> tags and replace a handful of HTML entities by hand, then append to the file.
                string value = m.Groups[1].Value;
                value = value.Replace("&rsquo;", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("&ouml;", "ö").Replace("&uuml;", "ü").Replace("&ccedil;", "ç");
                values.Add(value);

                sw.Write(value);
            }
        }
        // The writer is closed only on the normal path; an exception above skips this line.
        sw.Close(); ;
    }



       /// <summary>
       /// Requests <paramref name="url"/> and returns the whole response body decoded as UTF-8 text.
       /// </summary>
       /// <param name="url">The URL to request; passed unchanged to WebRequest.Create.</param>
       /// <returns>The complete response body as a string.</returns>
       /// <remarks>Exceptions raised while requesting or reading are not caught here.</remarks>
       public static string GetSourceCode(string url)
       {
           HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

           // using blocks release the response and the reader even when ReadToEnd throws;
           // the original closed them only on the success path.
           using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
           using (StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.UTF8))
           {
               return reader.ReadToEnd();
           }
       }

大家好。我正在编写一个用于网络抓取的 Windows 窗体应用程序。我会在窗体中输入一些关键词，程序自动在 Google 中搜索这些关键词，把找到的链接显示在列表框中，并把各链接页面的内容（页面中的文字）写入文本文件。链接可以正常显示，但程序没有把链接的内容写入文本文件。

我试过在调试模式下运行，结果发现程序没有进入下面这个代码块。

// Excerpt of the inner loop of button2_Click shown earlier: for each regex match, strip the
// <strong> tags, replace a few HTML entities by hand and append the text to the open file.
// data, values and sw are declared outside this excerpt.
foreach(Match m in data)
        {

            string value = m.Groups[1].Value;
            value = value.Replace("&rsquo;", "'").Replace("<strong>", "").Replace("</strong>", "").Replace("Ouml;z", "Ö").Replace("&ouml;", "ö").Replace("&uuml;", "ü").Replace("&ccedil;", "ç");
            values.Add(value);

            sw.Write(value);
        }

我分别单独测试了显示链接的代码块和记录链接内容的代码块，两者都工作正常。但当我把它们组合在一起时，代码就无法工作了：没有任何报错，但就是不起作用。请帮帮我。

regex

winforms

web-scraping

回答 1

Stack Overflow用户

回答已采纳

发布于 2017-09-29 07:04:23

    /// <summary>
    /// Click handler: downloads the page whose URL is typed in textBox1 and appends the
    /// cleaned-up text of every bare paragraph (p) element to website.txt.
    /// </summary>
    /// <param name="sender">Event source (not used).</param>
    /// <param name="e">Event data (not used).</param>
    private void Clicked(object sender, EventArgs e)
    {
        string url = textBox1.Text;
        string SourceCode = worker.GetSourceCode(url);

        // Capture the inner text of each <p>...</p> pair; Singleline lets '.' span newlines.
        MatchCollection data = Regex.Matches(SourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);

        // Nothing to write: leave the file untouched, as the original did.
        if (data.Count == 0)
        {
            return;
        }

        // Open the file once for the whole batch instead of once per match; the using block
        // closes it even if a write fails.
        using (StreamWriter sw = File.AppendText("website.txt"))
        {
            foreach (Match m in data)
            {
                // "&Ouml;" replaces the original "Ouml;z" pattern, which left a stray '&'
                // behind and swallowed the following 'z'.
                string value = m.Groups[1].Value
                    .Replace("&rsquo;", "'")
                    .Replace("<strong>", "")
                    .Replace("</strong>", "")
                    .Replace("&Ouml;", "Ö")
                    .Replace("&ouml;", "ö")
                    .Replace("&uuml;", "ü")
                    .Replace("&ccedil;", "ç");

                sw.Write(value);
            }
        }
    }

    /// <summary>
    /// Click handler: runs a Google search for the text in textBox2, adds every extracted
    /// result link to listBox1 and hands each link to GetData.
    /// </summary>
    /// <param name="sender">Event source (not used).</param>
    /// <param name="e">Event data (not used).</param>
    private void button2_Click(object sender, EventArgs e)
    {
        listBox1.Items.Clear();

        // Escape the keywords so characters such as '&', '#' or '+' stay part of the q value
        // instead of being read as URL syntax.
        string SearchResults = "http://google.com/search?q=" + Uri.EscapeDataString(textBox2.Text.Trim());
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(SearchResults);

        // The using blocks release the connection even when reading throws. ASCII decoding is
        // kept from the original: any non-ASCII byte in the page is read as '?'.
        string sbb;
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.ASCII))
        {
            sbb = reader.ReadToEnd();
        }

        HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
        html.OptionOutputAsXml = true;
        html.LoadHtml(sbb);

        // SelectNodes yields null rather than an empty collection when no node matches,
        // so guard before iterating.
        HtmlNodeCollection links = html.DocumentNode.SelectNodes("//a[@href]");
        if (links == null)
        {
            return;
        }

        foreach (HtmlNode link in links)
        {
            string hrefValue = link.GetAttributeValue("href", string.Empty);

            // Keep only redirect-style result links ("/url?q=http://...") that do not mention
            // google. As in the original, hrefs without "http://" (e.g. https-only) are skipped.
            bool isResultLink =
                hrefValue.IndexOf("GOOGLE", StringComparison.OrdinalIgnoreCase) < 0
                && hrefValue.Contains("/url?q=")
                && hrefValue.IndexOf("HTTP://", StringComparison.OrdinalIgnoreCase) >= 0;
            if (!isResultLink)
            {
                continue;
            }

            // Everything from the first '&' on is dropped; hrefs without one are ignored.
            int index = hrefValue.IndexOf('&');
            if (index <= 0)
            {
                continue;
            }

            hrefValue = hrefValue.Substring(0, index).Replace("/url?q=", "");
            listBox1.Items.Add(hrefValue);
            GetData(hrefValue);
        }
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and appends the cleaned-up text of every bare
    /// paragraph (p) element found in it to website.txt.
    /// </summary>
    /// <param name="url">Address of the page to download via worker.GetSourceCode.</param>
    private void GetData(string url)
    {
        // Download before opening the file, so a failed request cannot leave website.txt
        // open and locked for the next link.
        string SourceCode = worker.GetSourceCode(url);

        // Capture the inner text of each <p>...</p> pair; Singleline lets '.' span newlines.
        MatchCollection data = Regex.Matches(SourceCode, @"<p>\s*(.+?)\s*</p>", RegexOptions.Singleline);

        // The using block closes the writer even if a write throws.
        using (StreamWriter sw = File.AppendText("website.txt"))
        {
            foreach (Match m in data)
            {
                // "&Ouml;" replaces the original "Ouml;z" pattern, which left a stray '&'
                // behind and swallowed the following 'z'.
                string value = m.Groups[1].Value
                    .Replace("&rsquo;", "'")
                    .Replace("<strong>", "")
                    .Replace("</strong>", "")
                    .Replace("&Ouml;", "Ö")
                    .Replace("&ouml;", "ö")
                    .Replace("&uuml;", "ü")
                    .Replace("&ccedil;", "ç");

                sw.Write(value);
            }
        }
    }

    // Empty handler: does nothing when invoked. Presumably the stub for
    // listBox1's SelectedIndexChanged event - confirm the wiring before removing it.
    private void listBox1_SelectedIndexChanged(object sender, EventArgs e)
    {

    }

    // Empty handler: does nothing when invoked. Presumably the stub for
    // label3's Click event - confirm the wiring before removing it.
    private void label3_Click(object sender, EventArgs e)
    {

    }

    // Empty handler: does nothing when invoked. Presumably the stub for
    // label2's Click event - confirm the wiring before removing it.
    private void label2_Click(object sender, EventArgs e)
    {

    }


}

}

我终于让程序成功运行了，以上就是答案。我的答案里还遗留了一些问题，都与正则表达式有关：各个网站的 HTML 代码没有统一的标准，所以需要针对性地调整正则表达式。等我完成项目后，我会分享全部代码。

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/46482400

复制

相似问题

问网络抓取
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问网络抓取EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问网络抓取
EN