C# RSS: 기사 본문 캡처 및 TXT

35675 단어 txt
만약, 만약, 만약...
숙소에서 휴대폰의 CMNET 신호가 거의 잡히지 않는다면...
만약, 만약, 만약...
만약 잠들기 전에 오늘의 뉴스를 조용히 훑어보고 싶다면
다음 프로그램은 cnblogs, cnbeta, 넷이즈(163) 심층 보도, 남방주말(infzm)의 첫 페이지 기사 본문을 수집하여 TXT 파일로 저장합니다. 같은 방식으로 다른 사이트도 추가할 수 있습니다.
 
using System;

using System.Collections.Generic;

using System.Linq;

using System.Text;

using System.Text.RegularExpressions;

using System.Net;

using System.Collections;

using System.Threading;

using System.IO;

using System.Configuration;

namespace RSS
{
    /// <summary>
    /// Console entry point: configures a <see cref="GetItem"/> scraper for a news site and starts it.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Starts the CnBlogs scraper. The commented-out presets below show how other
        /// sites were configured.
        /// </summary>
        /// <param name="args">
        /// Optional. <c>args[0]</c> is the output path prefix; the generated file name is
        /// appended to it verbatim, so it should end with a path separator. When omitted,
        /// the original default <c>i://</c> is used.
        /// </param>
        static void Main(string[] args)
        {
            string file = (args != null && args.Length > 0) ? args[0] : "i://";

            {
                GetItem gi1 = new GetItem();
                gi1.pageUrl = "http://news.cnblogs.com/n/page/";
                gi1.prefix = "http://news.cnblogs.com";
                gi1.pageUrlsRegex = "\"(?<url>/n/[\\d]+?)\"";
                gi1.titleRegex = "<div id=\"news_title\"><a.*?>(?<title>.*?)</a>";
                gi1.timeRegex = "<span class=\"time\">(?<time>.*?)</span>";
                gi1.bodyRegex = "<div id=\"news_body\">(?<body>.*?)</div>";
                gi1.hostName = "CnBlogs";
                gi1.encoding = "utf-8";
                gi1.fileSave = string.Format("{2}{0}_{1}.txt", gi1.hostName, String.Format("{0:yyMMdd_HH-mm}", DateTime.Now), file);
                Console.WriteLine(gi1.fileSave);
                gi1.pageWantToGet = 20;
                gi1.threadStart();
            }

            // Disabled presets for other sites, kept as configuration examples.

            //{
            //    GetItem gi2 = new GetItem();
            //    gi2.prefix = "http://www.cnbeta.com/";
            //    gi2.pageUrlsRegex = "\"(?<url>/articles/[\\d]+.htm?)\"";
            //    gi2.titleRegex = "id=\"news_title\">(?<title>.*?)</h3>";
            //    gi2.timeRegex = "id=\"news_author\"><span>(?<time>.*?)[|]";
            //    gi2.bodyRegex = "<div id=\"news_content\">(?<body>.*?)<!-- end newsBox news -->";
            //    gi2.hostName = "CnBeta";
            //    gi2.encoding = "gb2312";
            //    gi2.fileSave = string.Format("{2}{0}_{1}.txt", gi2.hostName, String.Format("{0:yyMMdd_HH-mm}", DateTime.Now), file);
            //    Console.WriteLine(gi2.fileSave);
            //    gi2.homeOnly = true;
            //    gi2.threadStart();
            //}

            //{
            //    GetItem gi2 = new GetItem();
            //    gi2.pageUrlsRegex = "\"(?<url>http://focus.news.163.com.[^>< ]*.html?)\"";
            //    gi2.prefix = "http://focus.news.163.com/";
            //    gi2.hasPrefix = false;//default:true
            //    gi2.hasManyPage = true;//default:false
            //    gi2.manyPageRegex = "<span class=\"s1 s3\">   </span>(?<np>.*?)   </a>";
            //    gi2.titleRegex = "id=\"h1title\">(?<title>.*?)</h1>";
            //    gi2.timeRegex = "<span class=\"info\">(?<time>.*?)  ";
            //    gi2.bodyRegex = "class=\"summary\"(?<body>.*?)<!--    -->";
            //    gi2.hostName = "163";
            //    gi2.encoding = "GBK";
            //    gi2.fileSave = string.Format("{2}{0}_{1}.txt", gi2.hostName, String.Format("{0:yyMMdd_HH-mm}", DateTime.Now), file);
            //    Console.WriteLine(gi2.fileSave);
            //    gi2.homeOnly = true;
            //    gi2.threadStart();
            //}

            //{
            //    GetItem gi2 = new GetItem();
            //    gi2.pageUrlsRegex = "\"(?<url>http://www.infzm.com/content/[\\d]+?)\"";
            //    gi2.prefix = "http://www.infzm.com/";
            //    gi2.hasPrefix = false;//default:true
            //    gi2.hasManyPage = false;//default:false
            //    //gi2.manyPageRegex = "<span class=\"s1 s3\">   </span>(?<np>.*?)   </a>";
            //    gi2.titleRegex = "<div id=\"detailContent\">[\\s]*<h1>[\\s]*(?<title>.*?)[\\s]*</h1>";
            //    gi2.timeRegex = "<span class=\"pubTime\">(?<time>.*?)</span>";
            //    gi2.bodyRegex = "<div id=\"content-context\">(?<body>.*?)<!--end #text-->";
            //    gi2.hostName = "infzm";
            //    gi2.encoding = "utf-8";
            //    gi2.fileSave = string.Format("{2}{0}_{1}.txt", gi2.hostName, String.Format("{0:yyMMdd_HH-mm}", DateTime.Now), file);
            //    Console.WriteLine(gi2.fileSave);
            //    gi2.homeOnly = true;
            //    gi2.threadStart();
            //}

            //Console.Read();
        }
    }

    /// <summary>
    /// Scrapes the articles of one site: collects article URLs from listing pages, downloads
    /// each article, extracts title / time / body with the configured regular expressions,
    /// strips HTML tags from the body and appends the text to <see cref="fileSave"/>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): several console message literals (for example <c>" :{0}"</c> and
    /// <c>" .."</c>) look as if part of their text was lost; they are reproduced unchanged.
    /// </remarks>
    class GetItem
    {
        // Content-Type header sent with every request. The original used this value with and
        // without a leading space in different methods; leading whitespace is not significant
        // in an HTTP header value, so a single constant is used.
        private const string RequestContentType = "text/html;charset=utf-8";

        // Tag-stripping rules as { regex pattern, replacement } pairs, applied in this exact
        // order by deleteTag. Order matters: the loose patterns match by tag-name prefix
        // (for example the "b" rule would also match "<br>"), so the specific rules run first.
        private static readonly string[,] TagRules =
        {
            { "<[\\s]*p[^>]*>?>", "" },
            { "</[\\s]*p[\\s]*?>", "\r\n" },
            { "<[\\s]*br[\\s]*/[\\s]*[^>]*>?>", "\r\n" },
            { "<[\\s]*br[^>]*>?>", "" },
            { "</[\\s]*br[^>]*>?>", "\r\n" },
            { "<[\\s]*a[\\s]*[^>]*>?>", "" },
            { "</[\\s]*a[\\s]*[^>]*>?>", "" },
            { "<[\\s]*img[\\s]*[^>]*>?>", "" },
            { "</[\\s]*img[\\s]*[^>]*>?>", "" },
            { "<[\\s]*strong[\\s]*[^>]*>?>", "" },
            { "</[\\s]*strong[\\s]*[^>]*>?>", "" },
            { "<[\\s]*div[\\s]*[^>]*>?>", "" },
            { "</[\\s]*div[\\s]*[^>]*>?>", "" },
            { "<[\\s]*b[\\s]*[^>]*>?>", "" },
            { "</[\\s]*b[\\s]*[^>]*>?>", "" },
            { "<[\\s]*span[\\s]*[^>]*>?>", "" },
            { "</[\\s]*span[\\s]*[^>]*>?>", "" },
            { "<[\\s]*script[\\s]*[^>]*>?>", "" },
            { "</[\\s]*script[\\s]*[^>]*>?>", "" },
            { "<[\\s]*li[\\s]*[^>]*>?>", "" },
            { "</[\\s]*li[\\s]*[^>]*>?>", "" },
            { "<[\\s]*img[\\s]*[^>]*>?>", "" },
            { "</[\\s]*img[\\s]*[^>]*>?>", "" },
            { "<[\\s]*style[\\s]*[^>]*>?>", "" },
            { "</[\\s]*style[\\s]*[^>]*>?>", "" },
            { "<[\\s]*i[\\s]*[^>]*>?>", "" },
            { "</[\\s]*i[\\s]*[^>]*>?>", "" },
            { "<[\\s]*h3[\\s]*[^>]*>?>", "" },
            { "<[\\s]*h2[\\s]*[^>]*>?>", "" },
            { "</[\\s]*h3[\\s]*[^>]*>?>", "" },
            { "</[\\s]*h2[\\s]*[^>]*>?>", "" },
            { "<[\\s]*font[\\s]*[^>]*>?>", "" },
            { "</[\\s]*font[\\s]*[^>]*>?>", "" },
            { "<[\\s]*q[\\s]*[^>]*>?>", "" },
            { "</[\\s]*q[\\s]*[^>]*>?>", "" }
        };

        // HTML entity replacements as { entity, replacement } pairs, applied after TagRules.
        // NOTE(review): "&hellip;" and "&mdash;" are replaced by the empty string, exactly as
        // in the original; the intended characters may have been lost - confirm.
        private static readonly string[,] EntityRules =
        {
            { "&rdquo;", "\"" },
            { "&ldquo;", "\"" },
            { "&lsquo;", "'" },
            { "&rsquo;", "'" },
            { "&nbsp;", " " },
            { "&hellip;", "" },
            { "&ndash;", "-" },
            { "&mdash;", "" }
        };

        /// <summary>Base URL of numbered listing pages; the page index is appended to it.</summary>
        public string pageUrl;

        /// <summary>When true, only <see cref="prefix"/> itself is scanned for article links.</summary>
        public bool homeOnly = false;

        /// <summary>When true, extracted URLs are combined with <see cref="prefix"/> before download.</summary>
        public bool hasPrefix = true;

        /// <summary>Number of listing pages (1..N) to scan when <see cref="homeOnly"/> is false.</summary>
        public int pageWantToGet = 1;

        /// <summary>When true, continuation pages found via <see cref="manyPageRegex"/> are appended to the article.</summary>
        public bool hasManyPage = false;

        /// <summary>Regex whose named group "np" captures the pagination block of an article.</summary>
        public string manyPageRegex;

        /// <summary>Site root; a trailing "/" is added by <see cref="threadStart"/> if missing.</summary>
        public string prefix;

        // Article URLs in discovery order.
        private List<string> pageUrls;

        // Every URL seen so far (articles and continuation pages); used only for de-duplication
        // so that continuation pages never end up in the article list.
        private HashSet<string> seenUrls;

        /// <summary>Regex whose named group "url" captures an article link.</summary>
        public string pageUrlsRegex;

        /// <summary>Regex whose named group "title" captures the article title.</summary>
        public string titleRegex;

        /// <summary>Regex whose named group "time" captures the publication time.</summary>
        public string timeRegex;

        /// <summary>Regex whose named group "body" captures the article body HTML.</summary>
        public string bodyRegex;

        /// <summary>Path of the output file; articles are appended to it.</summary>
        public string fileSave;

        /// <summary>Site label used in console messages.</summary>
        public string hostName;

        /// <summary>Name of the text encoding used to decode downloaded pages.</summary>
        public string encoding;

        /// <summary>Creates a scraper with an empty URL list.</summary>
        public GetItem()
        {
            pageUrls = new List<string>(50);
            seenUrls = new HashSet<string>();
        }

        /// <summary>
        /// Normalises <see cref="prefix"/> to end with "/" and starts the scrape on a new thread.
        /// </summary>
        public void threadStart()
        {
            // A null prefix previously crashed here with NullReferenceException.
            if (prefix != null && !prefix.EndsWith("/", StringComparison.Ordinal))
            {
                prefix += "/";
            }

            Thread th = new Thread(new ThreadStart(start));
            th.Start();
        }

        // Thread body: collect article URLs, then download every article.
        private void start()
        {
            if (homeOnly)
            {
                getPageUrls(-1);
            }
            else
            {
                for (int i = 1; i <= pageWantToGet; i++)
                {
                    getPageUrls(i);
                }
            }

            startGetAll();
        }

        // Appends str plus a line terminator to fileSave, encoded as gb2312.
        // The using blocks close the file even when the write throws.
        private void WriteFile(string str)
        {
            using (FileStream fs = new FileStream(fileSave, FileMode.Append))
            using (StreamWriter streamWriter = new StreamWriter(fs, System.Text.Encoding.GetEncoding("gb2312")))
            {
                streamWriter.WriteLine(str);
            }
        }

        // Strips the known HTML tags from str and replaces a few HTML entities, in place.
        private void deleteTag(ref string str)
        {
            for (int i = 0; i < TagRules.GetLength(0); i++)
            {
                str = Regex.Replace(str, TagRules[i, 0], TagRules[i, 1]);
            }

            for (int i = 0; i < EntityRules.GetLength(0); i++)
            {
                str = str.Replace(EntityRules[i, 0], EntityRules[i, 1]);
            }
        }

        // Downloads url and returns the response text with all line breaks removed
        // (lines are concatenated without a separator, as the original loops did).
        // Exceptions propagate to the caller, which logs them.
        private string downloadPage(string url)
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            // HTTP method tokens are case-sensitive; use the canonical "GET".
            req.Method = WebRequestMethods.Http.Get;
            req.ContentType = RequestContentType;

            StringBuilder sb = new StringBuilder();
            using (HttpWebResponse wr = (HttpWebResponse)req.GetResponse())
            using (Stream respStream = wr.GetResponseStream())
            using (StreamReader reader = new StreamReader(respStream, System.Text.Encoding.GetEncoding(this.encoding)))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    sb.Append(line);
                }
            }

            return sb.ToString();
        }

        // Applies the prefix rule to an extracted URL: unchanged when hasPrefix is false,
        // otherwise prefix + url with a single "/" between them.
        private string toAbsoluteUrl(string url)
        {
            if (!hasPrefix)
            {
                return url;
            }

            if (url.StartsWith("/", StringComparison.Ordinal))
            {
                return string.Format("{0}{1}", prefix, url.Substring(1));
            }

            return string.Format("{0}{1}", prefix, url);
        }

        // Downloads a continuation page and returns its tag-stripped body,
        // or "" when the body is not found or the download fails.
        private string getNextPageContent(string url)
        {
            Console.WriteLine(url);
            try
            {
                string str = downloadPage(url);
                Match m = new Regex(this.bodyRegex, RegexOptions.Singleline).Match(str);
                if (m.Success)
                {
                    string body = m.Groups["body"].Value;
                    deleteTag(ref body);
                    Console.WriteLine(" ");
                    return body;
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(" :{0}", ex.Message);
            }

            return "";
        }

        // Downloads one article and appends title, "(index/total)url", time, body and any
        // continuation pages to the output file, followed by a separator line.
        // On any failure the error is logged and nothing is written for this article.
        private void getContent(string url, int index, int total)
        {
            Console.WriteLine(url);
            try
            {
                string str = downloadPage(url);

                Regex titler = new Regex(this.titleRegex, RegexOptions.Singleline);
                Regex timer = new Regex(this.timeRegex, RegexOptions.Singleline);
                Regex bodyr = new Regex(this.bodyRegex, RegexOptions.Singleline);
                StringBuilder cont = new StringBuilder();

                Match m = titler.Match(str);
                if (m.Success)
                {
                    Console.WriteLine("title:{0}", m.Groups["title"].Value);
                    cont.AppendLine(m.Groups["title"].Value);
                }

                cont.AppendLine(string.Format("({0}/{1}){2}", index, total, url));

                m = timer.Match(str);
                if (m.Success)
                {
                    Console.WriteLine("time:{0}", m.Groups["time"].Value);
                    cont.AppendLine(m.Groups["time"].Value);
                }

                m = bodyr.Match(str);
                if (m.Success)
                {
                    string body = m.Groups["body"].Value;
                    deleteTag(ref body);
                    Console.WriteLine(" ");
                    cont.AppendLine(body);
                }

                // Without a pagination regex there is nothing to follow; previously a null
                // manyPageRegex threw here and the whole article was lost.
                if (hasManyPage && manyPageRegex != null)
                {
                    Match mm = new Regex(this.manyPageRegex, RegexOptions.Singleline).Match(str);
                    if (mm.Success)
                    {
                        Console.WriteLine(" ..");
                        string pagesurl = mm.Groups["np"].Value;
                        Regex r = new Regex(this.pageUrlsRegex, RegexOptions.Singleline);
                        foreach (Match link in r.Matches(pagesurl))
                        {
                            string u = link.Groups["url"].Value;
                            // Add returns false for URLs already seen, so the article itself
                            // and earlier continuation pages are skipped.
                            if (seenUrls.Add(u))
                            {
                                cont.AppendLine(getNextPageContent(toAbsoluteUrl(u)));
                            }
                        }
                    }
                }

                cont.AppendLine("--------------------------------------------------------------");
                WriteFile(cont.ToString());
            }
            catch (Exception ex)
            {
                Console.WriteLine(" :{0},{1}", ex.Source, ex.Message);
            }
        }

        // Downloads every collected article; the counter passed on is 1-based.
        private void startGetAll()
        {
            int total = pageUrls.Count;
            for (int i = 0; i < total; i++)
            {
                getContent(toAbsoluteUrl(pageUrls[i]), i + 1, total);
            }
        }

        // Scans one listing page for article links and adds the new ones to pageUrls.
        // pageIndex == -1 scans prefix itself; otherwise pageUrl + pageIndex is scanned.
        private void getPageUrls(int pageIndex)
        {
            string url;
            if (pageIndex == -1)
            {
                url = prefix;
            }
            else
            {
                url = string.Format("{0}{1}", this.pageUrl, pageIndex);
            }

            Console.WriteLine(url);
            try
            {
                string html = downloadPage(url);
                Regex r = new Regex(this.pageUrlsRegex, RegexOptions.Singleline);
                foreach (Match link in r.Matches(html))
                {
                    string temp = link.Groups["url"].Value;
                    if (seenUrls.Add(temp))
                    {
                        pageUrls.Add(temp);
                    }
                }

                Console.WriteLine("{0}:{1} articles.", this.hostName, pageUrls.Count);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }

            Console.WriteLine("{0} end!", this.hostName);
        }
    }
}

참고: 프로젝트->add item->new xml file:app.config
like this:
                 

좋은 웹페이지 즐겨찾기