C# RSS: 기사 본문을 수집해 TXT 파일로 저장하기
35675 단어 txt
숙소에서 휴대폰의 CMNET 신호가 거의 잡히지 않는다면...
그리고 만약, 만약...
자기 전에 오늘의 뉴스를 조용히 훑어보고 싶다면
다음 프로그램은 cnblogs, cnbeta, 넷이즈 심층 보도(focus.news.163.com), 남방주말(infzm.com)의 첫 페이지 기사 본문을 수집해 TXT 파일로 저장합니다. 같은 방식으로 설정을 추가하면 다른 사이트도 수집할 수 있습니다.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.Collections;
using System.Threading;
using System.IO;
using System.Configuration;
namespace RSS
{
/// <summary>
/// Console entry point: configures one <c>GetItem</c> crawler per news site and starts it.
/// </summary>
class Program
{
    /// <summary>
    /// Starts the configured crawlers. Each crawler runs on its own thread and appends
    /// the articles it downloads to a time-stamped text file.
    /// </summary>
    /// <param name="args">
    /// Optional. <c>args[0]</c> is the directory the text files are written to;
    /// when it is missing or empty the original default <c>i://</c> is used.
    /// </param>
    static void Main(string[] args)
    {
        // Output directory: first command-line argument when supplied, otherwise the
        // location the program always used before.
        string file = (args != null && args.Length > 0 && !string.IsNullOrEmpty(args[0]))
            ? args[0]
            : "i://";

        {
            GetItem gi1 = new GetItem();
            gi1.pageUrl = "http://news.cnblogs.com/n/page/";
            gi1.prefix = "http://news.cnblogs.com";
            gi1.pageUrlsRegex = "\"(?<url>/n/[\\d]+?)\"";
            gi1.titleRegex = "<div id=\"news_title\"><a.*?>(?<title>.*?)</a>";
            gi1.timeRegex = "<span class=\"time\">(?<time>.*?)</span>";
            gi1.bodyRegex = "<div id=\"news_body\">(?<body>.*?)</div>";
            gi1.hostName = "CnBlogs";
            gi1.encoding = "utf-8";
            // Path.Combine inserts a separator only when the directory does not already end
            // with one, so the default "i://" yields the same path as before.
            gi1.fileSave = Path.Combine(file, string.Format("{0}_{1:yyMMdd_HH-mm}.txt", gi1.hostName, DateTime.Now));
            Console.WriteLine(gi1.fileSave);
            gi1.pageWantToGet = 20;
            gi1.threadStart();
        }

        // Disabled example: cnbeta (home page only, gb2312).
        //{
        //    GetItem gi2 = new GetItem();
        //    gi2.prefix = "http://www.cnbeta.com/";
        //    gi2.pageUrlsRegex = "\"(?<url>/articles/[\\d]+.htm?)\"";
        //    gi2.titleRegex = "id=\"news_title\">(?<title>.*?)</h3>";
        //    gi2.timeRegex = "id=\"news_author\"><span>(?<time>.*?)[|]";
        //    gi2.bodyRegex = "<div id=\"news_content\">(?<body>.*?)<!-- end newsBox news -->";
        //    gi2.hostName = "CnBeta";
        //    gi2.encoding = "gb2312";
        //    gi2.fileSave = Path.Combine(file, string.Format("{0}_{1:yyMMdd_HH-mm}.txt", gi2.hostName, DateTime.Now));
        //    Console.WriteLine(gi2.fileSave);
        //    gi2.homeOnly = true;
        //    gi2.threadStart();
        //}

        // Disabled example: focus.news.163.com (absolute links, multi-page articles, GBK).
        //{
        //    GetItem gi2 = new GetItem();
        //    gi2.pageUrlsRegex = "\"(?<url>http://focus.news.163.com.[^>< ]*.html?)\"";
        //    gi2.prefix = "http://focus.news.163.com/";
        //    gi2.hasPrefix = false;//default:true
        //    gi2.hasManyPage = true;//default:false
        //    gi2.manyPageRegex = "<span class=\"s1 s3\"> </span>(?<np>.*?) </a>";
        //    gi2.titleRegex = "id=\"h1title\">(?<title>.*?)</h1>";
        //    gi2.timeRegex = "<span class=\"info\">(?<time>.*?) ";
        //    gi2.bodyRegex = "class=\"summary\"(?<body>.*?)<!-- -->";
        //    gi2.hostName = "163";
        //    gi2.encoding = "GBK";
        //    gi2.fileSave = Path.Combine(file, string.Format("{0}_{1:yyMMdd_HH-mm}.txt", gi2.hostName, DateTime.Now));
        //    Console.WriteLine(gi2.fileSave);
        //    gi2.homeOnly = true;
        //    gi2.threadStart();
        //}

        // Disabled example: infzm.com (absolute links, single-page articles, utf-8).
        //{
        //    GetItem gi2 = new GetItem();
        //    gi2.pageUrlsRegex = "\"(?<url>http://www.infzm.com/content/[\\d]+?)\"";
        //    gi2.prefix = "http://www.infzm.com/";
        //    gi2.hasPrefix = false;//default:true
        //    gi2.hasManyPage = false;//default:false
        //    //gi2.manyPageRegex = "<span class=\"s1 s3\"> </span>(?<np>.*?) </a>";
        //    gi2.titleRegex = "<div id=\"detailContent\">[\\s]*<h1>[\\s]*(?<title>.*?)[\\s]*</h1>";
        //    gi2.timeRegex = "<span class=\"pubTime\">(?<time>.*?)</span>";
        //    gi2.bodyRegex = "<div id=\"content-context\">(?<body>.*?)<!--end #text-->";
        //    gi2.hostName = "infzm";
        //    gi2.encoding = "utf-8";
        //    gi2.fileSave = Path.Combine(file, string.Format("{0}_{1:yyMMdd_HH-mm}.txt", gi2.hostName, DateTime.Now));
        //    Console.WriteLine(gi2.fileSave);
        //    gi2.homeOnly = true;
        //    gi2.threadStart();
        //}

        //Console.Read();
    }
}
/// <summary>
/// Crawler for a single news site. It collects article links from one or more listing
/// pages, downloads every article, extracts title / time / body with the configured
/// regular expressions, strips the HTML tags from the body and appends the result to
/// the text file named by <see cref="fileSave"/>.
/// </summary>
class GetItem
{
    /// <summary>Listing URL; the 1-based page number is appended to it (used when <see cref="homeOnly"/> is false).</summary>
    public string pageUrl;
    /// <summary>When true only <see cref="prefix"/> itself is scanned for article links.</summary>
    public bool homeOnly = false;
    /// <summary>When true every collected link is appended to <see cref="prefix"/> before it is downloaded.</summary>
    public bool hasPrefix = true;
    /// <summary>Number of listing pages to scan when <see cref="homeOnly"/> is false.</summary>
    public int pageWantToGet = 1;
    /// <summary>When true, articles are checked for follow-up pages using <see cref="manyPageRegex"/>.</summary>
    public bool hasManyPage = false;
    /// <summary>Regex whose group "np" captures the pagination markup of an article.</summary>
    public string manyPageRegex;
    /// <summary>Site root; a trailing "/" is added by <see cref="threadStart"/> when missing.</summary>
    public string prefix;
    // Article links in discovery order.
    private List<string> pageUrls;
    // Every URL seen so far (article links and follow-up pages); used for O(1) de-duplication.
    private HashSet<string> knownUrls;
    /// <summary>Regex whose group "url" captures an article link.</summary>
    public string pageUrlsRegex;
    /// <summary>Regex whose group "title" captures the article title.</summary>
    public string titleRegex;
    /// <summary>Regex whose group "time" captures the publication time.</summary>
    public string timeRegex;
    /// <summary>Regex whose group "body" captures the article body markup.</summary>
    public string bodyRegex;
    /// <summary>Path of the text file the articles are appended to.</summary>
    public string fileSave;
    /// <summary>Display name of the site, used in console messages.</summary>
    public string hostName;
    /// <summary>Name of the text encoding used to decode downloaded pages (passed to Encoding.GetEncoding).</summary>
    public string encoding;

    // Tag-stripping rules, applied in order. The patterns are the ones the listing has
    // always used; they are compiled once and shared by all crawler threads.
    private static readonly KeyValuePair<Regex, string>[] tagRules = buildTagRules();

    // Named entities (and the characters they decode to) mapped to plain replacements.
    // NOTE(review): the published listing shows the already-decoded characters here, which
    // makes some rules no-ops; presumably the original used the entity names, so both
    // forms are handled.
    private static readonly string[][] textReplacements = new string[][]
    {
        new[] { "&rdquo;", "\"" }, new[] { "\u201D", "\"" },
        new[] { "&ldquo;", "\"" }, new[] { "\u201C", "\"" },
        new[] { "&lsquo;", "'" },  new[] { "\u2018", "'" },
        new[] { "&rsquo;", "'" },  new[] { "\u2019", "'" },
        new[] { "&nbsp;", " " },   new[] { "\u00A0", " " },
        new[] { "&hellip;", "\u2026" },
        new[] { "&ndash;", "-" },  new[] { "\u2013", "-" },
        new[] { "&mdash;", "\u2014" },
    };

    public GetItem()
    {
        pageUrls = new List<string>(50);
        knownUrls = new HashSet<string>();
    }

    /// <summary>
    /// Normalizes <see cref="prefix"/> so it ends with "/" and starts the crawl on a new thread.
    /// </summary>
    public void threadStart()
    {
        // A missing prefix is legal when links are absolute (hasPrefix == false).
        if (!string.IsNullOrEmpty(prefix) && !prefix.EndsWith("/", StringComparison.Ordinal))
        {
            prefix += "/";
        }

        Thread th = new Thread(new ThreadStart(start));
        th.Start();
    }

    // Thread body: collect the article links, then download every article.
    private void start()
    {
        if (homeOnly)
        {
            getPageUrls(-1);
        }
        else
        {
            for (int i = 1; i <= pageWantToGet; i++)
            {
                getPageUrls(i);
            }
        }

        startGetAll();
    }

    // Appends one line of text to fileSave, encoded as gb2312.
    private void WriteFile(string str)
    {
        using (FileStream fs = new FileStream(fileSave, FileMode.Append))
        using (StreamWriter streamWriter = new StreamWriter(fs, Encoding.GetEncoding("gb2312")))
        {
            streamWriter.WriteLine(str);
        }
    }

    // Builds the ordered tag-stripping table used by deleteTag.
    private static KeyValuePair<Regex, string>[] buildTagRules()
    {
        List<KeyValuePair<Regex, string>> rules = new List<KeyValuePair<Regex, string>>();

        // Paragraphs and line breaks are the only tags that produce a line break in the output.
        addRule(rules, "<[\\s]*p[^>]*>?>", "");
        addRule(rules, "</[\\s]*p[\\s]*?>", "\r\n");
        addRule(rules, "<[\\s]*br[\\s]*/[\\s]*[^>]*>?>", "\r\n");
        addRule(rules, "<[\\s]*br[^>]*>?>", "");
        addRule(rules, "</[\\s]*br[^>]*>?>", "\r\n");

        // Every other listed tag is simply removed (opening and closing form).
        string[] removedTags =
        {
            "a", "img", "strong", "div", "b", "span", "script", "li",
            "style", "i", "h3", "h2", "font", "q"
        };
        foreach (string tag in removedTags)
        {
            addRule(rules, "<[\\s]*" + tag + "[\\s]*[^>]*>?>", "");
            addRule(rules, "</[\\s]*" + tag + "[\\s]*[^>]*>?>", "");
        }

        return rules.ToArray();
    }

    private static void addRule(List<KeyValuePair<Regex, string>> rules, string pattern, string replacement)
    {
        // HTML tag names are case-insensitive, so the rules must match upper-case tags too.
        rules.Add(new KeyValuePair<Regex, string>(new Regex(pattern, RegexOptions.IgnoreCase), replacement));
    }

    // Removes the known HTML tags from str (paragraph ends and line breaks become "\r\n")
    // and replaces a few typographic entities/characters with plain equivalents.
    private void deleteTag(ref string str)
    {
        foreach (KeyValuePair<Regex, string> rule in tagRules)
        {
            str = rule.Key.Replace(str, rule.Value);
        }

        foreach (string[] pair in textReplacements)
        {
            str = str.Replace(pair[0], pair[1]);
        }
    }

    // Downloads url and returns the page with all of its lines joined together
    // (line terminators are dropped, exactly as the per-method download loops did).
    // Exceptions propagate to the caller, which reports them.
    private string fetchPage(string url)
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
        req.Method = "GET";
        // Kept from the original requests.
        req.ContentType = "text/html;charset=utf-8";

        using (HttpWebResponse wr = (HttpWebResponse)req.GetResponse())
        using (Stream respStream = wr.GetResponseStream())
        using (StreamReader reader = new StreamReader(respStream, Encoding.GetEncoding(this.encoding)))
        {
            StringBuilder sb = new StringBuilder();
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                sb.Append(line);
            }
            return sb.ToString();
        }
    }

    // Turns a collected link into the URL to download: with hasPrefix the link is appended
    // to prefix (avoiding a doubled "/"), otherwise it is used as is.
    private string resolveUrl(string link)
    {
        if (!hasPrefix)
        {
            return link;
        }

        string relative = link.StartsWith("/", StringComparison.Ordinal) ? link.Substring(1) : link;
        return string.Format("{0}{1}", prefix, relative);
    }

    // Downloads a follow-up page of a multi-page article and returns its cleaned body,
    // or "" when the download fails or the body pattern does not match.
    private string getNextPageContent(string url)
    {
        Console.WriteLine(url);
        try
        {
            string page = fetchPage(url);
            Match m = Regex.Match(page, this.bodyRegex, RegexOptions.Singleline);
            if (m.Success)
            {
                string body = m.Groups["body"].Value;
                deleteTag(ref body);
                Console.WriteLine(" ");
                return body;
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine(" :{0}", ex.Message);
        }

        return "";
    }

    // Downloads one article, extracts title, time and body (plus follow-up pages when
    // hasManyPage is set) and appends the text to the output file. index/total are only
    // used for the "(index/total)" progress line. Failures are reported on the console.
    private void getContent(string url, int index, int total)
    {
        Console.WriteLine(url);
        try
        {
            string page = fetchPage(url);
            StringBuilder cont = new StringBuilder();

            Match m = Regex.Match(page, this.titleRegex, RegexOptions.Singleline);
            if (m.Success)
            {
                Console.WriteLine("title:{0}", m.Groups["title"].Value);
                cont.AppendLine(m.Groups["title"].Value);
            }

            cont.AppendLine(string.Format("({0}/{1}){2}", index, total, url));

            m = Regex.Match(page, this.timeRegex, RegexOptions.Singleline);
            if (m.Success)
            {
                Console.WriteLine("time:{0}", m.Groups["time"].Value);
                cont.AppendLine(m.Groups["time"].Value);
            }

            m = Regex.Match(page, this.bodyRegex, RegexOptions.Singleline);
            if (m.Success)
            {
                string body = m.Groups["body"].Value;
                deleteTag(ref body);
                Console.WriteLine(" ");
                cont.AppendLine(body);
            }

            if (hasManyPage)
            {
                Match pager = Regex.Match(page, this.manyPageRegex, RegexOptions.Singleline);
                if (pager.Success)
                {
                    Console.WriteLine(" ..");
                    string pagerMarkup = pager.Groups["np"].Value;
                    foreach (Match link in Regex.Matches(pagerMarkup, this.pageUrlsRegex, RegexOptions.Singleline))
                    {
                        string u = link.Groups["url"].Value;
                        // Follow-up pages are only remembered in knownUrls. Adding them to
                        // pageUrls would make startGetAll download and write them a second
                        // time as separate articles.
                        if (knownUrls.Add(u))
                        {
                            cont.AppendLine(getNextPageContent(resolveUrl(u)));
                        }
                    }
                }
            }

            cont.AppendLine("--------------------------------------------------------------");
            WriteFile(cont.ToString());
        }
        catch (Exception ex)
        {
            Console.WriteLine(" :{0},{1}", ex.Source, ex.Message);
        }
    }

    // Downloads every collected article in discovery order.
    private void startGetAll()
    {
        int total = pageUrls.Count;
        for (int i = 0; i < total; i++)
        {
            // Progress is reported 1-based: (1/total) .. (total/total).
            getContent(resolveUrl(pageUrls[i]), i + 1, total);
        }
    }

    // Scans one listing page for article links and adds the new ones to pageUrls.
    // pageIndex == -1 scans prefix itself; any other value scans pageUrl + pageIndex.
    private void getPageUrls(int pageIndex)
    {
        string url = pageIndex == -1
            ? prefix
            : string.Format("{0}{1}", this.pageUrl, pageIndex);
        Console.WriteLine(url);

        try
        {
            string page = fetchPage(url);
            foreach (Match match in Regex.Matches(page, this.pageUrlsRegex, RegexOptions.Singleline))
            {
                string link = match.Groups["url"].Value;
                if (knownUrls.Add(link))
                {
                    pageUrls.Add(link);
                }
            }
            Console.WriteLine("{0}:{1} articles.", this.hostName, pageUrls.Count);
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
        }

        Console.WriteLine("{0} end!", this.hostName);
    }
}
}
참고: 프로젝트 -> Add Item -> New XML File을 선택해 app.config 파일을 추가합니다.
like this:
이 내용에 흥미가 있습니까?
현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:
Java 텍스트 파일(.txt .csv)로부터의 입력 java.io.FileReader➊FileReader fr = new FileReader("파일 이름"); ➋Buffered br = BufferedReader(br); ➌String rec; rec = br.readLine(); ➍ fr.clos...
텍스트를 자유롭게 공유하거나 복사할 수 있습니다. 하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.
CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.