HtmlHelper.cs 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. using Admin.Core.Common.Attributes;
  2. using System;
  3. using System.IO;
  4. using System.Net;
  5. using System.Text;
  6. using System.Text.RegularExpressions;
  7. using System.Threading;
  8. namespace Admin.Core.Common.Helpers
  9. {
  /// <summary>
  /// Helpers for downloading pages over HTTP and for stripping, extracting and
  /// compacting HTML text.
  /// </summary>
  /// <remarks>
  /// The download methods never throw for request failures: <see cref="GetHtml(string, CookieContainer)"/>
  /// and its POST overload return an empty string, <see cref="GetStream"/> returns <c>null</c>, and
  /// <see cref="Get_Http"/> / <see cref="Post_Http"/> return the exception message.
  /// Retry bookkeeping is kept in local variables, so concurrent calls do not share a retry counter.
  /// </remarks>
  [SingleInstance]
  public class HtmlHelper
  {
      #region Private fields

      // Headers sent with every request built by CreateRequest.
      private const string RequestContentType = "application/json";
      private const string RequestAccept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";
      private const string RequestUserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";

      // One shared generator: System.Random is not thread-safe, so access is serialized through JitterGate.
      private static readonly Random Jitter = new Random();
      private static readonly object JitterGate = new object();

      // Ordered rewrite table used by NoHTML; the order matters (tags first, then entities).
      private static readonly Rewrite[] NoHtmlRewrites =
      {
          // Script blocks, including their content. Singleline lets '.' cross line breaks.
          new Rewrite(@"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline),
          // Remaining tags.
          new Rewrite("<.+?>", ""),
          new Rewrite(@"<(.[^>]*)>", ""),
          // A line break followed by further whitespace.
          new Rewrite(@"([\r\n])[\s]+", ""),
          // Comment leftovers.
          new Rewrite(@"-->", ""),
          new Rewrite(@"<!--.*", ""),
          // Common character entities.
          new Rewrite(@"&(quot|#34);", "\""),
          new Rewrite(@"&(amp|#38);", "&"),
          new Rewrite(@"&(lt|#60);", "<"),
          new Rewrite(@"&(gt|#62);", ">"),
          new Rewrite(@"&(nbsp|#160);", " "),
          new Rewrite(@"&(iexcl|#161);", "\u00A1"),
          new Rewrite(@"&(cent|#162);", "\u00A2"),
          new Rewrite(@"&(pound|#163);", "\u00A3"),
          new Rewrite(@"&(copy|#169);", "\u00A9"),
          // Any other numeric entity is dropped.
          new Rewrite(@"&#(\d+);", ""),
      };

      private static readonly Regex HrefPattern = new Regex(@"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)[\S]*");
      private static readonly Regex ImgTagPattern = new Regex(@"<img.+?>");
      private static readonly Regex ImgSrcPattern = new Regex(@"src=.+\.(bmp|jpg|gif|png|)");
      private static readonly Regex BetweenTagsPattern = new Regex(@">\s+?<");
      private static readonly Regex LineBreakPattern = new Regex(@"\r\n\s*");
      private static readonly Regex BodyPattern = new Regex(@"<body([\s|\S]*?)>([\s|\S]*?)</body>", RegexOptions.IgnoreCase);

      // Substrings that make GetImg treat a source as already absolute.
      private static readonly string[] AbsoluteSourceMarkers = { ".net", ".com", ".org", ".cn", ".cc", ".info", ".biz", ".tv" };

      // Lower bound, in milliseconds, of the randomized delay returned by NetworkDelay.
      private int _delay = 1000;

      #endregion Private fields

      #region Public properties

      /// <summary>
      /// A cookie container owned by this instance. The request methods use the container
      /// passed to them as an argument, not this one, unless the caller passes it explicitly.
      /// </summary>
      public CookieContainer CookieContainer { get; } = new CookieContainer();

      /// <summary>
      /// Text encoding used to decode response bodies in the <c>GetHtml</c> overloads. Defaults to UTF-8.
      /// </summary>
      public Encoding Encoding { get; set; } = Encoding.GetEncoding("utf-8");

      /// <summary>
      /// Gets a random delay in milliseconds between the configured base delay (inclusive) and
      /// twice that value (exclusive); sets the base delay. Negative values are treated as 0.
      /// </summary>
      public int NetworkDelay
      {
          get
          {
              int lower = _delay;
              // Guard the doubling against integer overflow for very large base delays.
              int upper = lower > int.MaxValue / 2 ? int.MaxValue : lower * 2;
              lock (JitterGate)
              {
                  return Jitter.Next(lower, upper);
              }
          }
          set
          {
              // Random.Next and Thread.Sleep both reject negative values.
              _delay = Math.Max(0, value);
          }
      }

      /// <summary>
      /// Maximum number of retries after a failed request. The same value is also applied as the
      /// connection limit of the request's service point when it is positive.
      /// </summary>
      public int MaxTry { get; set; } = 300;

      #endregion Public properties

      #region Fetch HTML

      /// <summary>
      /// Downloads a page as text, sending <paramref name="postData"/> as the request body.
      /// </summary>
      /// <param name="url">Address to request.</param>
      /// <param name="postData">Request body. When null or empty a plain GET is issued.</param>
      /// <param name="isPost">
      /// <c>true</c> to POST <paramref name="postData"/>. When <c>false</c> a plain GET is issued and
      /// <paramref name="postData"/> is ignored, because a GET request cannot carry a body.
      /// </param>
      /// <param name="cookieContainer">Cookie container attached to the request.</param>
      /// <returns>The response body, or an empty string when every attempt failed.</returns>
      public string GetHtml(string url, string postData, bool isPost, CookieContainer cookieContainer)
      {
          if (string.IsNullOrEmpty(postData) || !isPost)
          {
              return GetHtml(url, cookieContainer);
          }

          byte[] payload = Encoding.Default.GetBytes(postData);
          return ExecuteWithRetry(() =>
          {
              HttpWebRequest request = CreateRequest(url, cookieContainer, "POST");
              request.ContentLength = payload.Length;
              using (Stream body = request.GetRequestStream())
              {
                  body.Write(payload, 0, payload.Length);
              }
              return ReadResponseText(request);
          }, string.Empty, true);
      }

      /// <summary>
      /// Downloads a page as text with a GET request.
      /// </summary>
      /// <param name="url">Address to request.</param>
      /// <param name="cookieContainer">Cookie container attached to the request.</param>
      /// <returns>The response body, or an empty string when every attempt failed.</returns>
      public string GetHtml(string url, CookieContainer cookieContainer)
      {
          return ExecuteWithRetry(
              () => ReadResponseText(CreateRequest(url, cookieContainer, "GET")),
              string.Empty,
              true);
      }

      #endregion Fetch HTML

      #region Fetch stream

      //---------------------------------------------------------------------------------------------------------------
      // Example:
      // System.Net.CookieContainer cookie = new System.Net.CookieContainer();
      // Stream s = HttpHelper.GetStream("http://ptlogin2.qq.com/getimage?aid=15000102&0.43878429697395826", cookie);
      // picVerify.Image = Image.FromStream(s);
      //---------------------------------------------------------------------------------------------------------------

      /// <summary>
      /// Requests <paramref name="url"/> with GET and returns the raw response stream.
      /// </summary>
      /// <param name="url">Address to request.</param>
      /// <param name="cookieContainer">Cookie container attached to the request.</param>
      /// <returns>
      /// The response stream, which the caller must dispose, or <c>null</c> when every attempt failed.
      /// </returns>
      public Stream GetStream(string url, CookieContainer cookieContainer)
      {
          return ExecuteWithRetry<Stream>(() =>
          {
              HttpWebRequest request = CreateRequest(url, cookieContainer, "GET");
              var response = (HttpWebResponse)request.GetResponse();
              try
              {
                  return response.GetResponseStream();
              }
              catch
              {
                  // The stream was never handed out, so the response must be released here.
                  response.Close();
                  throw;
              }
          }, null, false);
      }

      #endregion Fetch stream

      #region Strip HTML markup

      /// <summary>
      /// Removes script blocks and tags, decodes a small set of character entities, and strips
      /// any remaining angle brackets and CR/LF pairs.
      /// </summary>
      /// <param name="Htmlstring">HTML text to clean. Null yields an empty string.</param>
      /// <returns>The text with markup removed.</returns>
      public string NoHTML(string Htmlstring)
      {
          if (string.IsNullOrEmpty(Htmlstring))
          {
              return string.Empty;
          }

          string text = Htmlstring;
          foreach (Rewrite rewrite in NoHtmlRewrites)
          {
              text = rewrite.Apply(text);
          }

          // Strings are immutable: the results must be kept, otherwise decoded "&lt;" / "&gt;"
          // entities would put live markup back into the output.
          return text.Replace("<", "").Replace(">", "").Replace("\r\n", "");
      }

      #endregion Strip HTML markup

      #region Strip tags from text

      /// <summary>
      /// Removes every <c>&lt;...&gt;</c> span, then all line breaks, <c>&amp;nbsp;</c> entities and spaces.
      /// </summary>
      /// <param name="InString">Text that may contain HTML tags. Null yields an empty string.</param>
      /// <returns>The compacted text. An unclosed <c>&lt;</c> and everything after it is kept.</returns>
      public string DelHtmlCode(string InString)
      {
          if (string.IsNullOrEmpty(InString))
          {
              return string.Empty;
          }

          var kept = new StringBuilder(InString.Length);
          int position = 0;
          while (position < InString.Length)
          {
              int open = InString.IndexOf('<', position);
              if (open < 0)
              {
                  break;
              }

              // Look for the closing bracket only after the opening one, so a stray '>' that
              // appears earlier in the text cannot produce a negative removal length.
              int close = InString.IndexOf('>', open + 1);
              if (close < 0)
              {
                  break;
              }

              kept.Append(InString, position, open - position);
              position = close + 1;
          }
          kept.Append(InString, position, InString.Length - position);

          return kept.ToString()
              .Replace("\n", "")
              .Replace("\r", "")
              .Replace("&nbsp;", "")
              .Replace(" ", "")
              .Trim();
      }

      #endregion Strip tags from text

      #region Match page links

      /// <summary>
      /// Collects the <c>href</c> attribute matches found in <paramref name="HtmlCode"/>.
      /// </summary>
      /// <param name="HtmlCode">HTML text to scan. Null yields an empty string.</param>
      /// <returns>
      /// The lower-cased matches with the literal <c>href=</c> removed, each followed by <c>|</c>.
      /// A match runs up to the next whitespace character, so it can include the opening quote
      /// and any markup that follows the attribute value without a space.
      /// </returns>
      public string GetHref(string HtmlCode)
      {
          if (string.IsNullOrEmpty(HtmlCode))
          {
              return string.Empty;
          }

          var links = new StringBuilder();
          foreach (Match match in HrefPattern.Matches(HtmlCode))
          {
              links.Append(match.Value.ToLower().Replace("href=", "").Trim()).Append('|');
          }
          return links.ToString();
      }

      #endregion Match page links

      #region Match page image addresses

      /// <summary>
      /// Finds every <c>&lt;img&gt;</c> tag in <paramref name="HtmlCode"/> and resolves its source with <see cref="GetImg"/>.
      /// </summary>
      /// <param name="HtmlCode">HTML text to scan; it is lower-cased before matching. Null yields an empty string.</param>
      /// <param name="imgHttp">Prefix prepended to sources that do not look absolute.</param>
      /// <returns>The resolved sources, each followed by <c>|</c>.</returns>
      public string GetImgSrc(string HtmlCode, string imgHttp)
      {
          if (string.IsNullOrEmpty(HtmlCode))
          {
              return string.Empty;
          }

          var sources = new StringBuilder();
          foreach (Match tag in ImgTagPattern.Matches(HtmlCode.ToLower()))
          {
              sources.Append(GetImg(tag.Value.Trim(), imgHttp)).Append('|');
          }
          return sources.ToString();
      }

      /// <summary>
      /// Extracts the image source from a single <c>&lt;img&gt;</c> tag.
      /// </summary>
      /// <param name="ImgString">The tag text; it is lower-cased before matching.</param>
      /// <param name="imgHttp">Prefix prepended when the source does not contain a known domain suffix.</param>
      /// <returns>
      /// The matched source when it contains one of ".net", ".com", ".org", ".cn", ".cc", ".info",
      /// ".biz" or ".tv"; otherwise <paramref name="imgHttp"/> followed by the matched source.
      /// Quote characters around the attribute value are not removed.
      /// </returns>
      public string GetImg(string ImgString, string imgHttp)
      {
          string tag = (ImgString ?? string.Empty).ToLower();

          var source = new StringBuilder();
          foreach (Match match in ImgSrcPattern.Matches(tag))
          {
              source.Append(match.Value.Trim().Replace("src=", ""));
          }

          string path = source.ToString();
          foreach (string marker in AbsoluteSourceMarkers)
          {
              if (path.IndexOf(marker, StringComparison.Ordinal) >= 0)
              {
                  return path;
              }
          }
          return imgHttp + path;
      }

      #endregion Match page image addresses

      #region Fetch remote page content

      /// <summary>
      /// Downloads a page with GET and returns its text, decoded with <see cref="System.Text.Encoding.Default"/>.
      /// </summary>
      /// <param name="tUrl">Address to request.</param>
      /// <returns>
      /// The response text with every line terminated by CR LF. On failure the exception message
      /// is returned instead, so callers cannot distinguish an error from page content by type.
      /// </returns>
      public string Get_Http(string tUrl)
      {
          try
          {
              var request = (HttpWebRequest)WebRequest.Create(tUrl);
              request.Timeout = 19600;
              using (var response = (HttpWebResponse)request.GetResponse())
              using (Stream stream = response.GetResponseStream())
              using (var reader = new StreamReader(stream, Encoding.Default))
              {
                  var text = new StringBuilder();
                  string line;
                  // ReadLine returns null only at the end of the stream, which is the reliable
                  // end-of-data test for a network stream.
                  while ((line = reader.ReadLine()) != null)
                  {
                      text.Append(line).Append("\r\n");
                  }
                  return text.ToString();
              }
          }
          catch (Exception ex)
          {
              return ex.Message;
          }
      }

      /// <summary>
      /// Posts form data and returns the response text, decoded with <see cref="System.Text.Encoding.Default"/>.
      /// </summary>
      /// <param name="url">Address to request.</param>
      /// <param name="postData">Form body, sent as <c>application/x-www-form-urlencoded</c>.</param>
      /// <param name="encodeType">Name of the encoding used to encode <paramref name="postData"/>.</param>
      /// <returns>The response text, or the exception message when the request failed.</returns>
      public string Post_Http(string url, string postData, string encodeType)
      {
          try
          {
              Encoding requestEncoding = Encoding.GetEncoding(encodeType);
              byte[] payload = requestEncoding.GetBytes(postData);

              var request = (HttpWebRequest)WebRequest.Create(url);
              request.Method = "POST";
              request.ContentType = "application/x-www-form-urlencoded";
              request.ContentLength = payload.Length;
              using (Stream body = request.GetRequestStream())
              {
                  body.Write(payload, 0, payload.Length);
              }

              using (var response = (HttpWebResponse)request.GetResponse())
              using (var reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
              {
                  return reader.ReadToEnd();
              }
          }
          catch (Exception ex)
          {
              return ex.Message;
          }
      }

      #endregion Fetch remote page content

      #region Compact HTML output

      /// <summary>
      /// Compacts HTML by removing whitespace between tags and CR LF line breaks together with
      /// the whitespace that follows them; the <c>body</c> tag names are rewritten in lower case.
      /// </summary>
      /// <param name="Html">HTML text to compact. Null yields an empty string.</param>
      /// <returns>The compacted HTML.</returns>
      public string ZipHtml(string Html)
      {
          if (string.IsNullOrEmpty(Html))
          {
              return string.Empty;
          }

          string compact = BetweenTagsPattern.Replace(Html, "><");
          compact = LineBreakPattern.Replace(compact, "");
          return BodyPattern.Replace(compact, @"<body$1>$2</body>");
      }

      #endregion Compact HTML output

      #region Filter a given HTML tag

      /// <summary>
      /// Removes the opening and closing tags named by <paramref name="html_Str"/>, keeping their content.
      /// </summary>
      /// <param name="s_TextStr">Text to filter. Null or empty yields an empty string.</param>
      /// <param name="html_Str">
      /// Tag name such as <c>a</c>, <c>img</c>, <c>p</c> or <c>div</c>. It is inserted into a regular
      /// expression unescaped. When null or empty, every <c>&lt;...&gt;</c> span is removed.
      /// </param>
      /// <returns>The filtered text.</returns>
      public string DelHtml(string s_TextStr, string html_Str)
      {
          if (string.IsNullOrEmpty(s_TextStr))
          {
              return string.Empty;
          }

          string openName = string.Empty;
          string closeName = string.Empty;
          if (!string.IsNullOrEmpty(html_Str))
          {
              // The negative lookahead stops "a" from also matching <abbr>, <article>, ...
              openName = "(?:" + html_Str + @")(?![\w-])";
              closeName = "(?:" + html_Str + ")";
          }

          string filtered = Regex.Replace(s_TextStr, "<" + openName + "[^>]*>", "", RegexOptions.IgnoreCase);
          return Regex.Replace(filtered, "</" + closeName + ">", "", RegexOptions.IgnoreCase);
      }

      #endregion Filter a given HTML tag

      #region Private helpers

      /// <summary>Builds a request carrying the shared headers used by this helper.</summary>
      private HttpWebRequest CreateRequest(string url, CookieContainer cookieContainer, string method)
      {
          var request = (HttpWebRequest)WebRequest.Create(url);
          request.CookieContainer = cookieContainer;
          request.ContentType = RequestContentType;
          if (MaxTry > 0)
          {
              // ConnectionLimit rejects values below 1.
              request.ServicePoint.ConnectionLimit = MaxTry;
          }
          request.Referer = url;
          request.Accept = RequestAccept;
          request.UserAgent = RequestUserAgent;
          request.Method = method;
          return request;
      }

      /// <summary>Sends the request and reads the whole response body using <see cref="Encoding"/>.</summary>
      private string ReadResponseText(HttpWebRequest request)
      {
          using (var response = (HttpWebResponse)request.GetResponse())
          using (Stream stream = response.GetResponseStream())
          using (var reader = new StreamReader(stream, Encoding))
          {
              return reader.ReadToEnd();
          }
      }

      /// <summary>
      /// Runs <paramref name="attempt"/> until it succeeds, retrying at most <see cref="MaxTry"/> times
      /// and sleeping for <see cref="NetworkDelay"/> milliseconds before each retry (and before the
      /// first attempt when <paramref name="delayFirstAttempt"/> is set). Returns
      /// <paramref name="fallback"/> when all attempts fail or the failure cannot be cured by retrying.
      /// </summary>
      private T ExecuteWithRetry<T>(Func<T> attempt, T fallback, bool delayFirstAttempt)
      {
          int retriesLeft = Math.Max(0, MaxTry);
          bool pause = delayFirstAttempt;
          while (true)
          {
              if (pause)
              {
                  Thread.Sleep(NetworkDelay);
              }
              pause = true;

              try
              {
                  return attempt();
              }
              catch (Exception ex)
              {
                  if (!IsRetryable(ex) || retriesLeft <= 0)
                  {
                      return fallback;
                  }
                  retriesLeft--;
              }
          }
      }

      /// <summary>
      /// Returns false for failures caused by the arguments themselves (malformed or unsupported
      /// URL, invalid request set-up), which would fail identically on every retry.
      /// </summary>
      private static bool IsRetryable(Exception ex)
      {
          return !(ex is ArgumentException
                   || ex is FormatException
                   || ex is NotSupportedException
                   || ex is InvalidCastException
                   || ex is ProtocolViolationException);
      }

      /// <summary>A pre-built regular expression paired with its replacement text.</summary>
      private sealed class Rewrite
      {
          private readonly Regex _pattern;
          private readonly string _replacement;

          public Rewrite(string pattern, string replacement, RegexOptions options = RegexOptions.IgnoreCase)
          {
              _pattern = new Regex(pattern, options);
              _replacement = replacement;
          }

          public string Apply(string input)
          {
              return _pattern.Replace(input, _replacement);
          }
      }

      #endregion Private helpers
  }
  390. }