0
0

HtmlHelper.cs 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. using System;
  2. using System.Text;
  3. using System.Net;
  4. using System.IO;
  5. using System.Threading;
  6. using System.Text.RegularExpressions;
  7. using Admin.Core.Common.Attributes;
  8. namespace Admin.Core.Common.Helpers
  9. {
  10. /// <summary>
  11. /// Html操作相关类
  12. /// </summary>
  13. [SingleInstance]
  14. public class HtmlHelper
  15. {
  #region Private fields
  // Content-Type header sent by the GetHtml/GetStream request helpers.
  private readonly string _ContentType = "application/json";
  // Accept header sent by the GetHtml/GetStream request helpers.
  private readonly string _Accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";
  // User-Agent header sent by the GetHtml/GetStream request helpers.
  private readonly string _UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
  // Lower bound of the randomized pause returned by NetworkDelay (passed to Thread.Sleep, so milliseconds).
  private int _Delay = 1000;
  // Attempt counter used by the retrying request methods.
  private int _CurrentTry = 0;
  // A single generator is shared by all calls: creating a new Random per call can yield
  // identical values when calls happen close together. Random is not thread-safe, hence the lock.
  private static readonly Random _Random = new Random();
  private static readonly object _RandomLock = new object();
  #endregion
  #region Public properties
  /// <summary>
  /// Cookie container created together with this helper.
  /// </summary>
  public CookieContainer CookieContainer { get; } = new CookieContainer();
  /// <summary>
  /// Encoding used to decode response bodies read by the GetHtml methods. Defaults to UTF-8.
  /// </summary>
  public Encoding Encoding { get; set; } = Encoding.GetEncoding("utf-8");
  /// <summary>
  /// Randomized pause used before requests. Reading returns a value in [delay, 2 * delay);
  /// writing sets the base delay. A negative base delay is treated as zero.
  /// </summary>
  public int NetworkDelay
  {
      get
      {
          // Clamp so Random.Next never receives min > max and Thread.Sleep never sees a negative value.
          int lower = Math.Max(0, _Delay);
          // Guard the doubling against integer overflow for very large delays.
          int upper = lower > int.MaxValue / 2 ? int.MaxValue : lower * 2;
          lock (_RandomLock)
          {
              return _Random.Next(lower, upper);
          }
      }
      set
      {
          _Delay = value;
      }
  }
  /// <summary>
  /// Retry limit used by the request methods; they also assign it to ServicePoint.ConnectionLimit.
  /// </summary>
  public int MaxTry { get; set; } = 300;
  #endregion
  46. #region 获取HTML
  /// <summary>
  /// Posts <paramref name="postData"/> to <paramref name="url"/> and returns the response body as text.
  /// </summary>
  /// <remarks>
  /// Each attempt is preceded by a pause of <see cref="NetworkDelay"/>. Network failures are retried
  /// up to <see cref="MaxTry"/> times and the text of the first successful response is returned.
  /// Request failures are reported as an empty string rather than an exception.
  /// </remarks>
  /// <param name="url">Address to request.</param>
  /// <param name="postData">Request body. When null or empty, a plain GET is issued instead.</param>
  /// <param name="isPost">
  /// True to send the body with POST. When false a plain GET is issued and the body is not sent,
  /// because HttpWebRequest refuses a request body on GET.
  /// </param>
  /// <param name="cookieContainer">Cookie container attached to the request.</param>
  /// <returns>The response text decoded with <see cref="Encoding"/>, or an empty string on failure.</returns>
  public string GetHtml(string url, string postData, bool isPost, CookieContainer cookieContainer)
  {
      // Previously a GET with a body threw on every attempt and consumed the whole retry budget.
      if (string.IsNullOrEmpty(postData) || !isPost)
      {
          return GetHtml(url, cookieContainer);
      }

      // NOTE(review): the body is encoded with the platform default encoding, as before,
      // while the response is decoded with the Encoding property - confirm this is intended.
      byte[] requestBody = Encoding.Default.GetBytes(postData);

      // One initial attempt plus MaxTry retries, counted locally so concurrent calls do not interfere.
      for (int remaining = Math.Max(0, MaxTry); remaining >= 0; remaining--)
      {
          Thread.Sleep(NetworkDelay);
          try
          {
              HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
              request.CookieContainer = cookieContainer;
              request.ContentType = _ContentType;
              request.ServicePoint.ConnectionLimit = MaxTry;
              request.Referer = url;
              request.Accept = _Accept;
              request.UserAgent = _UserAgent;
              request.Method = "POST";
              request.ContentLength = requestBody.Length;

              using (Stream requestStream = request.GetRequestStream())
              {
                  requestStream.Write(requestBody, 0, requestBody.Length);
              }

              // using blocks release the connection even when reading fails half-way.
              using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
              using (Stream responseStream = response.GetResponseStream())
              using (StreamReader reader = new StreamReader(responseStream, Encoding))
              {
                  return reader.ReadToEnd();
              }
          }
          catch (Exception ex) when (ex is WebException || ex is IOException)
          {
              // Network-level failure: fall through to the next attempt.
          }
          catch (Exception)
          {
              // Anything else (malformed URL, invalid settings) cannot be cured by retrying.
              return string.Empty;
          }
      }

      return string.Empty;
  }
  /// <summary>
  /// Requests <paramref name="url"/> with GET and returns the response body as text.
  /// </summary>
  /// <remarks>
  /// Each attempt is preceded by a pause of <see cref="NetworkDelay"/>. Network failures are retried
  /// up to <see cref="MaxTry"/> times and the text of the first successful response is returned.
  /// Request failures are reported as an empty string rather than an exception.
  /// </remarks>
  /// <param name="url">Address to request.</param>
  /// <param name="cookieContainer">Cookie container attached to the request.</param>
  /// <returns>The response text decoded with <see cref="Encoding"/>, or an empty string on failure.</returns>
  public string GetHtml(string url, CookieContainer cookieContainer)
  {
      // One initial attempt plus MaxTry retries, counted locally so concurrent calls do not interfere.
      for (int remaining = Math.Max(0, MaxTry); remaining >= 0; remaining--)
      {
          Thread.Sleep(NetworkDelay);
          try
          {
              HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
              request.CookieContainer = cookieContainer;
              request.ContentType = _ContentType;
              request.ServicePoint.ConnectionLimit = MaxTry;
              request.Referer = url;
              request.Accept = _Accept;
              request.UserAgent = _UserAgent;
              request.Method = "GET";

              // using blocks release the connection even when reading fails half-way.
              using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
              using (Stream responseStream = response.GetResponseStream())
              using (StreamReader reader = new StreamReader(responseStream, Encoding))
              {
                  // Returning here hands the retried result to the caller; the old recursive
                  // retry discarded it and always returned an empty string.
                  return reader.ReadToEnd();
              }
          }
          catch (Exception ex) when (ex is WebException || ex is IOException)
          {
              // Network-level failure: fall through to the next attempt.
          }
          catch (Exception)
          {
              // Anything else (malformed URL, invalid settings) cannot be cured by retrying.
              return string.Empty;
          }
      }

      return string.Empty;
  }
  137. #endregion
  138. #region 获取字符流
  //---------------------------------------------------------------------------------------------------------------
  // Example (htmlHelper is an HtmlHelper instance):
  // System.Net.CookieContainer cookie = new System.Net.CookieContainer();
  // Stream s = htmlHelper.GetStream("http://ptlogin2.qq.com/getimage?aid=15000102&0.43878429697395826", cookie);
  // picVerify.Image = Image.FromStream(s);
  //---------------------------------------------------------------------------------------------------------------
  /// <summary>
  /// Requests <paramref name="url"/> with GET and returns the raw response stream.
  /// </summary>
  /// <remarks>
  /// Network failures are retried up to <see cref="MaxTry"/> times, pausing for
  /// <see cref="NetworkDelay"/> between attempts. The caller owns the returned stream and should
  /// dispose it when finished.
  /// </remarks>
  /// <param name="url">Address to request.</param>
  /// <param name="cookieContainer">Cookie container attached to the request.</param>
  /// <returns>The response stream, or null when the request could not be completed.</returns>
  public Stream GetStream(string url, CookieContainer cookieContainer)
  {
      // One initial attempt plus MaxTry retries, counted locally so concurrent calls do not interfere.
      for (int remaining = Math.Max(0, MaxTry); remaining >= 0; remaining--)
      {
          HttpWebResponse response = null;
          try
          {
              HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
              request.CookieContainer = cookieContainer;
              request.ContentType = _ContentType;
              request.ServicePoint.ConnectionLimit = MaxTry;
              request.Referer = url;
              request.Accept = _Accept;
              request.UserAgent = _UserAgent;
              request.Method = "GET";
              response = (HttpWebResponse)request.GetResponse();
              return response.GetResponseStream();
          }
          catch (Exception ex) when (ex is WebException || ex is IOException)
          {
              // The old code retried through GetHtml and threw its result away, so a failed
              // first attempt always produced null. Retry the stream request itself instead.
              response?.Close();
              if (remaining > 0)
              {
                  Thread.Sleep(NetworkDelay);
              }
          }
          catch (Exception)
          {
              // Anything else (malformed URL, invalid settings) cannot be cured by retrying.
              response?.Close();
              return null;
          }
      }

      return null;
  }
  187. #endregion
  188. #region 清除HTML标记
  /// <summary>
  /// Strips HTML markup from a string and decodes a handful of common entities.
  /// </summary>
  /// <remarks>
  /// Script blocks are removed together with their content, including scripts that span several
  /// lines. Any angle bracket left after tag removal and entity decoding is dropped as well, so
  /// the result contains no '&lt;' or '&gt;' characters.
  /// </remarks>
  /// <param name="Htmlstring">HTML text to clean; null is treated as empty.</param>
  /// <returns>The text with markup removed.</returns>
  public string NoHTML(string Htmlstring)
  {
      if (string.IsNullOrEmpty(Htmlstring))
      {
          return string.Empty;
      }

      // Remove scripts. Singleline lets '.' cross line breaks so multi-line scripts are caught.
      Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
      // Remove tags.
      Htmlstring = Regex.Replace(Htmlstring, "<.+?>", "", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
      // Remove line breaks together with the whitespace that follows them.
      Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
      // Remove comment remnants.
      Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
      // Decode common entities.
      Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00A1", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00A2", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00A3", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00A9", RegexOptions.IgnoreCase);
      // Drop any remaining numeric entity.
      Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);
      // Strings are immutable: the results must be assigned back (they were previously discarded).
      Htmlstring = Htmlstring.Replace("<", "");
      Htmlstring = Htmlstring.Replace(">", "");
      Htmlstring = Htmlstring.Replace("\r\n", "");
      return Htmlstring;
  }
  220. #endregion
  221. #region 删除文本中带的HTML标记
  /// <summary>
  /// Removes every "&lt;...&gt;" span from a string, then removes line breaks, "&amp;nbsp;" and all spaces.
  /// </summary>
  /// <param name="InString">Text that may contain HTML tags; null is treated as empty.</param>
  /// <returns>The text without tags, line breaks or spaces.</returns>
  public string DelHtmlCode(string InString)
  {
      if (string.IsNullOrEmpty(InString))
      {
          return string.Empty;
      }

      StringBuilder text = new StringBuilder(InString.Length);
      int position = 0;
      while (position < InString.Length)
      {
          int tagStart = InString.IndexOf('<', position);
          if (tagStart < 0)
          {
              break;
          }

          // Look for the closing bracket only after the opening one. Pairing the first '<' with
          // the first '>' anywhere in the string threw when a stray '>' came first.
          int tagEnd = InString.IndexOf('>', tagStart + 1);
          if (tagEnd < 0)
          {
              break; // unterminated tag: keep the rest of the text as it is
          }

          text.Append(InString, position, tagStart - position);
          position = tagEnd + 1;
      }
      text.Append(InString, position, InString.Length - position);

      text.Replace("\n", "");
      text.Replace("\r", "");
      text.Replace("&nbsp;", "");
      // All spaces are removed, including those between words (existing behavior).
      text.Replace(" ", "");
      return text.ToString().Trim();
  }
  246. #endregion
  247. #region 匹配页面的链接
  /// <summary>
  /// Collects the href attributes found in a piece of HTML.
  /// </summary>
  /// <remarks>
  /// Each match is lower-cased, has the text "href=" removed, is trimmed and is followed by '|'.
  /// NOTE(review): a match runs up to the next whitespace character, so an entry can include
  /// quotes and trailing markup, and lower-casing changes case-sensitive URLs - confirm callers
  /// expect this.
  /// </remarks>
  /// <param name="HtmlCode">HTML text to scan.</param>
  /// <returns>The matches joined by '|' with a trailing '|', or an empty string when there are none.</returns>
  public string GetHref(string HtmlCode)
  {
      const string hrefPattern = @"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)[\S]*";

      StringBuilder links = new StringBuilder();
      MatchCollection hits = Regex.Matches(HtmlCode, hrefPattern);
      for (int i = 0; i < hits.Count; i++)
      {
          string lowered = hits[i].Value.ToLower();
          string withoutAttributeName = lowered.Replace("href=", "");
          links.Append(withoutAttributeName.Trim());
          links.Append("|");
      }

      return links.ToString();
  }
  261. #endregion
  262. #region 匹配页面的图片地址
  /// <summary>
  /// Finds every &lt;img ...&gt; tag in a piece of HTML and collects the image address of each one.
  /// </summary>
  /// <param name="HtmlCode">HTML text to scan; it is lower-cased before matching.</param>
  /// <param name="imgHttp">Prefix that GetImg puts in front of addresses it does not recognise as absolute.</param>
  /// <returns>The GetImg result for each tag, each followed by '|'; empty when no tag is found.</returns>
  public string GetImgSrc(string HtmlCode, string imgHttp)
  {
      StringBuilder sources = new StringBuilder();
      MatchCollection tags = Regex.Matches(HtmlCode.ToLower(), @"<img.+?>");
      for (int i = 0; i < tags.Count; i++)
      {
          string tag = tags[i].Value.ToLower().Trim();
          sources.Append(GetImg(tag, imgHttp));
          sources.Append("|");
      }

      return sources.ToString();
  }
  /// <summary>
  /// Extracts the image address from the text of an &lt;img src="" /&gt; tag.
  /// </summary>
  /// <remarks>
  /// The tag is lower-cased, the src attribute is matched up to the last dot (plus a bmp, jpg,
  /// gif or png extension when one follows it), and "src=" is removed. If the result contains one
  /// of a fixed list of domain suffixes it is returned unchanged; otherwise
  /// <paramref name="imgHttp"/> is put in front of it.
  /// NOTE(review): an opening quote after "src=" is kept in the result, and the pattern also
  /// matches with no extension at all - confirm callers expect this.
  /// </remarks>
  /// <param name="ImgString">Text of the &lt;img&gt; tag.</param>
  /// <param name="imgHttp">Prefix for addresses that contain none of the known domain suffixes.</param>
  /// <returns>The extracted address, prefixed when it does not look absolute.</returns>
  public string GetImg(string ImgString, string imgHttp)
  {
      const string srcPattern = @"src=.+\.(bmp|jpg|gif|png|)";

      StringBuilder collected = new StringBuilder();
      foreach (Match hit in Regex.Matches(ImgString.ToLower(), srcPattern))
      {
          collected.Append(hit.Value.ToLower().Trim().Replace("src=", ""));
      }
      string address = collected.ToString();

      string[] domainSuffixes = { ".net", ".com", ".org", ".cn", ".cc", ".info", ".biz", ".tv" };
      foreach (string suffix in domainSuffixes)
      {
          if (address.IndexOf(suffix) != -1)
          {
              return address;
          }
      }

      return imgHttp + address;
  }
  298. #endregion
  299. #region 抓取远程页面内容
  /// <summary>
  /// Fetches a remote page with GET and returns its text.
  /// </summary>
  /// <remarks>
  /// The body is decoded with the platform default encoding and read line by line; every line is
  /// re-terminated with "\r\n". The request timeout is 19600 ms.
  /// </remarks>
  /// <param name="tUrl">Address to request.</param>
  /// <returns>The page text, or the exception message when the request fails.</returns>
  public string Get_Http(string tUrl)
  {
      try
      {
          HttpWebRequest request = (HttpWebRequest)WebRequest.Create(tUrl);
          request.Timeout = 19600;

          // using blocks release the connection even when reading fails half-way.
          using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
          using (Stream body = response.GetResponseStream())
          using (StreamReader reader = new StreamReader(body, Encoding.Default))
          {
              StringBuilder text = new StringBuilder();
              string line;
              // ReadLine returns null only at the real end of the stream. Peek can report -1
              // while more network data is still on its way, which truncated the page.
              while ((line = reader.ReadLine()) != null)
              {
                  text.Append(line).Append("\r\n");
              }
              return text.ToString();
          }
      }
      catch (Exception ex)
      {
          // Existing contract: failures are reported through the return value.
          return ex.Message;
      }
  }
  /// <summary>
  /// Sends form data to a remote page with POST and returns the response text.
  /// </summary>
  /// <remarks>
  /// The request is sent as "application/x-www-form-urlencoded".
  /// NOTE(review): the response is decoded with the platform default encoding, not with
  /// <paramref name="encodeType"/> - confirm this is intended.
  /// </remarks>
  /// <param name="url">Address to request.</param>
  /// <param name="postData">Request body.</param>
  /// <param name="encodeType">Name of the encoding used to convert <paramref name="postData"/> to bytes.</param>
  /// <returns>The response text, or the exception message when the request fails.</returns>
  public string Post_Http(string url, string postData, string encodeType)
  {
      try
      {
          byte[] payload = Encoding.GetEncoding(encodeType).GetBytes(postData);

          HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
          request.Method = "POST";
          request.ContentType = "application/x-www-form-urlencoded";
          request.ContentLength = payload.Length;

          using (Stream requestStream = request.GetRequestStream())
          {
              requestStream.Write(payload, 0, payload.Length);
          }

          // The response and reader were never closed before, which kept connections open.
          using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
          using (Stream responseStream = response.GetResponseStream())
          using (StreamReader reader = new StreamReader(responseStream, Encoding.Default))
          {
              return reader.ReadToEnd();
          }
      }
      catch (Exception ex)
      {
          // Existing contract: failures are reported through the return value.
          return ex.Message;
      }
  }
  358. #endregion
  359. #region 压缩HTML输出
  /// <summary>
  /// Compacts HTML by removing whitespace between tags and removing line breaks.
  /// </summary>
  /// <param name="Html">HTML text to compact.</param>
  /// <returns>The compacted HTML.</returns>
  public string ZipHtml(string Html)
  {
      // Whitespace between a closing '>' and the next '<'.
      string betweenTagsRemoved = Regex.Replace(Html, @">\s+?<", "><");

      // CRLF line breaks together with the whitespace that follows them.
      string lineBreaksRemoved = Regex.Replace(betweenTagsRemoved, @"\r\n\s*", "");

      // Re-emits the body element unchanged except that the tag name is written in lower case.
      const string bodyPattern = @"<body([\s|\S]*?)>([\s|\S]*?)</body>";
      const string bodyReplacement = @"<body$1>$2</body>";
      return Regex.Replace(lineBreaksRemoved, bodyPattern, bodyReplacement, RegexOptions.IgnoreCase);
  }
  370. #endregion
  371. #region 过滤指定HTML标签
  /// <summary>
  /// Removes the opening and closing tags of one HTML element from a string, keeping the content between them.
  /// </summary>
  /// <param name="s_TextStr">Text to filter; null or empty yields an empty string.</param>
  /// <param name="html_Str">
  /// Tag name to remove, for example a, img, p or div. It is inserted into a regular expression
  /// unescaped, so an alternation such as "a|img" also works. When null or empty, every tag is removed.
  /// </param>
  /// <returns>The text without the selected tags.</returns>
  public string DelHtml(string s_TextStr, string html_Str)
  {
      if (string.IsNullOrEmpty(s_TextStr))
      {
          return "";
      }

      if (string.IsNullOrEmpty(html_Str))
      {
          // No tag name given: keep the previous behavior of stripping every tag.
          string stripped = Regex.Replace(s_TextStr, "<[^>]*>", "", RegexOptions.IgnoreCase);
          return Regex.Replace(stripped, "</>", "", RegexOptions.IgnoreCase);
      }

      // Grouping keeps an alternation in html_Str confined to the tag-name position.
      string tagName = "(?:" + html_Str + ")";

      // The lookahead requires the name to end here. Without it "p" also matched "<pre ...>"
      // while the closing "</pre>" was left behind.
      string openingPattern = "<" + tagName + @"(?=[\s/>])[^>]*>";
      string closingPattern = "</" + tagName + @"\s*>";

      string withoutOpening = Regex.Replace(s_TextStr, openingPattern, "", RegexOptions.IgnoreCase);
      return Regex.Replace(withoutOpening, closingPattern, "", RegexOptions.IgnoreCase);
  }
  387. #endregion
  388. }
  389. }