using Admin.Core.Common.Attributes;
using System;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;

namespace Admin.Core.Common.Helpers
{
    /// <summary>
    /// Helpers for downloading pages over HTTP and for stripping or extracting HTML markup.
    /// </summary>
    [SingleInstance]
    public class HtmlHelper
    {
        #region Private fields

        private readonly string _contentType = "application/json";
        private readonly string _accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";
        private readonly string _userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";

        // One generator for all delay computations; System.Random is not thread-safe,
        // so every access goes through s_randomGate.
        private static readonly Random s_random = new Random();
        private static readonly object s_randomGate = new object();

        // Entity pattern -> replacement text, applied in this order by NoHTML.
        private static readonly string[][] s_entityReplacements =
        {
            new[] { @"&(quot|#34);", "\"" },
            new[] { @"&(amp|#38);", "&" },
            new[] { @"&(lt|#60);", "<" },
            new[] { @"&(gt|#62);", ">" },
            new[] { @"&(nbsp|#160);", "   " },
            new[] { @"&(iexcl|#161);", "\u00A1" },
            new[] { @"&(cent|#162);", "\u00A2" },
            new[] { @"&(pound|#163);", "\u00A3" },
            new[] { @"&(copy|#169);", "\u00A9" },
            new[] { @"&#(\d+);", "" },
        };

        // Lower bound, in milliseconds, of the random pause returned by NetworkDelay.
        private int _delay = 1000;

        #endregion Private fields

        #region Public properties

        /// <summary>
        /// Cookie container owned by this helper instance.
        /// </summary>
        public CookieContainer CookieContainer { get; } = new CookieContainer();

        /// <summary>
        /// Encoding used to decode the response body in <c>GetHtml</c>. Defaults to UTF-8.
        /// </summary>
        public Encoding Encoding { get; set; } = Encoding.GetEncoding("utf-8");

        /// <summary>
        /// Reading yields a random pause in milliseconds in the range [delay, 2 * delay);
        /// writing sets the base delay. Negative values are clamped to zero.
        /// </summary>
        public int NetworkDelay
        {
            get
            {
                int low = _delay;
                // Widen before doubling so a large base delay cannot overflow into a negative bound.
                int high = (int)Math.Min(int.MaxValue, (long)low * 2);
                lock (s_randomGate)
                {
                    return s_random.Next(low, high);
                }
            }
            set
            {
                _delay = Math.Max(0, value);
            }
        }

        /// <summary>
        /// Maximum number of retries after the first failed attempt. The same value is also
        /// applied as the request's <c>ServicePoint.ConnectionLimit</c>.
        /// </summary>
        public int MaxTry { get; set; } = 300;

        #endregion Public properties

        #region Fetch HTML

        /// <summary>
        /// Sends <paramref name="postData"/> to <paramref name="url"/> and returns the response body as text.
        /// </summary>
        /// <param name="url">Target address.</param>
        /// <param name="postData">Request body. When null or empty the call is forwarded to the body-less GET overload.</param>
        /// <param name="isPost">True to use POST, false to use GET.</param>
        /// <param name="cookieContainer">Cookie container attached to the request.</param>
        /// <returns>
        /// The response text of the first successful attempt, or an empty string when every
        /// attempt fails or the request can never succeed. Exceptions are not propagated.
        /// </returns>
        public string GetHtml(string url, string postData, bool isPost, CookieContainer cookieContainer)
        {
            if (string.IsNullOrEmpty(postData))
            {
                return GetHtml(url, cookieContainer);
            }

            // NOTE(review): the body is encoded with Encoding.Default, not the Encoding property; kept as-is.
            byte[] body = Encoding.Default.GetBytes(postData);

            // The attempt counter is a local, so overlapping calls cannot disturb each other's count.
            for (int attempt = 0; attempt <= MaxTry; attempt++)
            {
                Thread.Sleep(NetworkDelay);

                HttpWebRequest request = null;
                try
                {
                    request = CreateRequest(url, cookieContainer, isPost ? "POST" : "GET");
                    request.ContentLength = body.Length;

                    // With isPost == false this throws ProtocolViolationException (GET cannot carry
                    // a body); IsPermanentFailure turns that into an immediate empty result.
                    using (Stream requestStream = request.GetRequestStream())
                    {
                        requestStream.Write(body, 0, body.Length);
                    }

                    return ReadResponseText(request);
                }
                catch (Exception ex)
                {
                    if (request != null)
                    {
                        request.Abort();
                    }

                    if (IsPermanentFailure(ex))
                    {
                        return string.Empty;
                    }
                }
            }

            return string.Empty;
        }

        /// <summary>
        /// Downloads <paramref name="url"/> with GET and returns the response body as text.
        /// </summary>
        /// <param name="url">Target address.</param>
        /// <param name="cookieContainer">Cookie container attached to the request.</param>
        /// <returns>
        /// The response text of the first successful attempt, or an empty string when every
        /// attempt fails or the request can never succeed. Exceptions are not propagated.
        /// </returns>
        public string GetHtml(string url, CookieContainer cookieContainer)
        {
            for (int attempt = 0; attempt <= MaxTry; attempt++)
            {
                Thread.Sleep(NetworkDelay);

                HttpWebRequest request = null;
                try
                {
                    request = CreateRequest(url, cookieContainer, "GET");
                    return ReadResponseText(request);
                }
                catch (Exception ex)
                {
                    if (request != null)
                    {
                        request.Abort();
                    }

                    if (IsPermanentFailure(ex))
                    {
                        return string.Empty;
                    }
                }
            }

            return string.Empty;
        }

        /// <summary>
        /// Builds a request carrying the headers shared by every fetch method of this class.
        /// </summary>
        private HttpWebRequest CreateRequest(string url, CookieContainer cookieContainer, string method)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.CookieContainer = cookieContainer;
            request.ContentType = _contentType;
            request.ServicePoint.ConnectionLimit = MaxTry;
            request.Referer = url;
            request.Accept = _accept;
            request.UserAgent = _userAgent;
            request.Method = method;
            return request;
        }

        /// <summary>
        /// Executes the request and decodes the whole response body with <see cref="Encoding"/>.
        /// The response, its stream and the reader are disposed on every path.
        /// </summary>
        private string ReadResponseText(HttpWebRequest request)
        {
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Tells whether a failure is deterministic (bad argument, malformed or unsupported URL,
        /// protocol misuse), in which case repeating the same request cannot succeed.
        /// </summary>
        private static bool IsPermanentFailure(Exception ex)
        {
            return ex is ArgumentException
                || ex is FormatException
                || ex is NotSupportedException
                || ex is InvalidCastException
                || ex is ProtocolViolationException;
        }

        #endregion Fetch HTML

        #region Fetch stream

        //---------------------------------------------------------------------------------------------------------------
        // Example:
        // System.Net.CookieContainer cookie = new System.Net.CookieContainer();
        // Stream s = htmlHelper.GetStream("http://ptlogin2.qq.com/getimage?aid=15000102&0.43878429697395826", cookie);
        // picVerify.Image = Image.FromStream(s);
        //---------------------------------------------------------------------------------------------------------------

        /// <summary>
        /// Requests <paramref name="url"/> with GET and returns the raw response stream.
        /// </summary>
        /// <param name="url">Target address.</param>
        /// <param name="cookieContainer">Cookie container attached to the request.</param>
        /// <returns>
        /// The open response stream, which the caller must dispose, or null when every attempt
        /// fails or the request can never succeed. Exceptions are not propagated.
        /// </returns>
        public Stream GetStream(string url, CookieContainer cookieContainer)
        {
            for (int attempt = 0; attempt <= MaxTry; attempt++)
            {
                // The first attempt is immediate; only retries are throttled.
                if (attempt > 0)
                {
                    Thread.Sleep(NetworkDelay);
                }

                HttpWebRequest request = null;
                HttpWebResponse response = null;
                try
                {
                    request = CreateRequest(url, cookieContainer, "GET");
                    response = (HttpWebResponse)request.GetResponse();
                    return response.GetResponseStream();
                }
                catch (Exception ex)
                {
                    if (response != null)
                    {
                        response.Close();
                    }

                    if (request != null)
                    {
                        request.Abort();
                    }

                    if (IsPermanentFailure(ex))
                    {
                        return null;
                    }
                }
            }

            return null;
        }

        #endregion Fetch stream

        #region Strip HTML markup

        /// <summary>
        /// Removes script blocks, tags and comment markers from the input and decodes a small
        /// set of character entities.
        /// </summary>
        /// <param name="Htmlstring">HTML text to clean.</param>
        /// <returns>
        /// The cleaned text without any '&lt;' or '&gt;' characters; an empty string for null or
        /// empty input.
        /// </returns>
        public string NoHTML(string Htmlstring)
        {
            if (string.IsNullOrEmpty(Htmlstring))
            {
                return string.Empty;
            }

            // Scripts first. Singleline lets ".*?" cross line breaks so multi-line script bodies go too.
            string text = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

            // Tags: single-line tags, then tags whose attributes span several lines.
            text = Regex.Replace(text, "<.+?>", "", RegexOptions.IgnoreCase);
            text = Regex.Replace(text, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

            // Line breaks followed by whitespace, then leftover comment delimiters.
            text = Regex.Replace(text, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
            text = Regex.Replace(text, @"-->", "", RegexOptions.IgnoreCase);
            text = Regex.Replace(text, @"<!--.*", "", RegexOptions.IgnoreCase);

            foreach (string[] entity in s_entityReplacements)
            {
                text = Regex.Replace(text, entity[0], entity[1], RegexOptions.IgnoreCase);
            }

            // string.Replace returns a new string, so the result has to be kept. Without this,
            // brackets decoded from &lt; / &gt; just above would be handed back to the caller.
            text = text.Replace("<", "");
            text = text.Replace(">", "");
            text = text.Replace("\r\n", "");
            return text;
        }

        #endregion Strip HTML markup

        #region Remove HTML tags from text

        /// <summary>
        /// Removes every "&lt;...&gt;" span from the input, then strips line breaks, spaces,
        /// non-breaking spaces and literal "&amp;nbsp;" entities.
        /// </summary>
        /// <param name="InString">Text that may contain HTML tags.</param>
        /// <returns>The stripped text; an empty string for null or empty input.</returns>
        public string DelHtmlCode(string InString)
        {
            if (string.IsNullOrEmpty(InString))
            {
                return string.Empty;
            }

            StringBuilder kept = new StringBuilder(InString.Length);
            int position = 0;
            while (true)
            {
                int open = InString.IndexOf('<', position);
                if (open < 0)
                {
                    break;
                }

                // Look for the closing bracket after the opening one; a stray '>' earlier in the
                // text must not be paired with this '<'.
                int close = InString.IndexOf('>', open + 1);
                if (close < 0)
                {
                    break; // unterminated tag: the remainder is kept unchanged
                }

                kept.Append(InString, position, open - position);
                position = close + 1;
            }

            kept.Append(InString, position, InString.Length - position);

            // NOTE(review): the "&nbsp;" and U+00A0 removals are an assumption about the intent of
            // the duplicated space removal this replaces — confirm.
            kept.Replace("&nbsp;", "");
            kept.Replace("\n", "");
            kept.Replace("\r", "");
            kept.Replace("\u00A0", "");
            kept.Replace(" ", "");
            return kept.ToString().Trim();
        }

        #endregion Remove HTML tags from text

        #region Match page links

        /// <summary>
        /// Collects the href attribute values found in the markup.
        /// </summary>
        /// <param name="HtmlCode">Markup to scan.</param>
        /// <returns>
        /// Each match lower-cased, with the "href=" prefix removed and followed by '|'. Quotes and
        /// any non-whitespace characters that follow the value are part of the match. An empty
        /// string for null or empty input.
        /// </returns>
        public string GetHref(string HtmlCode)
        {
            if (string.IsNullOrEmpty(HtmlCode))
            {
                return string.Empty;
            }

            const string hrefPattern = @"(h|H)(r|R)(e|E)(f|F) *= *('|"")?((\w|\\|\/|\.|:|-|_)+)[\S]*";

            StringBuilder links = new StringBuilder();
            foreach (Match match in Regex.Matches(HtmlCode, hrefPattern))
            {
                // NOTE(review): "href =" written with spaces keeps its prefix, as before.
                links.Append(match.Value.ToLowerInvariant().Replace("href=", "").Trim()).Append('|');
            }

            return links.ToString();
        }

        #endregion Match page links

        #region Match image addresses

        /// <summary>
        /// Collects the image addresses of every &lt;img&gt; tag in the markup.
        /// </summary>
        /// <param name="HtmlCode">Markup to scan; it is lower-cased before matching.</param>
        /// <param name="imgHttp">Prefix prepended to addresses that do not look absolute.</param>
        /// <returns>
        /// The result of <see cref="GetImg"/> for each tag, each followed by '|'. An empty string
        /// for null or empty input.
        /// </returns>
        public string GetImgSrc(string HtmlCode, string imgHttp)
        {
            if (string.IsNullOrEmpty(HtmlCode))
            {
                return string.Empty;
            }

            StringBuilder sources = new StringBuilder();
            foreach (Match match in Regex.Matches(HtmlCode.ToLowerInvariant(), @"<img.+?>"))
            {
                sources.Append(GetImg(match.Value.Trim(), imgHttp)).Append('|');
            }

            return sources.ToString();
        }

        /// <summary>
        /// Extracts the src value from a single &lt;img&gt; tag.
        /// </summary>
        /// <param name="ImgString">The &lt;img ... /&gt; tag text; it is lower-cased before matching.</param>
        /// <param name="imgHttp">Prefix prepended when the value contains none of the known domain suffixes.</param>
        /// <returns>
        /// The matched value as-is when it contains one of the suffixes .net, .com, .org, .cn,
        /// .cc, .info, .biz or .tv; otherwise the value prefixed with <paramref name="imgHttp"/>.
        /// </returns>
        public string GetImg(string ImgString, string imgHttp)
        {
            if (string.IsNullOrEmpty(ImgString))
            {
                return imgHttp + string.Empty;
            }

            // NOTE(review): the trailing empty alternative lets the match end at any '.', and an
            // opening quote after "src=" stays in the result. Pattern kept as-is.
            const string srcPattern = @"src=.+\.(bmp|jpg|gif|png|)";

            StringBuilder collected = new StringBuilder();
            foreach (Match match in Regex.Matches(ImgString.ToLowerInvariant(), srcPattern))
            {
                collected.Append(match.Value.Trim().Replace("src=", ""));
            }

            string source = collected.ToString();
            string[] domainSuffixes = { ".net", ".com", ".org", ".cn", ".cc", ".info", ".biz", ".tv" };
            foreach (string suffix in domainSuffixes)
            {
                if (source.IndexOf(suffix, StringComparison.Ordinal) != -1)
                {
                    return source;
                }
            }

            return imgHttp + source;
        }

        #endregion Match image addresses

        #region Fetch remote page content

        /// <summary>
        /// Downloads a page with GET and returns its text with every line terminated by "\r\n".
        /// </summary>
        /// <param name="tUrl">Target address.</param>
        /// <returns>The page text, or the exception message when the request fails.</returns>
        public string Get_Http(string tUrl)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(tUrl);
                request.Timeout = 19600;

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, Encoding.Default))
                {
                    StringBuilder page = new StringBuilder();
                    string line;

                    // ReadLine returns null only at the real end of the stream. Peek() can report
                    // -1 while more data is still in flight, which cut pages short.
                    while ((line = reader.ReadLine()) != null)
                    {
                        page.Append(line).Append("\r\n");
                    }

                    return page.ToString();
                }
            }
            catch (Exception ex)
            {
                return ex.Message;
            }
        }

        /// <summary>
        /// Posts form-encoded data and returns the response text.
        /// </summary>
        /// <param name="url">Target address.</param>
        /// <param name="postData">Request body, sent as application/x-www-form-urlencoded.</param>
        /// <param name="encodeType">Name of the encoding used to encode <paramref name="postData"/>.</param>
        /// <returns>The response text, or the exception message when the request fails.</returns>
        public string Post_Http(string url, string postData, string encodeType)
        {
            try
            {
                Encoding requestEncoding = Encoding.GetEncoding(encodeType);
                byte[] payload = requestEncoding.GetBytes(postData);

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "POST";
                request.ContentType = "application/x-www-form-urlencoded";
                request.ContentLength = payload.Length;

                using (Stream requestStream = request.GetRequestStream())
                {
                    requestStream.Write(payload, 0, payload.Length);
                }

                // Dispose the response and reader so the connection goes back to the pool.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                return ex.Message;
            }
        }

        #endregion Fetch remote page content

        #region Compress HTML output

        /// <summary>
        /// Removes whitespace between adjacent tags and after line breaks.
        /// </summary>
        /// <param name="Html">Markup to compress.</param>
        /// <returns>The compressed markup; an empty string for null or empty input.</returns>
        public string ZipHtml(string Html)
        {
            if (string.IsNullOrEmpty(Html))
            {
                return string.Empty;
            }

            string compact = Regex.Replace(Html, @">\s+?<", "><");
            compact = Regex.Replace(compact, @"\r\n\s*", "");

            // The content is re-emitted unchanged; the only visible effect is that the body tag
            // name comes out in lower case.
            compact = Regex.Replace(compact, @"<body([\s|\S]*?)>([\s|\S]*?)</body>", @"<body$1>$2</body>", RegexOptions.IgnoreCase);
            return compact;
        }

        #endregion Compress HTML output

        #region Filter a given HTML tag

        /// <summary>
        /// Removes the opening and closing tags named <paramref name="html_Str"/>; the text
        /// between them is kept.
        /// </summary>
        /// <param name="s_TextStr">Text to filter.</param>
        /// <param name="html_Str">
        /// Tag name such as a, img, p or div. The value is inserted into a regular expression
        /// without escaping.
        /// </param>
        /// <returns>The filtered text; an empty string when <paramref name="s_TextStr"/> is null or empty.</returns>
        public string DelHtml(string s_TextStr, string html_Str)
        {
            if (string.IsNullOrEmpty(s_TextStr))
            {
                return "";
            }

            // The lookahead ends the tag name so that "a" no longer removes <abbr> or <article>.
            // An empty tag name keeps the earlier behaviour of matching every opening tag.
            string nameEnd = string.IsNullOrEmpty(html_Str) ? "" : @"(?![\w:-])";

            string result = Regex.Replace(s_TextStr, "<" + html_Str + nameEnd + "[^>]*>", "", RegexOptions.IgnoreCase);
            return Regex.Replace(result, "</" + html_Str + @"\s*>", "", RegexOptions.IgnoreCase);
        }

        #endregion Filter a given HTML tag
    }
}