澳门新浦京娱乐场网站-www.146.net-新浦京娱乐场官网
做最好的网站

并行下载天涯图片,多线程以及抓取图片

       最近因为一个作业需要完成CNKI爬虫,研究爬虫架构的时候发现了这个疑似移植于Python的著名开源爬虫框架Scrapy的ScrapySharp,然而在网上寻找之后只发现了这个F#的Demo,就使用原文中示例的网站写了这个C#版本的代码。

       PS:研究之后发现,ScrapySharp和Scrapy差距还是挺大的,没有Scrapy那样完善的八大组件,只含有获取网页内容和基于HtmlAgilityPack扩展的网页解析功能,莫名有些小失望。

using System;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using HtmlAgilityPack;
using ScrapySharp.Extensions;
using ScrapySharp.Network;


namespace ScrapySharpDemo
{
    /// <summary>
    /// Downloads every lazily-loaded image found in the body of one forum page.
    /// </summary>
    class Program
    {
        // Address of the page to scrape. The published sample left it blank,
        // so it must be filled in before the program can run.
        private const string PageUrl = "";

        // Folder (relative to the working directory) that receives the images.
        private const string ImageFolder = "imgs";

        /// <summary>
        /// Fetches the page, collects the image addresses and downloads them in parallel.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            var web = new ScrapingBrowser();
            var html = web.DownloadString(new Uri(PageUrl));
            var doc = new HtmlDocument();
            doc.LoadHtml(html);

            // The image address is read from the "original" attribute of each <img>.
            // Images without that attribute are skipped instead of failing later in SavePic.
            var urls = doc.DocumentNode
                .CssSelect("div.bbs-content > img")
                .Select(node => node.GetAttributeValue("original"))
                .Where(address => !string.IsNullOrEmpty(address))
                .ToList();

            // Create the target folder once, before the parallel workers start.
            Directory.CreateDirectory(ImageFolder);

            // Download the images in parallel.
            Parallel.ForEach(urls, SavePic);
        }

        /// <summary>
        /// Downloads one image and stores it under <c>imgs</c>, named after the last
        /// path segment of its address.
        /// </summary>
        /// <param name="url">Absolute address of the image.</param>
        public static void SavePic(string url)
        {
            var imageUri = new Uri(url);
            var web = new ScrapingBrowser();

            // The site refuses image requests coming from other origins, so the
            // Referer header is set to the address of the page being scraped.
            web.Headers.Add("Referer", PageUrl);
            var pic = web.NavigateToPage(imageUri).RawResponse.Body;

            // LocalPath drops any query string, so only the real file name is kept.
            var file = Path.GetFileName(imageUri.LocalPath);

            // CreateDirectory does nothing when the folder already exists, which keeps
            // this method usable on its own and safe when called concurrently.
            Directory.CreateDirectory(ImageFolder);
            File.WriteAllBytes(Path.Combine(ImageFolder, file), pic);
        }
    }
}

今日看博客园发现一个不错的抓取贴(主要是那个url。。。你懂的),花几分钟改了下,代码增加了按年月日建立目录,按文章建立子目录,图片都保存于内,命令行方式运行,增加了全站的参数。。。

正则表达式

 public partial class CameraFrm : Form
    {
        //private IOrderSyncServices orderBE = AccessOrderSync.GetOrderSyncServices();
        //private string sysCode = "";
        private string url = @"";
        public CameraFrm()
        {
            InitializeComponent();
            //SystemInfo si = new SystemInfo();
            //this.sysCode = si.getRNum();
            //DataSet ds = SQLiteHelper.Query("select sub_guid,openid from sc_license where code='" this.sysCode "'");
            //string guid = ds.Tables[0].Rows[0]["openid"].ToString();//总店的GUID FA3BF2D8-1FC7-3B6F-90F0-60AA22525305
            //string suid = ds.Tables[0].Rows[0]["sub_guid"].ToString();//分店GUID 19DBBC08-D4C9-CE21-879A-8927F8DC8C99
            System.Windows.Forms.Control.CheckForIllegalCrossThreadCalls = true;
            Control.CheckForIllegalCrossThreadCalls = false;
        }
        /// <summary>
        /// 摄像头ID
        /// </summary>
        public string cameraid { get; set; }//摄像头ID
        public string ACTIVE { get; set; }//是否启动摄像头start=启动
        public string guid { get; set; }//总店GUID
        public string suid { get; set; }//分店GUID
        private bool falg { get; set; }//是否启动
        private string sPath { get; set; }//摄像头抓取图片存放地址
        private string cameraFile { get; set; }//摄像头地址
        private void CameraFrm_Load(object sender, EventArgs e)
        {
            this.ImageFile();
            this.ResultCameraFile(this.cameraid);
            this.timer1.Enabled = true;
            if (this.ACTIVE == "start")
            {
                this.falg = true;
                this.CreateUpdata();
            }
            else
            {
                this.falg = false;
                Application.ExitThread();
                this.Close();
            }
        }
        private int count = 0;
        private string point = "";
        private string strs = "图片上传中";
        private void timer2_Tick(object sender, EventArgs e)
        {
            this.label1.Text = this.strs this.point;
            this.point = this.point ".";
            if (this.count >= 5)
            {
                this.count = 0;
                this.point = ".";
            }
            this.count ;
        }
        public void CreateUpdata()
        {
            try
            {
                Thread mainThread = new Thread(new ThreadStart(CreateThread));
                mainThread.Start();
            }
            catch (Exception ex)
            {
                Application.ExitThread();
                this.Close();
            }
        }
        private void CreateThread()
        {
            try
            {
                while (falg)
                {
                    Thread minorThread = new Thread(new ThreadStart(ImageCameralUpload));
                    minorThread.Start();
                    Thread.Sleep(700);
                }
            }
            catch (Exception ex)
            {
                Application.ExitThread();
                this.Close();
            }
        }
        /// <summary>
        ///向服务器上传摄像头所拍的照片
        /// </summary>
        public void ImageCameralUpload()
        {
            try
            {
                string sign = "";// ResultPostArguments(appid, cameraid, guid, suid);
                string OpathImage = this.GetPicUrl();
                string str = PostData(this.url, sign, this.guid, this.suid, this.cameraid, OpathImage);
                if (str != "")
                {
                    var jObject = JObject.Parse(str);
                    if (jObject["code"].ToString()=="0")
                    {
                       File.Delete(OpathImage);
                    }
                    else
                    {
                        this.falg = false;
                        timer1.Enabled = false;
                        Thread.Sleep(5000);
                        File.Delete(OpathImage);
                        Application.ExitThread();
                        this.Close();
                    }
                }
            }
            catch (Exception ex)
            {
                Application.ExitThread();
                this.Close();
            }
        }
        //获取签名
        private string ResultPostArguments(string appid, string cameraid, string guid, string suid)
        {
            int count = 0;
            StringBuilder builder = new StringBuilder();
            Dictionary<string, string> dic = new Dictionary<string, string>();
            dic.Add("cameraid", cameraid);
            dic.Add("guid", guid);
            dic.Add("suid", suid);
            //升序排列
            var dicSort = from objDic in dic orderby objDic.Key select objDic;
            foreach (KeyValuePair<string, string> kvp in dicSort)
            {
                count ;
                builder.Append(kvp.Key "=" kvp.Value);
                if (count != dic.Count)
                {
                    builder.Append("&");
                }
            }
            string appkey = "f6ac54df8aee6e3c0a4d1ac0b69db6f7";
            string sign = md5(builder.ToString() appid appkey).ToLower();
            return sign;
        }
        //MD5加密
        private string md5(String input)
        {
            MD5CryptoServiceProvider Md5 = new MD5CryptoServiceProvider();
            return BitConverter.ToString(Md5.ComputeHash(Encoding.UTF8.GetBytes(input))).Replace("-", "");
        }
        /// <summary>
        /// 向服务器上传图片
        /// </summary>
        /// <param name="url">服务器地址</param>
        /// <param name="sign">签名</param>
        /// <param name="appId">APPID</param>       
        /// <param name="jpegPath">图片地址</param>
        /// <returns></returns>
        public string PostData(string url, string sign, string guid, string suid, string cameraid, string imageFile)
        {
            try
            {
                FileStream file = new FileStream(imageFile, FileMode.Open);
                byte[] bb = new byte[file.Length];
                file.Read(bb, 0, (int)file.Length);
                file.Close();
                
                MultiPartFormData form = new MultiPartFormData();
                form.AddFormField("sign", sign);
                form.AddFormField("guid", guid);
                form.AddFormField("suid", suid);
                form.AddFormField("cameraid", cameraid);
                form.AddStreamFile("img", Path.GetFileName(imageFile), bb);
                form.PrepareFormData();
                form.GetFormData();
                string contentType = "multipart/form-data; boundary=" form.Boundary;
                HttpUploadHelper help = new HttpUploadHelper();
                HttpItem item = new HttpItem()
                {
                    URL = url,
                    Accept = "text/*",
                    ContentType = contentType,
                    Method = "POST",
                    PostDataType = PostDataType.Byte,
                    Timeout = 1000000,
                    ReadWriteTimeout = 3000000,
                    PostdataByte = form.GetFormData().ToArray(),
                    Encoding = Encoding.UTF8,
                };
                item.Header.Add("Pragma", "no-cache");
                item.Header.Add("DNT", "1");
                HttpResult result = help.GetHtml(item);
                string html = result.Html;
                return html;
            }
            catch (Exception ex)
            {
                throw;
            }
        }
        /// <summary>
        /// 抓取网络图片保存到本地
        /// </summary>
        /// <returns>本地路径</returns>
        private string GetPicUrl()
        {
            //string sPath = @"DataImg";
            //if (!Directory.Exists(sPath))
            //{
            //    Directory.CreateDirectory(sPath);
            //}
            Bitmap img = null;
            HttpWebRequest req;
            HttpWebResponse res = null;
            try
            {
                System.Uri httpUrl = new System.Uri(this.cameraFile);
                req = (HttpWebRequest)(WebRequest.Create(httpUrl));
                req.Timeout = 180000; //设置超时值10秒
                //req.UserAgent = "XXXXX";
                //req.Accept = "XXXXXX";
                req.Method = "GET";
                res = (HttpWebResponse)(req.GetResponse());
                img = new Bitmap(res.GetResponseStream());//获取图片流                
                img.Save(this.sPath DateTime.Now.ToFileTime().ToString() ".jpg");//随机名
            }
            catch (Exception ex)
            {
                string aa = ex.Message;
            }
            finally
            {
                res.Close();
            }
            return sPath;
        }
        private void button1_Click(object sender, EventArgs e)
        {
            this.falg = true;
           // ImageMessage();
            CreateUpdata();
        }
    
        private void ImageFile()
        {
            this.sPath = @"DataImg";
            if (!Directory.Exists(this.sPath))
            {
                Directory.CreateDirectory(this.sPath);
            }
        }
澳门新浦京娱乐场网站,        private void ResultCameraFile(string cameraid)
        {
            string sql = "select cameralPictureUrl from camera_manager where Id='" cameraid "'AND cameralState='0'";
            this.cameraFile = "";
        }
        private void button2_Click(object sender, EventArgs e)
        {
            this.Close();
        }
    }

原始版本:

     命名空间:using System.Text.RegularExpressions;

利用HtmlAgilityPack抓取XX网站图片并下载~~邪恶版。。。。

     常用的类:  

 

           Regex   

 

           MatchCollection   

新版本代码:

           Match   

 

           Group  

#region Using namespace

           GroupCollection  

using System;
using System.IO;
using System.Linq;
using System.Net;
using HtmlAgilityPack;

    常用的方法:   

#endregion

           Regex.IsMatch(); 返回值bool   

namespace DownloadImages
{
    internal class Program
    {
        private static readonly WebClient Wc = new WebClient();
        private static readonly char[] InvalidFileNameChars = new[]
                                                                  {
                                                                      '"',
                                                                      '<',
                                                                      '>',
                                                                      '|',
                                                                      '',
                                                                      'u0001',
                                                                      'u0002',
                                                                      'u0003',
                                                                      'u0004',
                                                                      'u0005',
                                                                      'u0006',
                                                                      'a',
                                                                      'b',
                                                                      't',
                                                                      'n',
                                                                      'v',
                                                                      'f',
                                                                      'r',
                                                                      'u000e',
                                                                      'u000f',
                                                                      'u0010',
                                                                      'u0011',
                                                                      'u0012',
                                                                      'u0013',
                                                                      'u0014',
                                                                      'u0015',
                                                                      'u0016',
                                                                      'u0017',
                                                                      'u0018',
                                                                      'u0019',
                                                                      'u001a',
                                                                      'u001b',
                                                                      'u001c',
                                                                      'u001d',
                                                                      'u001e',
                                                                      'u001f',
                                                                      ':',
                                                                      '*',
                                                                      '?',
                                                                      '\',
                                                                      '/'
                                                                  };
        public static string CleanInvalidFileName(string fileName)
        {
            fileName = fileName   "";
            fileName = InvalidFileNameChars.Aggregate(fileName, (current, c) => current.Replace(c   "", ""));

           Regex.Match(); 返回值Match   

            if (fileName.Length > 1)
                if (fileName[0] == '.')
                    fileName = "dot"   fileName.TrimStart('.');

           Regex.Matches(); 返回值MatchCollection   

            return fileName;
        }
        private static void Main(string[] args)
        {
            Start();
        }

           Regex.Replace(); 返回值string

        private static void Start()
        {
            var web = new HtmlWeb();
            var startDate = int.Parse(DateTime.Parse("2010-08-18").ToString("yyyyMMdd"));
            var endDate = int.Parse(DateTime.Now.ToString("yyyyMMdd"));
            const int startPageId = 49430;
            const int endPageId = 124621;
            for (int k = startDate; k <= endDate; k )
            {
                for (int j = startPageId; j <= endPageId; j )
                {
                    string cnblogs =    k   "/"   j   ".html";  //此处省略……源码内详
                    HtmlDocument doc = web.Load(cnblogs);
                    var titles = doc.DocumentNode.SelectNodes("//title");
                    var titleName = j.ToString();
                    if( titles!=null && titles.Count>0)
                        titleName = titles[0].InnerText;
                    HtmlNode node = doc.GetElementbyId("ks_xp");
                    if (node == null)
                    {
                        continue;
                    }
                    foreach (HtmlNode child in node.SelectNodes("//img"))
                    {
                        if (child.Attributes["src"] == null)
                            continue;

正则表达式抓取图片:   

                        string imgurl = child.Attributes["src"].Value;
                        DownLoadImg(imgurl, k   "", CleanInvalidFileName(titleName));
                        Console.WriteLine("正在下载:"   titleName   " "   imgurl);
                    }
                }
            }
            //善后
            CleanEmptyFolders();
        }

           引用命名空间:using System.Net;       

        private static void CleanEmptyFolders()
        {
            var rootFolders = Environment.CurrentDirectory   "\Images\";
            var folders = Directory.GetDirectories(rootFolders, "*.*", SearchOption.AllDirectories);
            foreach( var f in folders)
            {
                if (Directory.GetFiles(f, "*.*", SearchOption.AllDirectories).Length == 0)
                    Directory.Delete(f);
            }
        }

           using System.IO;   

        private static void DownLoadImg(string url, string folderName, string subFolderName)
        {
            var fileName = CleanInvalidFileName(url.Substring(url.LastIndexOf("/")   1));
            var fileFolder = Environment.CurrentDirectory   "\Images\"   folderName   "\"   subFolderName   "\" ;
            if (!Directory.Exists(fileFolder))
                Directory.CreateDirectory(fileFolder);
            fileName = fileFolder   fileName;
            try
            {
                Wc.DownloadFile(url, fileName);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
    }
}

           做题思路:1)首先从网上获取网页的全部内容;2)使用正则表达式进行匹配,得到想要的图片的具体地址;3)下载。

 

          // [interleaved text from a different article] Test program and source download:
          // /Files/Chinasf/DownloadImages.rar

          /// <summary>
          /// Downloads a page, extracts the src of every &lt;img&gt; tag with a regular
          /// expression, prefixes each with the site address and saves the images into
          /// the target folder under their original names.
          /// </summary>
          /// <param name="args">Command-line arguments (not used).</param>
          static void Main(string[] args)
          {
              using (WebClient wc = new WebClient())
              {
                  string html = wc.DownloadString(@"网---页---地---址");

                  // Group 1 captures the value of the src attribute. A page usually has
                  // many images, so the values are collected in a list.
                  MatchCollection mc = Regex.Matches(html, @"<\s?img[^>]+src=""([^""]+)""");
                  List<string> pic = new List<string>();
                  foreach (Match m in mc)
                  {
                      if (m.Success)
                      {
                          pic.Add(m.Groups[1].Value.Trim());
                      }
                  }

                  // Prefix every image name with the site address so that each entry
                  // becomes a complete image address.
                  string url = @"网页地址";
                  for (int i = 0; i < pic.Count; i++)
                  {
                      pic[i] = url + "/" + pic[i];
                  }

                  // Make sure the target folder exists. The original sample created a
                  // different folder than the one it tested for, and skipped the
                  // download entirely whenever the folder had been missing.
                  string address = "想要下载到的目标位置";
                  Directory.CreateDirectory(address);

                  for (int i = 0; i < pic.Count; i++)
                  {
                      // Everything after the last '/' is the file name, so the local
                      // file is named like the one on the site.
                      string name = Regex.Match(pic[i], @".+/(.+)").Groups[1].Value;
                      wc.DownloadFile(pic[i], Path.Combine(address, name));
                  }
              }

              Console.ReadKey();
          }


本文由澳门新浦京娱乐场网站发布于www.146.net,转载请注明出处:并行下载天涯图片,多线程以及抓取图片