YNOTES笔记

### 本文讲解通过python selenium firefox mysql的方式爬取搜狗微信公众号数据 `说明:搜狗微信的反爬虫，scrapy框架爬取易被检测,使用selenium的方式（缺点：慢。优点:不易被检测到）。` 安装相关软件教程参考:https://ynotes.cn/blog/article_detail/158 #### 流程: 1.脚本循环查询关键字表(table keys)中关键字类型字段(column type)所对应的关键字字段(column keyword)前100条数据 2.通过获取关键字循环去搜狗微信去搜索 3.爬取搜狗搜索出来的微信公众号 4.判断页面是否有分页，有则循环爬取。爬取完一个页面，更新爬取页面数字段(column page_num),所有页面更新关键字表的状态字段(column status[0:表示未爬取,1:表示已爬取]) 5.对爬取出来的数据插入到微信公众号数据表(weixin_data)(建相关数据表) 6.更新关键字表的状态为已爬取状态 #### 数据表结构 ```sql CREATE TABLE `keys` ( `id` int(10) unsigned NOT NULL AUTO_INCREMENT, `keyword` varchar(255) DEFAULT NULL, `page_num` int(11) DEFAULT '0', `status` int(11) DEFAULT '0' COMMENT '0 未搜索 1 已搜索 99 丢弃', `type` varchar(255) DEFAULT NULL, `is_drop` int(11) NOT NULL DEFAULT '0', PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=119750 DEFAULT CHARSET=utf8; CREATE TABLE `weixin_data` ( `id` int(10) unsigned NOT NULL AUTO_INCREMENT, `key_id` int(255) DEFAULT NULL, `weixin_name` varchar(255) DEFAULT NULL, `weixin_account` varchar(255) DEFAULT NULL, `weixin_auth_info` varchar(255) DEFAULT NULL, `is_auth` int(11) DEFAULT NULL, `describe` varchar(6000) DEFAULT NULL, `img_url` varchar(255) DEFAULT NULL, `loc_info` varchar(255) DEFAULT NULL, `privince` varchar(255) DEFAULT NULL, `city` varchar(255) DEFAULT NULL, `district` varchar(255) DEFAULT NULL, `weixin_type` varchar(255) DEFAULT NULL, `other` varchar(255) DEFAULT NULL, PRIMARY KEY (`id`), UNIQUE KEY `weixin_account` (`weixin_account`) ) ENGINE=InnoDB AUTO_INCREMENT=139746 DEFAULT CHARSET=utf8; ``` #### 爬虫脚本 scrapy_sogou.py ```python #coding=utf-8 from selenium import webdriver import time from selenium.common.exceptions import NoSuchElementException,TimeoutException from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC import pymysql import random 指定页面 #参数指定了缓存文件的路径，方便爬取需要登录的网站 profile = webdriver.FirefoxProfile(r'C:\Users\Administrator.GZLX-20180416SV\AppData\Roaming\Mozilla\Firefox\Profiles\yn80ouvt.default') #如果不需要cookie，则不需要指定,使用下面的配置 #profile = webdriver.FirefoxProfile() #禁止加载样式表 profile.set_preference("permissions.default.stylesheet",2) #禁止加载图片 profile.set_preference("permissions.default.image",2) #禁止加载JAVASCRIPT profile.set_preference("javascript.enabled",False) #设置代理 profile.set_preference('network.proxy.type', 1) profile.set_preference('network.proxy.http', 'xx.xx.xx.xx') profile.set_preference('network.proxy.http_port', xxxx) profile.set_preference('network.proxy.ssl', 'xx.xx.xx.xx') profile.set_preference('network.proxy.ssl_port', xxxx) profile.update_preferences() #数据库配置 db_host="xx.xx.xx.xx" db_user="root" db_pass="xxxx" db_port=3306 db_name="weixin_data" #指定Firefox的驱动 driver = webdriver.Firefox(firefox_profile=profile,executable_path="geckodriver") #搜索的关键字 key_search_list=['学校'] index_url='https://weixin.sogou.com/weixin?query=' keys_search_string="" for index in range(0,len(key_search_list)): if index==len(key_search_list)-1: keys_search_string+="'"+key_search_list[index]+"'" else: keys_search_string+="'"+key_search_list[index]+"'," class AnyEc: """ Use with WebDriverWait to combine expected_conditions in an OR. """ def __init__(self, *args): self.ecs = args def __call__(self, driver): for fn in self.ecs: try: if fn(driver): return True except: pass def execute_query_sql(sql): #循环读取数据库状态是0的关键字100个 db= pymysql.connect(host=db_host,port=db_port,user=db_user, passwd=db_pass, db=db_name) # 使用 cursor() 方法创建一个游标对象 cursor cursor = db.cursor() #执行sql cursor.execute(sql) results=cursor.fetchall() # 关闭数据库连接 db.close() return results def execute_update_sql(sql): #循环读取数据库状态是0的关键字100个 db= pymysql.connect(host=db_host,port=db_port,user=db_user, passwd=db_pass, db=db_name) # 使用 cursor() 方法创建一个游标对象 cursor cursor = db.cursor() # 执行sql语句 cursor.execute(sql) # 提交到数据库执行 db.commit() #执行sql # 关闭数据库连接 db.close() #爬取网站内容的函数 def parseWeb(driver,key_name,page_N): print("开始提取关键字"+key_name+",第"+str(page_N)+"页的数据:") for page in driver.find_elements_by_xpath('//ul[@class="news-list2"]/li'): weixin_name=page.find_element_by_xpath('./div[@class="gzh-box2"]/div[@class="txt-box"]/p[@class="tit"]/a').text img_url=page.find_element_by_xpath('./div[@class="gzh-box2"]/div[@class="img-box"]/a/img').get_attribute("src") weixin_account=page.find_element_by_xpath('./div[@class="gzh-box2"]/div[@class="txt-box"]/p[@class="info"]/label').text weixin_auth_info="" try: page.find_element_by_xpath('./dl[2]/dt[contains(text(),微信认证)]') dl_info=page.find_element_by_xpath('./dl[2]/dt').text if '微信认证' in dl_info: weixin_auth_info=page.find_element_by_xpath('.//dl[2]/dd').text except NoSuchElementException: weixin_auth_info="" print("微信认证:"+weixin_auth_info) try: page.find_element_by_xpath('./div[@class="gzh-box2"]/div[@class="txt-box"]/p[@class="tit"]/i') is_auth=1 except NoSuchElementException: is_auth=0 try: describe=page.find_element_by_xpath('.//dl[1]/dd').text except NoSuchElementException: describe="" #把数据插入酷内 insert_sql='insert into weixin_data(key_id,weixin_name,weixin_account,weixin_auth_info,is_auth,img_url,`describe`,weixin_type,other) values('+str(key_id)+',"'+weixin_name+'","'+weixin_account+'","'+weixin_auth_info+'",'+str(is_auth)+',"'+img_url+'","'+describe+'","'+'培训机构'+'",'+'NULL'+'); ' #print(insert_sql) try: print("准备插入数据:"+weixin_name) execute_update_sql(insert_sql) except: print("插入数据异常,可能是重复数据") #更新当前页数 update_sql='update `keys` set page_num='+str(page_N)+' where keyword="'+key_name+'";' print(update_sql) try: execute_update_sql(update_sql) except: print("更新爬取页数错误") return False return True #判断页面是否加载完成 def pageIsLoadFinished(driver): try: WebDriverWait(driver, 10).until( AnyEc( EC.presence_of_element_located( (By.XPATH, u'//div[@class="gzh-box2"]/div[@class="img-box"]/a/img')), EC.presence_of_element_located( (By.XPATH, u'//p[@class="ip-time-p"]')), EC.presence_of_element_located( (By.XPATH, u'//div[@id="noresult_part1_container"]')) )) return True except TimeoutException: return False #页面是否正常 def pageIsNomal(driver): try: driver.find_element_by_xpath('//p[@class="ip-time-p"]') print("IP访问频繁，准备重启浏览器") time.sleep(3) return False except NoSuchElementException: return True #页面是否404 def pageIsNotFound(driver,key_name): try: driver.find_element_by_xpath('//div[@id="noresult_part1_container"]') print("关键字"+key_name+"没有找到，搜索下一个关键字") return True except NoSuchElementException: return False #跳到指定页 def jumpNumPage(driver,page_N): #判断是否是当前页 try: current_page=driver.find_element_by_xpath('//div[@id="pagebar_container"]/span').text if int(page_N) == int(current_page): print("已经在当前页，无需跳转") return True except: print("没有当前页"+str(page_N)) return False try: driver.find_element_by_xpath('//div[@id="pagebar_container"]/a[@id="sogou_page_'+str(page_N)+'"]').click() except NoSuchElementException: print("没有第"+str(page_N)+"页面") return False return True #跳到下一页 def jumpNextPage(driver): try: driver.find_element_by_xpath('//div[@id="pagebar_container"]/a[@id="sogou_next"]').click() except NoSuchElementException: print("没有下一页") return False return True #页面是否准备好 def PageIsReady(driver,key_name,page_N): #判断页面已经加载完成，并且不存在ip频繁访问页面 if pageIsLoadFinished(driver) and pageIsNomal(driver): #判断页面不存在指定的标签页 if not jumpNumPage(driver,page_N): #判断页面是否404 if pageIsNotFound(driver,key_name): #更新数据库关键字字段 update_status_sql='update `keys` set status=1 where keyword="'+key_name+'";' try: execute_update_sql(update_status_sql) except: print("更新关键字"+key_name+"的状态失败!!!") return True else: return False return True #循环抓取 while True: get_keys="SELECT id,keyword FROM keys where status=0 and is_drop=0 and type in ("+keys_search_string+") limit 100;" print(get_keys) print("获取关键字中...") try: results=execute_query_sql(get_keys) except: print("数据库查询关键字失败，停止爬虫") break print("关键字查找完成") #生成url id_keys=[ re for re in results ] for id_key in id_keys: key_id=id_key[0] key_name=id_key[1] url=index_url+key_name print("开始爬取:"+url) try: driver.get(url) except TimeoutException: continue #获取爬取key的页数 get_page_sql='select page_num from `keys` where id='+str(key_id)+';' page_N=execute_query_sql(get_page_sql)[0][0]+1 if PageIsReady(driver,key_name,page_N): if not parseWeb(driver,key_name,page_N): continue else: time.sleep(1) driver.close() driver = webdriver.Firefox(firefox_profile=profile,executable_path="geckodriver") continue #跳转到当前页爬取 #爬取完当前页更新key关键字 #判断是否有下一页继续爬取,如果只爬取一页，则注释下面的代码 ##########是否爬取搜索关键字的所有页面--start isOk=True while jumpNextPage(driver): get_page_sql='select page_num from `keys` where id='+str(key_id)+';' page_N=execute_query_sql(get_page_sql)[0][0]+1 if PageIsReady(driver,key_name,page_N): if not parseWeb(driver,key_name,page_N): isOk=False break else: isOk=False break if not isOk: time.sleep(1) driver.close() driver = webdriver.Firefox(firefox_profile=profile,executable_path="geckodriver") continue ##########是否爬取搜索关键字的所有页面--end #更新关键字状态 update_status_sql='update `keys` set status=1 where keyword="'+key_name+'";' try: execute_update_sql(update_status_sql) except: print("更新数据库关键字"+key_name+"的状态发生错误") ```