### This article explains how to crawl Sogou WeChat official account data with Python, Selenium, Firefox, and MySQL
`Note: Sogou WeChat has anti-crawling measures, and a Scrapy-based crawler is easily detected. We use Selenium instead (drawback: slow; advantage: much harder to detect).`
For installing the required software, see: https://ynotes.cn/blog/article_detail/158
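Even with Selenium, hitting Sogou at a fixed rhythm quickly triggers its IP rate limiting (the script below restarts the browser when that happens). A common mitigation is a randomized pause between page loads; the following is a minimal sketch, where the helper name `polite_sleep` and the interval bounds are illustrative assumptions rather than part of the original script:
```python
import random
import time

def polite_sleep(low=2.0, high=6.0):
    # Sleep a random interval so requests do not arrive at a fixed rate
    time.sleep(random.uniform(low, high))
```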
#### Workflow
1. The script repeatedly reads up to 100 rows from the keyword table (table `keys`), taking the keyword column (`keyword`) of rows whose type column (`type`) matches the configured keyword types.
2. Search Sogou WeChat for each keyword in turn.
3. Crawl the WeChat official accounts from the Sogou search results.
4. Check whether the results are paginated; if so, crawl page by page. After each page, update the crawled-page counter column (`page_num`); once all pages are done, update the status column of the keyword table (`status`: 0 = not crawled, 1 = crawled).
5. Insert the crawled data into the WeChat account table (`weixin_data`) (create the tables below first).
6. Mark the keyword's status as crawled.
#### Table schema
```sql
CREATE TABLE `keys` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`keyword` varchar(255) DEFAULT NULL,
`page_num` int(11) DEFAULT '0',
`status` int(11) DEFAULT '0' COMMENT '0 not searched, 1 searched, 99 discarded',
`type` varchar(255) DEFAULT NULL,
`is_drop` int(11) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
CREATE TABLE `weixin_data` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`key_id` int(11) DEFAULT NULL,
`weixin_name` varchar(255) DEFAULT NULL,
`weixin_account` varchar(255) DEFAULT NULL,
`weixin_auth_info` varchar(255) DEFAULT NULL,
`is_auth` int(11) DEFAULT NULL,
`describe` varchar(6000) DEFAULT NULL,
`img_url` varchar(255) DEFAULT NULL,
`loc_info` varchar(255) DEFAULT NULL,
`province` varchar(255) DEFAULT NULL,
`city` varchar(255) DEFAULT NULL,
`district` varchar(255) DEFAULT NULL,
`weixin_type` varchar(255) DEFAULT NULL,
`other` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `weixin_account` (`weixin_account`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
```
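Before the first run, the `keys` table needs seed keywords to search. A sample insert is shown below; the keyword values are made-up examples, and `type` must match a value in the script's `key_search_list` ('学校'):
```sql
-- example seed rows; the keyword values are placeholders
INSERT INTO `keys` (`keyword`, `type`) VALUES
('幼儿园', '学校'),
('小学', '学校'),
('培训学校', '学校');
```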
#### Crawler script
scrapy_sogou.py
```python
#coding=utf-8
from selenium import webdriver
import time
from selenium.common.exceptions import NoSuchElementException,TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
# Load an existing Firefox profile; reusing its cookies/cache makes it
# easier to crawl sites that require a login
profile = webdriver.FirefoxProfile(r'C:\Users\Administrator.GZLX-20180416SV\AppData\Roaming\Mozilla\Firefox\Profiles\yn80ouvt.default')
# If no cookies are needed, use a fresh profile instead:
#profile = webdriver.FirefoxProfile()
# Do not load stylesheets
profile.set_preference("permissions.default.stylesheet", 2)
# Do not load images
profile.set_preference("permissions.default.image", 2)
# Disable JavaScript
profile.set_preference("javascript.enabled", False)
# Proxy settings
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.http', 'xx.xx.xx.xx')
profile.set_preference('network.proxy.http_port', xxxx)
profile.set_preference('network.proxy.ssl', 'xx.xx.xx.xx')
profile.set_preference('network.proxy.ssl_port', xxxx)
profile.update_preferences()
# Database configuration
db_host = "xx.xx.xx.xx"
db_user = "root"
db_pass = "xxxx"
db_port = 3306
db_name = "weixin_data"
# Start Firefox with the profile and the geckodriver executable
driver = webdriver.Firefox(firefox_profile=profile, executable_path="geckodriver")
# Keyword types to search (values of the `type` column in table `keys`)
key_search_list = ['学校']
index_url = 'https://weixin.sogou.com/weixin?query='
# Build the quoted, comma-separated list for the SQL IN clause
keys_search_string = ",".join("'" + key + "'" for key in key_search_list)
class AnyEc:
    """Use with WebDriverWait to combine expected_conditions in an OR."""
    def __init__(self, *args):
        self.ecs = args
    def __call__(self, driver):
        for fn in self.ecs:
            try:
                if fn(driver):
                    return True
            except Exception:
                pass
def execute_query_sql(sql):
    # Open a connection, run the query, and return all rows
    db = pymysql.connect(host=db_host, port=db_port, user=db_user, passwd=db_pass, db=db_name)
    # Create a cursor object
    cursor = db.cursor()
    # Execute the SQL
    cursor.execute(sql)
    results = cursor.fetchall()
    # Close the connection
    db.close()
    return results
def execute_update_sql(sql):
    # Open a connection, run the statement, and commit
    db = pymysql.connect(host=db_host, port=db_port, user=db_user, passwd=db_pass, db=db_name)
    # Create a cursor object
    cursor = db.cursor()
    # Execute the SQL
    cursor.execute(sql)
    # Commit the transaction
    db.commit()
    # Close the connection
    db.close()
# Parse one search-result page and store every account found
def parseWeb(driver, key_name, page_N):
    print("Extracting keyword " + key_name + ", page " + str(page_N) + ":")
    for page in driver.find_elements_by_xpath('//ul[@class="news-list2"]/li'):
        weixin_name = page.find_element_by_xpath('./div[@class="gzh-box2"]/div[@class="txt-box"]/p[@class="tit"]/a').text
        img_url = page.find_element_by_xpath('./div[@class="gzh-box2"]/div[@class="img-box"]/a/img').get_attribute("src")
        weixin_account = page.find_element_by_xpath('./div[@class="gzh-box2"]/div[@class="txt-box"]/p[@class="info"]/label').text
        weixin_auth_info = ""
        try:
            page.find_element_by_xpath('./dl[2]/dt[contains(text(),"微信认证")]')
            dl_info = page.find_element_by_xpath('./dl[2]/dt').text
            if '微信认证' in dl_info:
                weixin_auth_info = page.find_element_by_xpath('.//dl[2]/dd').text
        except NoSuchElementException:
            weixin_auth_info = ""
        print("WeChat verification: " + weixin_auth_info)
        # A badge icon in the title marks a verified account
        try:
            page.find_element_by_xpath('./div[@class="gzh-box2"]/div[@class="txt-box"]/p[@class="tit"]/i')
            is_auth = 1
        except NoSuchElementException:
            is_auth = 0
        try:
            describe = page.find_element_by_xpath('.//dl[1]/dd').text
        except NoSuchElementException:
            describe = ""
        # Insert the row into the database; key_id is set by the main loop
        # below (see the parameterized variant after the script for a safer
        # way to build this statement)
        insert_sql = 'insert into weixin_data(key_id,weixin_name,weixin_account,weixin_auth_info,is_auth,img_url,`describe`,weixin_type,other) values(' + str(key_id) + ',"' + weixin_name + '","' + weixin_account + '","' + weixin_auth_info + '",' + str(is_auth) + ',"' + img_url + '","' + describe + '","' + '培训机构' + '",' + 'NULL' + ');'
        #print(insert_sql)
        try:
            print("Inserting row: " + weixin_name)
            execute_update_sql(insert_sql)
        except:
            print("Insert failed, probably a duplicate account")
    # Record the page that was just crawled
    update_sql = 'update `keys` set page_num=' + str(page_N) + ' where keyword="' + key_name + '";'
    print(update_sql)
    try:
        execute_update_sql(update_sql)
    except:
        print("Failed to update the crawled page counter")
        return False
    return True
# Wait until the page reaches one of three known states:
# results, an IP-blocked page, or a no-result page
def pageIsLoadFinished(driver):
    try:
        WebDriverWait(driver, 10).until(AnyEc(
            EC.presence_of_element_located(
                (By.XPATH, u'//div[@class="gzh-box2"]/div[@class="img-box"]/a/img')),
            EC.presence_of_element_located(
                (By.XPATH, u'//p[@class="ip-time-p"]')),
            EC.presence_of_element_located(
                (By.XPATH, u'//div[@id="noresult_part1_container"]'))
        ))
        return True
    except TimeoutException:
        return False
# Check the page is not the "too many requests from this IP" page
def pageIsNormal(driver):
    try:
        driver.find_element_by_xpath('//p[@class="ip-time-p"]')
        print("IP is rate-limited, restarting the browser")
        time.sleep(3)
        return False
    except NoSuchElementException:
        return True
# Check whether the search returned no results
def pageIsNotFound(driver, key_name):
    try:
        driver.find_element_by_xpath('//div[@id="noresult_part1_container"]')
        print("Keyword " + key_name + " returned no results, moving on to the next keyword")
        return True
    except NoSuchElementException:
        return False
# Jump to the given page of the pagination bar
def jumpNumPage(driver, page_N):
    # Already on the target page?
    try:
        current_page = driver.find_element_by_xpath('//div[@id="pagebar_container"]/span').text
        if int(page_N) == int(current_page):
            print("Already on the target page, no jump needed")
            return True
    except:
        print("No current-page marker for page " + str(page_N))
        return False
    try:
        driver.find_element_by_xpath('//div[@id="pagebar_container"]/a[@id="sogou_page_' + str(page_N) + '"]').click()
    except NoSuchElementException:
        print("No page " + str(page_N) + " in the pagination bar")
        return False
    return True
# Jump to the next page
def jumpNextPage(driver):
    try:
        driver.find_element_by_xpath('//div[@id="pagebar_container"]/a[@id="sogou_next"]').click()
    except NoSuchElementException:
        print("No next page")
        return False
    return True
# Check that the page is usable and positioned on the page we want
def pageIsReady(driver, key_name, page_N):
    # The page must have finished loading and not be the IP-blocked page
    if pageIsLoadFinished(driver) and pageIsNormal(driver):
        # Try to jump to the target page of the pagination bar
        if not jumpNumPage(driver, page_N):
            # No such page: is it a no-result page?
            if pageIsNotFound(driver, key_name):
                # No results for this keyword: mark it as done
                update_status_sql = 'update `keys` set status=1 where keyword="' + key_name + '";'
                try:
                    execute_update_sql(update_status_sql)
                except:
                    print("Failed to update the status of keyword " + key_name + "!!!")
                return True
            else:
                return False
        return True
    return False
# Main crawl loop
while True:
    get_keys = "SELECT id,keyword FROM `keys` where status=0 and is_drop=0 and type in (" + keys_search_string + ") limit 100;"
    print(get_keys)
    print("Fetching keywords...")
    try:
        results = execute_query_sql(get_keys)
    except:
        print("Keyword query failed, stopping the crawler")
        break
    print("Keywords fetched")
    # Build a search URL for each keyword
    id_keys = list(results)
    for id_key in id_keys:
        key_id = id_key[0]
        key_name = id_key[1]
        url = index_url + key_name
        print("Crawling: " + url)
        try:
            driver.get(url)
        except TimeoutException:
            continue
        # Resume from the page after the last one crawled
        get_page_sql = 'select page_num from `keys` where id=' + str(key_id) + ';'
        page_N = execute_query_sql(get_page_sql)[0][0] + 1
        if pageIsReady(driver, key_name, page_N):
            if not parseWeb(driver, key_name, page_N):
                continue
        else:
            # Rate-limited or broken page: restart the browser
            time.sleep(1)
            driver.quit()
            driver = webdriver.Firefox(firefox_profile=profile, executable_path="geckodriver")
            continue
        # Follow the pagination and crawl every remaining page, updating the
        # keyword's page counter as we go; to crawl only the first page per
        # keyword, comment out the block below
        ########## crawl all result pages for this keyword -- start
        isOk = True
        while jumpNextPage(driver):
            get_page_sql = 'select page_num from `keys` where id=' + str(key_id) + ';'
            page_N = execute_query_sql(get_page_sql)[0][0] + 1
            if pageIsReady(driver, key_name, page_N):
                if not parseWeb(driver, key_name, page_N):
                    isOk = False
                    break
            else:
                isOk = False
                break
        if not isOk:
            # Something went wrong: restart the browser and move on
            time.sleep(1)
            driver.quit()
            driver = webdriver.Firefox(firefox_profile=profile, executable_path="geckodriver")
            continue
        ########## crawl all result pages for this keyword -- end
        # All pages done: mark the keyword as crawled
        update_status_sql = 'update `keys` set status=1 where keyword="' + key_name + '";'
        try:
            execute_update_sql(update_status_sql)
        except:
            print("Failed to update the status of keyword " + key_name)
```
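One caveat: the script assembles its INSERT statement by string concatenation, so a stray quote in an account name or description breaks the SQL (and the bare `except` around the insert masks this as a "duplicate"). A safer approach is to let pymysql bind the values. The following is a minimal sketch; `execute_update_sql_params` is an illustrative helper that reuses the script's `db_*` settings, not part of the original code:
```python
def execute_update_sql_params(sql, params):
    # Same connect/commit/close pattern as execute_update_sql, but pymysql
    # escapes the bound values instead of trusting concatenated strings
    db = pymysql.connect(host=db_host, port=db_port, user=db_user, passwd=db_pass, db=db_name)
    cursor = db.cursor()
    cursor.execute(sql, params)
    db.commit()
    db.close()

# Inside parseWeb, %s placeholders would replace the quoted concatenation:
insert_sql = ('insert into weixin_data(key_id,weixin_name,weixin_account,'
              'weixin_auth_info,is_auth,img_url,`describe`,weixin_type,other) '
              'values(%s,%s,%s,%s,%s,%s,%s,%s,NULL);')
execute_update_sql_params(insert_sql, (key_id, weixin_name, weixin_account,
                                       weixin_auth_info, is_auth, img_url,
                                       describe, '培训机构'))
```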