YNOTES笔记

### 流程: - 1.用户在线编辑或者上传代码 - 2.用户点击执行按钮 - 2.1 首先把页面的代码保存到执行目录 - 2.2 ajax调用代码运行API (注:获取访问pygame的链接) - 2.2.1 API收到请求做处理 - 2.2.1.1 运行pygame(docker)生成可访问链接 - 2.2.2 API返回链接 - 2.3 ajax页面获取到API调用结果 - 2.4 把链接加载到页面Iframe展示 &emsp; ### 上传代码和保存代码 #### 上传功能代码视图view.py(注:只提取功能相关代码) ```python @login_required def pygame(request): upload_error='' code='' code_dir='' if request.method == 'POST': myfile=request.FILES.get('myfile', None) if myfile: myfile = request.FILES['myfile'] fs = FileSystemStorage() file_path=os.path.join('pygame',str(request.user.id)) filename = fs.save(file_path, myfile) temp_dir=os.path.join(os.path.dirname(fs.path(filename)),str(request.user.id),'tempDir') #判断临时目录是否存在存在则删除目录，创建目录 if os.path.exists(temp_dir): shutil.rmtree(temp_dir) os.mkdir(temp_dir) #判断文件是否为zip if zipfile.is_zipfile(fs.path(filename)): zip_ref = zipfile.ZipFile(fs.path(filename), 'r') zip_ref.extractall(temp_dir) is_exist_main_file=0 for fname in zip_ref.namelist(): if 'main.py' in fname: main_file=os.path.join(temp_dir,fname) is_exist_main_file=1 if is_exist_main_file: code=fs.open(main_file,'r').read() code_dir=os.path.dirname(main_file) else: upload_error='压缩包根目录下不存在main.py' zip_ref.close() fs.delete(filename) else: upload_error='文件不是压缩文件' return render(request, 'blog/pygame.html',{'upload_error': upload_error,'code':code,'code_dir':code_dir}) ``` &emsp; 模板blog/pygame.html(注:只提取功能相关代码) ```html <form method="post" class="form-inline" enctype="multipart/form-data"> {% csrf_token %} <div class="form-group"> <input type="file" id="myfile" name="myfile"> </div> <div class="form-group"> <button class="btn btn-primary" type="submit" >上传</button> </div> <label for="myfile">注意:请上传zip压缩包，且压缩包根目录包含main.py</label> </form> ``` &emsp; ### 保存功能代码 #### 视图view.py(注:只提取功能相关代码) ```python def pygame_save_code(request): code_dir=request.POST.get('codeDir','') code_content=request.POST.get('codeContent','') if not code_content: HttpResponse('{"error":"代码为空"}') #判断是否有codeDir，如果有则直接写入到当前目录，如果没有写入一个用户随机目录 if not code_dir: datetime_str=datetime.now().strftime('%Y_%m_%d_%H_%M_%S') create_dir=os.path.join(django_settings.MEDIA_ROOT,'pygame',str(request.user.id),datetime_str) os.mkdir(create_dir) code_dir=create_dir #把代码写入文件，返回写入的目录 file=open(os.path.join(code_dir,'main.py'),'w+') file.write(code_content) file.close() return HttpResponse({'{"NewCodeDir":"'+code_dir+'"}'}) ``` &emsp; #### 模板blog/pygame.html(注:只提取功能相关代码) ```html //代码编辑窗口 <textarea class="form-control line_number" id="yourcode" cols="80" rows="30"></textarea> //pygame展示窗口 <iframe id="pygame_iframe" src="" ></iframe> <button class="btn btn-success" id="pygame_run" type="button">运行</button>:write //处理代码运行逻辑 $('#pygame_run').click(function() { var code_content=$("#yourcode").val(); var codedir=$("#codeDir").val(); var token = $('input[name=csrfmiddlewaretoken]').val(); //调用保存编辑pygame代码API $.post('/blog/pygame_save_code/', { codeDir: codedir, codeContent: code_content, csrfmiddlewaretoken: token}, function(returnedData){ var obj = jQuery.parseJSON(returnedData); var NewCodeDir=obj.NewCodeDir; $("#codeDir").val(NewCodeDir); setTimeout( function() { var codedir=$("#codeDir").val(); //调用执行pygame程序的API $.post('https://pygame.ynotes.cn:3443/pygame/', { codeDir: codedir}, function(returnedData){ var obj = jQuery.parseJSON(returnedData); if(obj.hasOwnProperty('url')){ $('#pygame_iframe').attr('src', obj.url); $('#pygame_text').text('运行成功'); $('#pygame_text').css('color', 'green'); } if(obj.hasOwnProperty('error')){ console.log(obj.error); $('#pygame_text').text(obj.error); $('#pygame_text').css('color', 'red'); } }).fail(function(){ console.log("error"); }); }, 100); }).fail(function(){ console.log("error"); }); }); ``` &emsp; ### 生成访问pygame链接 #### 视图view.py ```python def index(request): code_dir=request.POST.get('codeDir','nodir') command='python /data/app/pygame/pygame_url.py '+code_dir p = subprocess.Popen(command,stdout=subprocess.PIPE,shell=True) pygame_out=p.stdout.read() return HttpResponse(pygame_out) ``` &emsp; #### 生成pygame访问链接的python脚本 pygame_url.py ```python #coding:utf-8 import sys import os import random import subprocess import hashlib import time pygame_image='codemax/pygame:v3' sample_url='https://pygame.ynotes.cn:2443/vnc_lite.html?path=?token=' if len(sys.argv) < 2: print('{ "error":"请指定pygame程序目录"}') exit(1) pygame_dir=sys.argv[1] if not os.path.isdir(pygame_dir): print('{ "error":"'+pygame_dir+'程序目录不存在."}') exit(1) pygame_main=os.path.join(pygame_dir,'main.py') if not os.path.exists(pygame_main): print('{ "error":"'+pygame_dir+'程序目录不存在文件main.py."}') exit(1) source = open(pygame_main, 'r').read() + '\n' try: code_obj=compile(source, pygame_main, 'exec') except Exception as e: print('{"error":"'+str(e)+'"}') exit(1) def get_free_random_port(): command="netstat -ltnp |grep x11vnc|awk '{ print $4}'|awk -F: '{ print $NF}'" p = subprocess.Popen(command,stdout=subprocess.PIPE,stderr=subprocess.STDOUT,shell=True) usage_port_list=[ int(port) for port in p.stdout.read().split('\n') if port != '' ] free_post_list=set(range(5000,6000))-set(usage_port_list) return random.choice(list(free_post_list)) free_random_port=get_free_random_port() docker_out = subprocess.Popen(['docker', 'run', '-d', '-it','--cpus=0.2','-p', str(free_random_port)+':5900', '-v', pygame_dir+':/run_game', pygame_image], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) stdout,stderr = docker_out.communicate() time.sleep(1) containter_id=stdout.strip() p1 = subprocess.Popen('docker logs --tail 100 '+containter_id,stdout=subprocess.PIPE,shell=True) p2 = subprocess.Popen('grep -A 1000 Traceback',stdin=p1.stdout,stdout=subprocess.PIPE,shell=True) docker_out=p2.stdout.read() if docker_out: print('{"error":"'+docker_out.replace('\r\n','\\n').replace('"','\'')+'"}') else: token=hashlib.md5(str(free_random_port)).hexdigest() url=sample_url+token+'&password=password' print('{"url":"'+url+'"}') ``` &emsp; ### 构建pygame镜像(拥有运行pygame代码) #### DockerFile ```yaml FROM ubuntu:xenial LABEL maintainer "sheyinsong@unotes.co" EXPOSE 5900 6099 ENV \ DEBIAN_FRONTEND="nonintractive" \ X11VNC_PASSWORD="password" RUN apt-get update -y RUN apt-get install -y xvfb x11vnc python libglib2.0-0 ADD ./get-pip.py /opt/get-pip.py RUN python /opt/get-pip.py ADD ./pygame-1.9.6-cp27-cp27mu-manylinux1_x86_64.whl /opt/pygame-1.9.6-cp27-cp27mu-manylinux1_x86_64.whl RUN pip install /opt/pygame-1.9.6-cp27-cp27mu-manylinux1_x86_64.whl ADD ./start.sh /opt/start.sh ENTRYPOINT /opt/start.sh ``` #### 文件start.sh ```bash #!/bin/sh Xvfb :1 -screen 0 800x600x16 & sleep 0.1 /usr/bin/x11vnc -display :1.0 -passwd ${X11VNC_PASSWORD:-password} & sleep 0.1 DISPLAY=:1.0 export DISPLAY cd /run_game timeout 3600 python main.py ``` &emsp; ### 构建noVNC镜像(用于html访问pygaem(docker)中的GUI界面) #### DockerFile ```yaml FROM python MAINTAINER sys "sheyinsong@unotes.co" RUN apt-get update -y RUN apt-get upgrade -y RUN apt-get install -y python-numpy RUN apt-get clean RUN pip install redis RUN pip install simplejson ADD ./noVNC/ /noVNC/ CMD ["python", "/noVNC/utils/websockify/run", "--web", "/noVNC", "0.0.0.0:10240", "--target-config", "/noVNC/token.list"] ``` &emsp; #### noVNC下载链接 https://github.com/novnc/noVNC/releases

### 介绍终端使用的是asyncio，单线程支持并发操作。聊天终端,暂时仅提供登陆/注销/发送在线消息/发送离线消息/帮助功能(未完待续) 登陆命令：auth userid password (userid为用户账号,用户数据存储在mysql中) 注销命令：logout 发送消息：msg userid message (支持在线和离线消息，离线消息,当对方不在线时，存储到 redis中,等用户上线之后再推送给该用户) 帮助命令：help/? ### 环境系统:`CentOS7` 编程语言:`Python` 语言解释器:`Cpython-3.7.2` 数据库:`mysql-5.7` `redis-5.0.3` 缓存:`redis-5.0.3` ### 表结构 #### user表 ```sql CREATE TABLE `user` ( `id` int(11) NOT NULL AUTO_INCREMENT, `username` varchar(255) NOT NULL, `password` varchar(255) NOT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=91 DEFAULT CHARSET=utf8; ``` ### 服务器端代码 chat_server.py ```py #coding:utf-8 ###################### # 作者: SheYinsong # # 时间: 2019-03-30 # ###################### import asyncio import aiomysql import aioredis import sys import struct import ssl import time import logging SERVER_ADDRESS = ('0.0.0.0',10000) logging.basicConfig( level=logging.DEBUG, format='%(name)s: %(message)s', stream=sys.stderr, ) log=logging.getLogger('main') #LOOP loop=asyncio.get_event_loop() #TLS配置 CERTFILE='ssl/server.crt' KEYFILE='ssl/server.key' #在线用户userid到读写Stream映射 auth_userid_map_wstream={} auth_wstream_map_userid={} #mysql 配置 db_config={ 'host':'localhost', 'port':3306, 'user':'root', 'password':'xxxxxxxx', 'db':'chat', 'charset':'utf8' } #SQL 模板 QUERY_AUTH_SQL='select password from user where id=%s' QUERY_USERID_IS_EXISTS_SQL='select id from user where id=%s' QUERY_USERNAME_OF_USERID_SQL='select username from user where id=%s' #redis配置 #redis_host='localhost' #redis_port=6379 redis_unix_socket='/var/run/redis/redis.sock' #redis KEY模板 KV_EXISTS_USERID=u'kv_exists_userid:userid:%s' KV_USERID_GET_USERNAME=u'kv_userid_get_username:userid:%s' LIST_USERMESSAGES_OF_USERID=u'list_usermessages_of_userid:userid:%s' #支持的命令列表 cmd_list=['auth','msg','logout'] #命令帮助 cmd_help=""" =============================================== | | | 命令使用说明: | | | =============================================== |命令参数参数 (说明) | =============================================== |auth UID PASSWORD (登录系统) | |msg UID MESSAGE (给用户发送消息) | =============================================== |命令 (说明) | =============================================== |logout (退出) | =============================================== """ async def get_custom_time_string(t_format="%Y-%m-%d %X"): """获取指定格式的时间字符串默认格式为"%Y-%m-%d %X" """ return time.strftime(t_format) async def get_db_pool(): """获取mysql连接池""" pool = await aiomysql.create_pool(**db_config) return pool async def get_redis_pool(): """获取redis连接池""" pool = await aioredis.create_pool( redis_unix_socket, minsize=5, maxsize=10) return pool async def db_query(sql): """mysql执行查询操作""" pool = await get_db_pool() results=() async with pool.acquire() as conn: async with conn.cursor() as cur: log.debug(f'查询数据库,SQL->[ {sql} ]') await cur.execute(sql) results = await cur.fetchall() pool.close() await pool.wait_closed() return results async def redis_query(cmd,*args): """redis执行查询操作""" pool=await get_redis_pool() with await pool as conn: results =await conn.execute(cmd,*args) pool.close() await pool.wait_closed() return results async def redis_dml(cmd,*args): """redis执行增删改操作""" pool=await get_redis_pool() with await pool as conn: await conn.execute(cmd,*args) pool.close() await pool.wait_closed() async def get_username_of_userid(userid): """获取userid对应的用户名""" if await redis_query('exists',KV_USERID_GET_USERNAME % userid): username=await redis_query('get',KV_USERID_GET_USERNAME % userid) return username.decode('utf-8') else: res=await db_query(QUERY_USERNAME_OF_USERID_SQL % userid) if res: username=res[0][0] await redis_dml('set',KV_USERID_GET_USERNAME % userid,username) return username return '' async def get_userid_of_wstream(wstream): """通过writestream获取userid""" try: userid=auth_wstream_map_userid[wstream] return userid except KeyError: return -1 async def get_wstream_of_userid(userid): """通过userid获取writestream""" try: write_stream=auth_userid_map_wstream[userid] return write_stream except KeyError: return '' async def write_data(writer,data): """发送数据""" b_data=bytes(data,encoding='utf-8') b_datasize=struct.pack('H',len(b_data)) try: writer.write(b_datasize+b_data) await writer.drain() return True except: await clean_wstream(writer) return False async def read_data(reader): """读取数据""" try: length=struct.unpack('H',await reader.read(2))[0] b_data=await reader.read(int(length)) return b_data.decode('utf-8') except Exception as e: log.debug(f'读取客户端数据异常:[{e}]') return '' async def clean_wstream(writer): """清理客户端登陆信息和关闭stream""" userid=await get_userid_of_wstream(writer) if userid != -1: try: del auth_userid_map_wstream[userid] except: pass try: del auth_wstream_map_userid[writer] except: pass writer.close() async def user_auth(*args): """用户认证""" userid=int(args[0]) pwd=args[1] sql=QUERY_AUTH_SQL % (userid) res=await db_query(sql) if res: if res[0][0] == pwd: return True return False async def userid_is_exists(userid): """"判断userid是否存在""" if await redis_query('exists',KV_EXISTS_USERID % userid): return True else: res=await db_query(QUERY_USERID_IS_EXISTS_SQL % userid) if res: await redis_dml('set',KV_EXISTS_USERID % userid,'') return True return False async def userid_is_online(userid): """判断用户是否在线""" try: writer=auth_userid_map_wstream[userid] return True except KeyError: return False async def offline_userid(operator_type,*args): """下线用户""" userid=int(args[0]) try: writer=auth_userid_map_wstream[userid] if operator_type == 0: msg='有用户登录您的账号,您已被挤下线！' elif operator_type == 1: msg='您已退出登录！' else: msg='您已退出系统！' await write_data(writer,msg) except Exception as e: log.debug(e) finally: try: del auth_userid_map_wstream[userid] except: log.debug('auth_userid_map_wstream 删除失败') try: del auth_wstream_map_userid[writer] except: log.debug('auth_wstream_map_userid 删除失败') async def send_user_msg(writer,*args): """发送用户消息""" to_userid=int(args[0]) from_userid=auth_wstream_map_userid[writer] content=' '.join(args[1:]) if not await userid_is_exists(to_userid): msg='系统不存在该userid!' await write_data(writer,msg) return from_username=await get_username_of_userid(from_userid) to_username=await get_username_of_userid(to_userid) custom_time=await get_custom_time_string() send_content=f'{custom_time}\n[ {from_username}|UID:{from_userid} ]:{content}' if to_userid == from_userid: send_content=f'{custom_time}\n[ {from_username}|UID:{from_userid} ]:{content}' await write_data(writer,send_content) return if await userid_is_online(to_userid): to_writer=auth_userid_map_wstream[to_userid] if await write_data(to_writer,send_content): await write_data(writer,send_content) return #用户不在线，发送离线消息 print("用户不在线，发送离线消息") await redis_dml('rpush',LIST_USERMESSAGES_OF_USERID % to_userid,send_content) await write_data(writer,f'[离线消息]\n{send_content}') async def user_is_auth(writer): """判断用户是否认证""" if writer in auth_wstream_map_userid.keys(): return True return False async def get_offline_msg_of_userid(userid): "获取userid对应的离线消息" return await redis_query('lrange',LIST_USERMESSAGES_OF_USERID % userid,0,-1) async def push_offline_msg(writer,*args): """推送离线消息""" userid=int(args[0]) user_offline_messages_list=await get_offline_msg_of_userid(userid) for user_offline_message in user_offline_messages_list: await write_data(writer,user_offline_message.decode('utf-8')) await redis_query('del',LIST_USERMESSAGES_OF_USERID % userid) async def update_auth(writer,*args): """更新auth_userid_map_wstream和auth_wstream_map_userid""" userid=int(args[0]) auth_userid_map_wstream[userid]=writer auth_wstream_map_userid[writer]=userid async def check_args_validity(writer,cmd,*args): if cmd == 'auth': if len(args) != 2: msg='提示:认证格式: [ auth userid pwd ]' await write_data(writer,msg) return False try: userid=int(args[0]) except: msg='userid为整数!' await write_data(writer,msg) return False elif cmd == 'msg': if len(args) < 2: msg='提示:消息格式: [ msg userid content ]' await write_data(writer,msg) return False try: to_userid=int(args[0]) except: msg='userid为整数!' await write_data(writer,msg) return False return True async def handler_client(reader,writer): """处理客户端连接""" address=writer.get_extra_info('peername') log=logging.getLogger('echo_{}_{}'.format(*address)) log.debug('收到新连接') args_ok=True cmd_ok=True while True: if not await user_is_auth(writer) and cmd_ok and args_ok: msg=u'请登陆！' await write_data(writer,msg) data=await read_data(reader) if not data: break cmd=data.split(' ')[0] args=data.split(' ')[1:] if cmd == '?' or cmd == 'help': await write_data(writer,f'{cmd_help}') cmd_ok=False continue elif cmd not in cmd_list: await write_data(writer,'输入的命令不支持,请使用 help 或 ? 查看帮助!') cmd_ok=False continue cmd_ok=True if not await user_is_auth(writer): if cmd != 'auth': continue if not await check_args_validity(writer,cmd,*args): args_ok=False continue args_ok=True if await user_auth(*args): await offline_userid(0,*args) await update_auth(writer,*args) msg='认证成功!' await write_data(writer,msg) await push_offline_msg(writer,*args) else: msg='认证失败!' await write_data(writer,msg) else: if not await check_args_validity(writer,cmd,*args): args_ok=False continue args_ok=True if cmd == 'msg': await send_user_msg(writer,*args) elif cmd == 'logout': userid=await get_userid_of_wstream(writer) await offline_userid(1,userid) elif cmd == 'auth': await write_data(writer,'您已登录！') context=ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) context.load_cert_chain(certfile=CERTFILE,keyfile=KEYFILE) factory=asyncio.start_server(handler_client,*SERVER_ADDRESS,ssl=context) server=loop.run_until_complete(factory) log.debug('starting up on {} port {}'.format(*SERVER_ADDRESS)) try: loop.run_forever() except KeyboardInterrupt: pass finally: log.debug('关闭服务器.') server.close() loop.run_until_complete(server.wait_closed()) log.debug('关闭事件循环LOOP.') loop.close() ``` ### 客户端 chat_client.py ```py #coding:utf-8 ###################### # 作者: SheYinsong # # 时间: 2019-03-30 # ###################### import asyncio from aioconsole import ainput import concurrent.futures import sys import ssl import time import struct import logging #日志配置 logging.basicConfig( level=logging.DEBUG, format='%(name)s: %(message)s', stream=sys.stderr, ) log = logging.getLogger('main') #TLS配置 CERTFILE='ssl/server.crt' #chat服务器配置 SERVER_ADDRESS = ('chat.unotes.co',10000) #证书使用的chat.unotes.co域名,根据实际证书域名填写 loop = asyncio.get_event_loop() async def write_data(writer,data): """发送数据""" b_data=bytes(data,encoding='utf-8') b_datasize=struct.pack('H',len(b_data)) try: writer.write(b_datasize+b_data) await writer.drain() return True except: return False async def read_data(reader): """读取数据""" try: length=struct.unpack('H',await reader.read(2))[0] b_data=await reader.read(int(length)) return b_data.decode('utf-8') except Exception as e: log.debug(f'读取客户端数据异常:[{e}]') return '' async def read(reader,once=False): """循环读取数据""" while True: data=await read_data(reader) if data: if once: print(f'{data}\n',end='') break print(f'\x08\x08\x08{data}\n>>>',end='') async def chat_client(address): """聊天客户端""" log = logging.getLogger('chat_client') log.debug('connecting to {} port {}'.format(*address)) context=ssl.create_default_context(ssl.Purpose.SERVER_AUTH) context.load_verify_locations(CERTFILE) reader, writer = await asyncio.open_connection(*address,ssl=context) await read(reader,True) read_task=asyncio.create_task(read(reader)) while True: data=await ainput('>>>') if not data: data='?' await write_data(writer,data) try: loop.run_until_complete( chat_client(SERVER_ADDRESS) ) except: pass finally: log.debug('closing event loop') loop.close() ``` ### 运行服务器 ```bash $ python chat_server_asyncio_tls.py ``` ``` asyncio: Using selector: EpollSelector main: starting up on 0.0.0.0 port 10000 ``` ### 运行客户端1 #### 用户张三 ```bash python chat_client_asyncio_tls.py ``` ``` 请登陆！ >>>auth 10001 123456 认证成功! >>>msg 10002 你好，我是张三 [离线消息] 2019-03-29 17:00:24 [ 张三|UID:10001 ]:你好，我是张三 >>> ``` ### 运行客户端2 #### 用户李四 ```bash python chat_client_asyncio_tls.py ``` ``` 请登陆！ >>>auth 10002 123456 认证成功! 2019-03-29 17:00:24 [ 张三|UID:10001 ]:你好，我是张三 >>>msg 10001 你好，张三，我收到您的消息 2019-03-29 17:02:52 [ 李四|UID:10002 ]:你好，张三，我收到您的消息 >>> ``` #### 查看客户端1 ### 用户张三 ```bash $ python chat_client_asyncio_tls.py ``` ``` 请登陆！ >>>auth 10001 123456 认证成功! >>>msg 10002 你好，我是张三 [离线消息] 2019-03-29 17:00:24 [ 张三|UID:10001 ]:你好，我是张三 2019-03-29 17:02:52 [ 李四|UID:10002 ]:你好，张三，我收到您的消息 ```

#### 分析微信认证名中一般都会附带地区名，通过数据库的省市县名和微信名做存在判断（```注意:此方法存在一定概率的误判```） #### 准备需要省、市、县数据表的数据，需要的话请联系博主 #### 步骤 1.判断对应的省名是否在微信认证名中，如果在，则匹配出数据,并且判断如果是直辖市，获取对应的市级名字,id 2.如果省没匹配成功，则匹配市级数据的名字，如果成功，则获取对应的省名及对应省id 3.如果省市都没匹配成功，则匹配县区级数据的名字，如果成功，则获取对应的省市名及对应省市id ##### 数据表结构 ```sql CREATE TABLE `province` ( `id` bigint(19) NOT NULL AUTO_INCREMENT, `name` varchar(32) NOT NULL COMMENT '省份名称', `short_name` varchar(32) DEFAULT NULL COMMENT '省份简称', `remark` varchar(255) DEFAULT NULL COMMENT '备注', `created_at` datetime NOT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=36 DEFAULT CHARSET=utf8; CREATE TABLE `city` ( `id` bigint(19) NOT NULL AUTO_INCREMENT COMMENT '标识', `name` varchar(32) NOT NULL COMMENT '城市名称', `short_name` varchar(32) DEFAULT NULL COMMENT '城市简称', `province_id` bigint(19) NOT NULL COMMENT '所属省份标识', `level` int(10) NOT NULL COMMENT '城市等级（0未知，1:一线，2：二线，3：三线，4：四线）', `remark` varchar(255) DEFAULT NULL COMMENT '备注', `created_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', PRIMARY KEY (`id`), UNIQUE KEY `name` (`name`,`province_id`) USING BTREE ) ENGINE=InnoDB AUTO_INCREMENT=7471 DEFAULT CHARSET=utf8; CREATE TABLE `district` ( `id` bigint(19) NOT NULL AUTO_INCREMENT COMMENT '标识', `name` varchar(32) NOT NULL COMMENT '区县名称', `short_name` varchar(32) DEFAULT NULL COMMENT '区县简称', `city_id` bigint(19) NOT NULL COMMENT '所属城市标识', `remark` varchar(255) DEFAULT NULL COMMENT '备注', `created_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', PRIMARY KEY (`id`), UNIQUE KEY `name` (`name`,`city_id`) USING BTREE ) ENGINE=InnoDB AUTO_INCREMENT=10069 DEFAULT CHARSET=utf8; ``` ##### python脚本 ```python vim get_loc.py ``` ```python #coding:utf-8 import pymysql db_host="xx.xx.xx.xx" db_user="root" db_pass="xxxxx" db_port=3306 db_name="xxxx" #微信认证名 file=open("weixin_auth.txt",'r',encoding="utf-8") def execute_query_sql(sql): #循环读取数据库状态是0的关键字100个 db= pymysql.connect(host=db_host,port=db_port,user=db_user, passwd=db_pass, db=db_name) # 使用 cursor() 方法创建一个游标对象 cursor cursor = db.cursor() #执行sql cursor.execute(sql) results=cursor.fetchall() # 关闭数据库连接 db.close() return results n=1 for f in file.readlines(): #获取省 pro_sql='select id,name from province;' result=execute_query_sql(pro_sql) is_has_pro=False pro_id=-1 city_id=-1 district_id=-1 pro_name="" for result_entry in result: if result_entry[1][0:2] in f: is_has_pro=True pro_name=result_entry[1] pro_id=result_entry[0] #这里写死id if pro_name=="北京市": city_id=7151 if pro_name=="上海市": city_id=7122 if pro_name=="天津市": city_id=7182 if pro_name=="重庆市": city_id=7430 break #获取市 is_has_city=False city_name="" if not is_has_pro: city_sql='select id,name from city where province_id>0;' result=execute_query_sql(city_sql) for result_entry in result: if result_entry[1].replace("市","").replace("县","") in f: is_has_city=True city_name=result_entry[1] city_id=result_entry[0] #获取省id pro_id_sql='select province_id from city where id='+str(city_id)+';' result=execute_query_sql(pro_id_sql) pro_id=result[0][0] break #获取区县 is_has_district=False district_name="" if not is_has_city and not is_has_pro: district_sql='select id,name from district where city_id>0;' result=execute_query_sql(district_sql) for result_entry in result: if result_entry[1].replace("市","").replace("县","") in f: is_has_district=True district_name=result_entry[1] district_id=result_entry[0] #获取市id city_id_sql='select city_id from district where id='+str(district_id)+';' result=execute_query_sql(city_id_sql) city_id=result[0][0] #获取省id pro_id_sql='select province_id from city where id='+str(city_id)+';' result=execute_query_sql(pro_id_sql) pro_id=result[0][0] break #打印对应的id #获取名字 if pro_id>0: sql='select name from province where id='+str(pro_id)+';' result=execute_query_sql(sql) pro_name=result[0][0] if city_id>0: sql='select name from city where id='+str(city_id)+';' result=execute_query_sql(sql) city_name=result[0][0] if district_id >0: sql='select name from district where id='+str(district_id)+';' result=execute_query_sql(sql) district_name=result[0][0] n+=1 print("当前执行到第"+str(n)+"行") print(f.strip()+","+str(pro_id)+","+str(city_id)+","+str(district_id)+","+pro_name+","+city_name+","+district_name+"\n") #把数据保存在文件中 file2=open("weixin_auth_loc.txt",'a',encoding="utf-8") file2.write(f.strip()+","+str(pro_id)+","+str(city_id)+","+str(district_id)+","+pro_name+","+city_name+","+district_name+"\n") file2.close() ```

### 本文讲解通过python selenium firefox mysql的方式爬取搜狗微信公众号数据 `说明:搜狗微信的反爬虫，scrapy框架爬取易被检测,使用selenium的方式（缺点：慢。优点:不易被检测到）。` 安装相关软件教程参考:https://ynotes.cn/blog/article_detail/158 #### 流程: 1.脚本循环查询关键字表(table keys)中关键字类型字段(column type)所对应的关键字字段(column keyword)前100条数据 2.通过获取关键字循环去搜狗微信去搜索 3.爬取搜狗搜索出来的微信公众号 4.判断页面是否有分页，有则循环爬取。爬取完一个页面，更新爬取页面数字段(column page_num),所有页面更新关键字表的状态字段(column status[0:表示未爬取,1:表示已爬取]) 5.对爬取出来的数据插入到微信公众号数据表(weixin_data)(建相关数据表) 6.更新关键字表的状态为已爬取状态 #### 数据表结构 ```sql CREATE TABLE `keys` ( `id` int(10) unsigned NOT NULL AUTO_INCREMENT, `keyword` varchar(255) DEFAULT NULL, `page_num` int(11) DEFAULT '0', `status` int(11) DEFAULT '0' COMMENT '0 未搜索 1 已搜索 99 丢弃', `type` varchar(255) DEFAULT NULL, `is_drop` int(11) NOT NULL DEFAULT '0', PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=119750 DEFAULT CHARSET=utf8; CREATE TABLE `weixin_data` ( `id` int(10) unsigned NOT NULL AUTO_INCREMENT, `key_id` int(255) DEFAULT NULL, `weixin_name` varchar(255) DEFAULT NULL, `weixin_account` varchar(255) DEFAULT NULL, `weixin_auth_info` varchar(255) DEFAULT NULL, `is_auth` int(11) DEFAULT NULL, `describe` varchar(6000) DEFAULT NULL, `img_url` varchar(255) DEFAULT NULL, `loc_info` varchar(255) DEFAULT NULL, `privince` varchar(255) DEFAULT NULL, `city` varchar(255) DEFAULT NULL, `district` varchar(255) DEFAULT NULL, `weixin_type` varchar(255) DEFAULT NULL, `other` varchar(255) DEFAULT NULL, PRIMARY KEY (`id`), UNIQUE KEY `weixin_account` (`weixin_account`) ) ENGINE=InnoDB AUTO_INCREMENT=139746 DEFAULT CHARSET=utf8; ``` #### 爬虫脚本 scrapy_sogou.py ```python #coding=utf-8 from selenium import webdriver import time from selenium.common.exceptions import NoSuchElementException,TimeoutException from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC import pymysql import random 指定页面 #参数指定了缓存文件的路径，方便爬取需要登录的网站 profile = webdriver.FirefoxProfile(r'C:\Users\Administrator.GZLX-20180416SV\AppData\Roaming\Mozilla\Firefox\Profiles\yn80ouvt.default') #如果不需要cookie，则不需要指定,使用下面的配置 #profile = webdriver.FirefoxProfile() #禁止加载样式表 profile.set_preference("permissions.default.stylesheet",2) #禁止加载图片 profile.set_preference("permissions.default.image",2) #禁止加载JAVASCRIPT profile.set_preference("javascript.enabled",False) #设置代理 profile.set_preference('network.proxy.type', 1) profile.set_preference('network.proxy.http', 'xx.xx.xx.xx') profile.set_preference('network.proxy.http_port', xxxx) profile.set_preference('network.proxy.ssl', 'xx.xx.xx.xx') profile.set_preference('network.proxy.ssl_port', xxxx) profile.update_preferences() #数据库配置 db_host="xx.xx.xx.xx" db_user="root" db_pass="xxxx" db_port=3306 db_name="weixin_data" #指定Firefox的驱动 driver = webdriver.Firefox(firefox_profile=profile,executable_path="geckodriver") #搜索的关键字 key_search_list=['学校'] index_url='https://weixin.sogou.com/weixin?query=' keys_search_string="" for index in range(0,len(key_search_list)): if index==len(key_search_list)-1: keys_search_string+="'"+key_search_list[index]+"'" else: keys_search_string+="'"+key_search_list[index]+"'," class AnyEc: """ Use with WebDriverWait to combine expected_conditions in an OR. """ def __init__(self, *args): self.ecs = args def __call__(self, driver): for fn in self.ecs: try: if fn(driver): return True except: pass def execute_query_sql(sql): #循环读取数据库状态是0的关键字100个 db= pymysql.connect(host=db_host,port=db_port,user=db_user, passwd=db_pass, db=db_name) # 使用 cursor() 方法创建一个游标对象 cursor cursor = db.cursor() #执行sql cursor.execute(sql) results=cursor.fetchall() # 关闭数据库连接 db.close() return results def execute_update_sql(sql): #循环读取数据库状态是0的关键字100个 db= pymysql.connect(host=db_host,port=db_port,user=db_user, passwd=db_pass, db=db_name) # 使用 cursor() 方法创建一个游标对象 cursor cursor = db.cursor() # 执行sql语句 cursor.execute(sql) # 提交到数据库执行 db.commit() #执行sql # 关闭数据库连接 db.close() #爬取网站内容的函数 def parseWeb(driver,key_name,page_N): print("开始提取关键字"+key_name+",第"+str(page_N)+"页的数据:") for page in driver.find_elements_by_xpath('//ul[@class="news-list2"]/li'): weixin_name=page.find_element_by_xpath('./div[@class="gzh-box2"]/div[@class="txt-box"]/p[@class="tit"]/a').text img_url=page.find_element_by_xpath('./div[@class="gzh-box2"]/div[@class="img-box"]/a/img').get_attribute("src") weixin_account=page.find_element_by_xpath('./div[@class="gzh-box2"]/div[@class="txt-box"]/p[@class="info"]/label').text weixin_auth_info="" try: page.find_element_by_xpath('./dl[2]/dt[contains(text(),微信认证)]') dl_info=page.find_element_by_xpath('./dl[2]/dt').text if '微信认证' in dl_info: weixin_auth_info=page.find_element_by_xpath('.//dl[2]/dd').text except NoSuchElementException: weixin_auth_info="" print("微信认证:"+weixin_auth_info) try: page.find_element_by_xpath('./div[@class="gzh-box2"]/div[@class="txt-box"]/p[@class="tit"]/i') is_auth=1 except NoSuchElementException: is_auth=0 try: describe=page.find_element_by_xpath('.//dl[1]/dd').text except NoSuchElementException: describe="" #把数据插入酷内 insert_sql='insert into weixin_data(key_id,weixin_name,weixin_account,weixin_auth_info,is_auth,img_url,`describe`,weixin_type,other) values('+str(key_id)+',"'+weixin_name+'","'+weixin_account+'","'+weixin_auth_info+'",'+str(is_auth)+',"'+img_url+'","'+describe+'","'+'培训机构'+'",'+'NULL'+'); ' #print(insert_sql) try: print("准备插入数据:"+weixin_name) execute_update_sql(insert_sql) except: print("插入数据异常,可能是重复数据") #更新当前页数 update_sql='update `keys` set page_num='+str(page_N)+' where keyword="'+key_name+'";' print(update_sql) try: execute_update_sql(update_sql) except: print("更新爬取页数错误") return False return True #判断页面是否加载完成 def pageIsLoadFinished(driver): try: WebDriverWait(driver, 10).until( AnyEc( EC.presence_of_element_located( (By.XPATH, u'//div[@class="gzh-box2"]/div[@class="img-box"]/a/img')), EC.presence_of_element_located( (By.XPATH, u'//p[@class="ip-time-p"]')), EC.presence_of_element_located( (By.XPATH, u'//div[@id="noresult_part1_container"]')) )) return True except TimeoutException: return False #页面是否正常 def pageIsNomal(driver): try: driver.find_element_by_xpath('//p[@class="ip-time-p"]') print("IP访问频繁，准备重启浏览器") time.sleep(3) return False except NoSuchElementException: return True #页面是否404 def pageIsNotFound(driver,key_name): try: driver.find_element_by_xpath('//div[@id="noresult_part1_container"]') print("关键字"+key_name+"没有找到，搜索下一个关键字") return True except NoSuchElementException: return False #跳到指定页 def jumpNumPage(driver,page_N): #判断是否是当前页 try: current_page=driver.find_element_by_xpath('//div[@id="pagebar_container"]/span').text if int(page_N) == int(current_page): print("已经在当前页，无需跳转") return True except: print("没有当前页"+str(page_N)) return False try: driver.find_element_by_xpath('//div[@id="pagebar_container"]/a[@id="sogou_page_'+str(page_N)+'"]').click() except NoSuchElementException: print("没有第"+str(page_N)+"页面") return False return True #跳到下一页 def jumpNextPage(driver): try: driver.find_element_by_xpath('//div[@id="pagebar_container"]/a[@id="sogou_next"]').click() except NoSuchElementException: print("没有下一页") return False return True #页面是否准备好 def PageIsReady(driver,key_name,page_N): #判断页面已经加载完成，并且不存在ip频繁访问页面 if pageIsLoadFinished(driver) and pageIsNomal(driver): #判断页面不存在指定的标签页 if not jumpNumPage(driver,page_N): #判断页面是否404 if pageIsNotFound(driver,key_name): #更新数据库关键字字段 update_status_sql='update `keys` set status=1 where keyword="'+key_name+'";' try: execute_update_sql(update_status_sql) except: print("更新关键字"+key_name+"的状态失败!!!") return True else: return False return True #循环抓取 while True: get_keys="SELECT id,keyword FROM keys where status=0 and is_drop=0 and type in ("+keys_search_string+") limit 100;" print(get_keys) print("获取关键字中...") try: results=execute_query_sql(get_keys) except: print("数据库查询关键字失败，停止爬虫") break print("关键字查找完成") #生成url id_keys=[ re for re in results ] for id_key in id_keys: key_id=id_key[0] key_name=id_key[1] url=index_url+key_name print("开始爬取:"+url) try: driver.get(url) except TimeoutException: continue #获取爬取key的页数 get_page_sql='select page_num from `keys` where id='+str(key_id)+';' page_N=execute_query_sql(get_page_sql)[0][0]+1 if PageIsReady(driver,key_name,page_N): if not parseWeb(driver,key_name,page_N): continue else: time.sleep(1) driver.close() driver = webdriver.Firefox(firefox_profile=profile,executable_path="geckodriver") continue #跳转到当前页爬取 #爬取完当前页更新key关键字 #判断是否有下一页继续爬取,如果只爬取一页，则注释下面的代码 ##########是否爬取搜索关键字的所有页面--start isOk=True while jumpNextPage(driver): get_page_sql='select page_num from `keys` where id='+str(key_id)+';' page_N=execute_query_sql(get_page_sql)[0][0]+1 if PageIsReady(driver,key_name,page_N): if not parseWeb(driver,key_name,page_N): isOk=False break else: isOk=False break if not isOk: time.sleep(1) driver.close() driver = webdriver.Firefox(firefox_profile=profile,executable_path="geckodriver") continue ##########是否爬取搜索关键字的所有页面--end #更新关键字状态 update_status_sql='update `keys` set status=1 where keyword="'+key_name+'";' try: execute_update_sql(update_status_sql) except: print("更新数据库关键字"+key_name+"的状态发生错误") ```

安装django-smtp-ssl ```bash $ pip install django-smtp-ssl ``` 修改settings.py的 ```py EMAIL_BACKEND = 'django_smtp_ssl.SSLEmailBackend' EMAIL_HOST = 'mail.example.com' EMAIL_PORT = 465 ```