Crawler: Chatbot Corpus Collection
I was recently assigned to work on a chatbot (admittedly a very manual, brute-force approach), handling only a small part of it and still at the beginner stage. As part of that, I had to write crawlers that collect the replies of XiaoAi (小爱) and XiaoIce (小冰) to a given set of question prompts; all questions are single-turn dialogues.
So I'm recording the Python crawler code for XiaoAi and XiaoIce here, in case it comes in handy later 🙃.
1. XiaoAi bot - reply crawler
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Author: Eajack
Date: 2019/10/6
Function:
    XiaoAi bot - reply crawler for a question corpus
Attention:
    In practice the session normally drops about once every 1000-2000 inputs;
    when that happens, swap the Cookie / IP, etc. Each sentence is asked 3 times by default.
'''
import requests
import re, time, json
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'XISESSIONID=1baeeit97b4ovtjaydvpmlj41',  # replace with the cookie from your own browser (F12 dev tools)
    'Host': 'nlp.xiaoi.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
def get_reply(input):
    url = "http://nlp.xiaoi.com/robot/webrobot?&callback=__webrobot_processMsg&data=%7B%22sessionId%22%3A%22a11f40dc641f4ccbb1d5d9a22a137083%22%2C%22robotId%22%3A%22webbot%22%2C%22userId%22%3A%227e90f929650684844babfe4bb629e7f1c%22%2C%22body%22%3A%7B%22content%22%3A%22{}%22%7D%2C%22type%22%3A%22txt%22%7D".format(input)
    r = requests.get(url, headers=headers)
    # The response is JSONP: __webrobot_processMsg({...}); strip the wrapper to get the JSON payload
    reply_all = r.text.split('__webrobot_processMsg')[-1]
    print(reply_all)
    reply_all = reply_all[1:-2]  # drop the leading "(" and the trailing ");"
    print(url)
    reply_all = json.loads(reply_all)
    try:
        reply = reply_all['body']['content'].strip()
    except KeyError:
        reply = reply_all['body']['data'].strip()
    return reply_all, reply
def main():
    with open('小爱-输入.txt', 'r', encoding='utf-8') as readFile:
        file_all = open('小爱-输出(所有信息).txt', 'w', encoding='utf-8')
        file_reply = open('小爱-输出.txt', 'w', encoding='utf-8')
        for line in readFile:
            input = line.strip()
            # Optional: crude escaping, because "&" and "%" in the raw input break the hand-built URL.
            # Escape "%" before "&", otherwise the freshly inserted "%26" would get mangled as well.
            input = input.replace(r'%', r'%25')
            input = input.replace(r'&', r'%26')
            for i in range(3):  # ask each sentence 3 times by default
                print('============================================')
                reply_all, reply = get_reply(input)
                reply_all_str = json.dumps(reply_all, ensure_ascii=False,
                                           sort_keys=True, indent=4, separators=(',', ':')).strip()
                print(reply_all_str)
                print(reply)
                print('============================================')
                reply = re.sub(r'\s+', ' ', reply)
                file_all.write('{}\n\n'.format(reply_all_str))
                file_reply.write('{}&&{}\n'.format(input, reply))
                time.sleep(2)
            time.sleep(1)
        file_all.close()
        file_reply.close()

if __name__ == '__main__':
    main()
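As an aside, the hand-rolled escaping of "&" and "%" can be avoided entirely by building the JSON payload as a dict and letting requests do the URL encoding. Below is a minimal sketch, assuming the nlp.xiaoi.com endpoint accepts the same JSON structure that the hand-built URL above encodes; ask_xiaoi is a helper name introduced here for illustration, and session_id / user_id are the values from your own session.

import json
import requests

def ask_xiaoi(question, session_id, user_id, headers):
    """Query the XiaoAi webrobot endpoint, letting requests handle the URL encoding.

    Sketch only: assumes the endpoint accepts the same JSON payload as the
    hand-built URL in the script above.
    """
    payload = {
        "sessionId": session_id,
        "robotId": "webbot",
        "userId": user_id,
        "body": {"content": question},
        "type": "txt",
    }
    params = {
        "callback": "__webrobot_processMsg",
        # requests percent-encodes this value, so "&" and "%" in the question need no special handling
        "data": json.dumps(payload, ensure_ascii=False, separators=(',', ':')),
    }
    r = requests.get("http://nlp.xiaoi.com/robot/webrobot", params=params, headers=headers)
    # Strip the JSONP wrapper __webrobot_processMsg(...) and parse the JSON body
    jsonp = r.text
    json_text = jsonp[jsonp.index('(') + 1 : jsonp.rindex(')')]
    return json.loads(json_text)

With this, get_reply could simply call ask_xiaoi and read body.content / body.data from the returned dict.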
2. XiaoIce bot - reply crawler
XiaoIce is a bit more troublesome: I couldn't find a public URL-style API. Here I use the Weibo XiaoIce interface instead, which requires logging into Weibo; the login is simulated with Selenium, and under normal conditions the crawler does not get interrupted. Two manual steps are needed along the way: (1) at the Weibo login page, the code does time.sleep(20) so you can solve the captcha by hand; (2) after the direct-message page opens, there is another sleep so you can click the XiaoIce chat window. After that, collection is fully automatic.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Author: Eajack
Date: 2019/10/6
Function:
    XiaoIce bot - reply crawler for a question corpus
Attention:
    Uses the Weibo XiaoIce interface, which requires logging into Weibo; the login is simulated
    with Selenium, and under normal conditions the crawler does not get interrupted. Two manual
    steps are needed: (1) at the Weibo login page, time.sleep(20) gives you time to solve the
    captcha by hand; (2) after the direct-message page opens, another sleep gives you time to
    click the XiaoIce chat window. After that, collection is fully automatic.
    Each sentence is asked 3 times by default.
'''
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import json,os,re,time
weiboNum = '*******'        # your own Weibo account
WeiboPassword = '*******'   # your own Weibo password
def main():
    # 1- Weibo login: open the login page first
    weibo_loginUrl = r'https://passport.weibo.cn/signin/login'
    driver = webdriver.Chrome()
    time.sleep(5)
    driver.maximize_window()    # maximize the window
    driver.implicitly_wait(10)  # implicit wait
    driver.get(weibo_loginUrl)
    ## simulate the login
    driver.find_element_by_id('loginName').clear()
    driver.find_element_by_id('loginName').send_keys(weiboNum)
    driver.find_element_by_id('loginPassword').clear()
    driver.find_element_by_id('loginPassword').send_keys(WeiboPassword)
    time.sleep(5)
    driver.find_element_by_id('loginAction').click()
    time.sleep(20)  # manual step: solve the captcha by hand

    # 2- open the XiaoIce direct-message page
    weibo_XiaoBingUrl = r'https://api.weibo.com/chat/#/chat?source_from=5'
    # open it in a new tab (a new tab, not a new window)
    js = "window.open('{}')".format(weibo_XiaoBingUrl)
    driver.execute_script(js)
    time.sleep(5)
    # window switching: get the handles of all open windows
    windows = driver.window_handles
    # switch to the most recently opened window
    driver.switch_to.window(windows[-1])
    # sleep: manually click the XiaoIce chat window during this pause
    time.sleep(20)

    with open('小冰-输入.txt', 'r', encoding='utf-8') as readFile:
        ask2Reply = []
        flag = False
        for line in readFile:
            input = line.strip()
            for i in range(3):  # ask each sentence 3 times by default
                print('============================================')
                try:
                    # type the question and send it
                    driver.find_element_by_id('webchat-textarea').clear()
                    driver.find_element_by_id('webchat-textarea').send_keys(input)
                    driver.find_element_by_id("webchat-textarea").send_keys(Keys.ENTER)
                    time.sleep(5)
                    # parse the chat page and collect all message bubbles
                    html = driver.page_source
                    html_bs = BeautifulSoup(html, 'lxml')
                    content_now_list = html_bs.find_all("p", class_="puretext font14 c333 wordbreak")
                    content_now_list = [item.get_text() for item in content_now_list]
                    # find the last occurrence of the question; everything after it is the reply
                    input_index = 0
                    for j in range(len(content_now_list) - 1, -1, -1):
                        if content_now_list[j] == input:
                            input_index = j
                            break
                    reply_str = ' '.join(content_now_list[input_index + 1:])
                    ask2Reply.append([input, reply_str])
                except Exception as e:
                    flag = True
                    break
            if flag:
                break

    with open('小冰-输出.txt', 'a', encoding='utf-8') as writeFile:
        for item in ask2Reply:
            writeFile.write('{}&&{}\n'.format(item[0], item[1]))

if __name__ == '__main__':
    main()
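The fixed time.sleep(5) after each send is the fragile part: if XiaoIce replies slowly the reply is missed, and if it replies quickly time is wasted. One possible refinement (not part of the original script) is to use Selenium's explicit waits instead. A minimal sketch, assuming the same 'webchat-textarea' id and message-bubble class as above; wait_for_reply and BUBBLE_CSS are names introduced here for illustration.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

BUBBLE_CSS = "p.puretext.font14.c333.wordbreak"  # same message-bubble selector as in the script

def wait_for_reply(driver, previous_count, timeout=15):
    """Wait until at least one new message bubble appears, instead of sleeping a fixed 5 seconds.

    previous_count is the number of bubbles counted right after sending the question;
    the (assumed) invariant is that any additional bubble is XiaoIce's reply.
    """
    wait = WebDriverWait(driver, timeout)
    wait.until(EC.presence_of_element_located((By.ID, 'webchat-textarea')))  # chat widget is loaded
    wait.until(lambda d: len(d.find_elements(By.CSS_SELECTOR, BUBBLE_CSS)) > previous_count)
    return [el.text for el in driver.find_elements(By.CSS_SELECTOR, BUBBLE_CSS)]

Inside the inner loop you would record previous_count = len(driver.find_elements(By.CSS_SELECTOR, BUBBLE_CSS)) before pressing ENTER, then call wait_for_reply(driver, previous_count) in place of time.sleep(5).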
3. Output corpus format
Both scripts above write the corpus in the same format, one "question&&reply" pair per line, as shown below (a small sketch for reading these files back into pairs follows the samples):
……
讲个笑话&&老板:“你物流专业毕业的?”我:“是的。”老板:“很好,你马上去帮我把这个快递寄了。”
讲个笑话&&我不!该你讲了!
讲个笑话&&妈妈上班请假回家,要带三岁的女儿去逛街。出门前妈妈对女儿说:快向保姆阿姨说Bye-Bye。女儿照说Bye-Bye,当妈妈又说:向阿姨亲一个。女儿带着恐惧的眼神,极力的摇头不肯亲阿姨。妈妈带着生气的语说:为什么不亲?女儿仍带着恐惧的语气大声说:爸爸早上偷亲阿姨后,结果被打的好惨!
么么哒&&么么~么么
么么哒&&么么么么么么么么么么么么么么么么么哒
么么哒&&么么哒~要不要来个唇膏
……
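Since the question and the reply are separated by a literal "&&", loading the corpus back into (question, reply) pairs is straightforward. A minimal sketch; load_corpus is a helper name introduced here for illustration, and the file name is just the output file of script 1.

def load_corpus(path):
    """Read 'question&&reply' lines back into (question, reply) pairs."""
    pairs = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or '&&' not in line:
                continue
            question, reply = line.split('&&', 1)  # split only on the first '&&'
            pairs.append((question, reply))
    return pairs

# e.g. pairs = load_corpus('小爱-输出.txt')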