上一篇文章《用python爬取QQ空间说说》介绍了如何用python把QQ空间的说说爬取下来。
今天,我要用python把QQ空间所有的说说都删除掉。动机就不说了。
整体的步骤跟爬取说说的步骤差不多。
算了,不想写,直接上传代码吧。遇到了一个问题:删除一百多条说说后,腾讯就会要求输入验证码。验证码识别太麻烦了,就不弄了,脚本遇到验证码时会记录日志并直接退出。
源文件:clear_qzone.py
#coding=utf-8
#导入selenium2中的webdriver库
from selenium.webdriver.support.select import Select
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
from PIL import Image
from PIL import ImageOps
import time
import re
import lxml
import sys
import subprocess
import urllib
# Run log, opened in append mode at import time. The functions below write
# timestamped lines to it; it is closed in quit() and on the early exit in the
# main block.
fp = open('log.txt', 'a')
def get_time():
    """Return the current local time formatted as a log-line prefix.

    The result looks like '[2016-01-31 12:34:56] ' (note the trailing space).
    """
    stamp_format = '[%Y-%m-%d %H:%M:%S] '
    now = time.localtime(time.time())
    return time.strftime(stamp_format, now)
# QQ number of the Qzone to open (interpolated into the page URL by login())
qq = '123456789'
# QQ number used to sign in
myqq = '123456789'
# Password of the sign-in account
# NOTE(review): stored in plain text; the values here look like placeholders --
# avoid committing real credentials.
passwd = '123456'
def quit(driver):
    """Reset the run flag, close the log, shut the browser down and exit.

    Writes 'False' to flag.txt, closes the module-level log handle ``fp``,
    calls ``driver.quit()`` and finally terminates the process through
    ``sys.exit()``; this function never returns to its caller.

    NOTE: the name shadows the interactive ``quit`` builtin; it is kept
    because the rest of the script calls it under this name.
    """
    # Mark the script as no longer running.
    with open('flag.txt', 'w') as flag_file:
        flag_file.write('False')

    # Release the log file before the browser goes away.
    fp.close()

    driver.quit()
    sys.exit()
def login():
    """Open the target Qzone page in Chrome, sign in and return the driver.

    Uses the module-level ``qq`` (page to open) and ``myqq`` / ``passwd``
    (credentials). On success the returned driver has been switched into the
    'app_canvas_frame' frame. If any step of the sign-in sequence fails, the
    failure is written to the log and the process exits through ``quit()``.

    Returns:
        The Selenium Chrome driver.
    """
    # Start a Chrome instance (not Firefox) with the "test-type" switch.
    option = webdriver.ChromeOptions()
    option.add_argument("test-type")
    driver = webdriver.Chrome(chrome_options=option)

    # Fix the window position and size.
    driver.set_window_position(20, 40)
    driver.set_window_size(1100, 700)

    # Open the Qzone page; the sign-in form is shown inside a frame.
    driver.get('http://user.qzone.qq.com/%s/311' % qq)
    time.sleep(1)
    try:
        driver.switch_to_frame('login_frame')
        # Switch to the account/password form, fill it in and submit it.
        driver.find_element_by_id('switcher_plogin').click()
        account_box = driver.find_element_by_id('u')
        account_box.clear()
        account_box.send_keys(myqq)
        password_box = driver.find_element_by_id('p')
        password_box.clear()
        password_box.send_keys(passwd)
        driver.find_element_by_id('login_button').click()
        time.sleep(3)
        # The rest of the script works inside this frame.
        driver.switch_to_frame('app_canvas_frame')
        time.sleep(1)
    except Exception:
        # Exception rather than a bare except, so that Ctrl-C and SystemExit
        # are not misreported as a failed login.
        fp.write(get_time() + '出师不利,登录失败\n')
        quit(driver)
    return driver
def cleanImage(imagePath):
    """Threshold the image at ``imagePath`` and pad it, overwriting the file.

    Every pixel value below 143 is mapped to 0 and everything else to 255;
    the result is then surrounded by a 20-pixel white border and saved back
    to the same path.
    """
    def to_black_or_white(level):
        # Push each value to one of the two extremes.
        if level < 143:
            return 0
        return 255

    source = Image.open(imagePath)
    thresholded = source.point(to_black_or_white)
    padded = ImageOps.expand(thresholded, border=20, fill='white')
    padded.save(imagePath)
def getAuthCode(driver, url):
    """Download a captcha image and try to read it with tesseract.

    The image at ``url`` is saved as captcha.png, cleaned with
    ``cleanImage()`` and handed to the ``tesseract`` command line tool, which
    writes its result to captcha.txt.

    Args:
        driver: Unused; kept so existing callers keep working.
        url: Address of the captcha image.

    Returns:
        The recognised text when it is exactly four characters long,
        otherwise ``False`` (also when tesseract fails or no result file can
        be read).
    """
    img_name = 'captcha.png'
    time.sleep(0.5)
    urllib.urlretrieve(url, img_name)
    time.sleep(0.5)
    cleanImage(img_name)

    # "-psm" and its value must be separate argv entries; a single "-psm 7"
    # token is not seen as the flag plus its value.
    # NOTE(review): newer tesseract releases spell this option "--psm" --
    # confirm against the installed version.
    p = subprocess.Popen(["tesseract", img_name, "captcha", "-psm", "7"],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes; wait() with PIPE can block forever once
    # the child fills a pipe buffer.
    p.communicate()
    if p.returncode != 0:
        # Do not fall through to a captcha.txt left over from an earlier run.
        return False

    try:
        with open("captcha.txt", "r") as f:
            captchaResponse = f.read().replace(" ", "").replace("\n", "")
    except IOError:
        return False

    # Progress output without a trailing newline, as before.
    sys.stdout.write('验证码:  %s 长度: %d ' % (captchaResponse, len(captchaResponse)))
    if len(captchaResponse) == 4:
        return captchaResponse
    return False
def get_page(driver):
    """Return the HTML source of the page currently loaded in ``driver``."""
    html = driver.page_source
    return html
def delete_shuoshuo(bd, driver):
    """Delete one post through the browser and log the outcome.

    Args:
        bd: Parsed footer element of a feed item, or ``None``. It is only
            used to read the ``title`` of the post's 'goDetail' link, which
            labels the console and log output.
        driver: Selenium driver used for the deletion. The post deleted is
            the one owning the first 'more-edit-items' dropdown found in the
            live page.

    Returns:
        ``None``. Nothing is done when ``bd`` is ``None`` or carries no
        'goDetail' link. If the delete button or the confirmation dialog
        cannot be reached, or if a captcha dialog appears, the event is
        logged and the process exits through ``quit()``; captcha solving is
        not attempted.
    """
    if bd is None:
        return
    anchor = bd.find('a', class_='c_tx c_tx3 goDetail')
    if anchor is None:
        # Same treatment as a missing footer: skip instead of crashing.
        return
    item = anchor.get('title') or ''
    # Label first, no newline; the outcome is printed on the same line later.
    sys.stdout.write(item + ' ')

    # On Python 2 the parsed title is unicode, and joining it with the
    # non-ASCII byte-string literals below would raise UnicodeDecodeError.
    # Encode it for the log; on Python 3 the value is already str.
    log_item = item
    if not isinstance(log_item, str):
        log_item = log_item.encode('utf-8')

    try:
        # Force the dropdown open so that its "delete" entry can be clicked.
        more = driver.find_element_by_css_selector("[class='dropdown more-edit-items']")
        driver.execute_script("""
var element = arguments[0];
element.setAttribute('class', 'dropdown more-edit-items dropdown-open');
""", more)
        time.sleep(1)
        # find_element_* raises when nothing matches; it never returns None.
        driver.find_element_by_link_text('删除').click()
    except Exception:
        fp.write(get_time() + '无法获取删除按钮\n')
        quit(driver)

    time.sleep(1)
    # The confirmation dialog is looked up from the top-level document.
    driver.switch_to_default_content()
    try:
        driver.find_element_by_link_text('是').click()
    except Exception:
        fp.write(get_time() + '无法获取删除对话框\n')
        quit(driver)

    # A captcha dialog shows up as the 'verify_dialog_frame' frame; failing to
    # switch into it is taken to mean the deletion went through.
    time.sleep(1)
    try:
        driver.switch_to_frame('verify_dialog_frame')
    except Exception:
        print('不需要验证码 删除成功')
        fp.write(get_time() + log_item + '删除成功\n')
        driver.switch_to_frame('app_canvas_frame')
        return

    # Captcha requested: log it and stop the whole run.
    sys.stdout.write('需要验证码')
    fp.write(get_time() + '需要验证码,暂时退出\n')
    quit(driver)
if __name__ == '__main__':
    # flag.txt holds 'True' while a run is in progress; refuse to start a
    # second one. A missing file (first run) means nothing is running.
    try:
        with open('flag.txt', 'r') as f:
            already_running = f.readline() == 'True'
    except IOError:
        already_running = False
    if already_running:
        fp.write(get_time() + '另一个程序正在运行\n')
        fp.close()
        sys.exit()
    with open('flag.txt', 'w') as f:
        f.write('True')

    driver = login()
    try:
        # Scroll the pager into view and jump to the last page; the loop
        # below then walks back towards page 1.
        target = driver.find_element_by_id("pager_next_0")
        driver.execute_script("arguments[0].scrollIntoView();", target)
        driver.find_element_by_id('pager_last_0').click()
        time.sleep(2)
        pages = get_page(driver)
        soup = BeautifulSoup(pages, 'lxml')
        # The value is not used afterwards, but the lookup fails fast (and
        # ends the run) when the pager is missing from the page.
        total_page = soup.find('span', class_='current').get_text()
        ol = soup.find('ol', id='msgList')
    except Exception:
        fp.write(get_time() + '页面加载失败,退出\n')
        quit(driver)

    page = 1
    while True:
        msgList = ol.find_all('li', class_='feed')
        print('page: ' + str(page) + ' msgList: ' + str(len(msgList)))
        for post in msgList:
            # One delete attempt per post listed on the parsed page.
            delete_shuoshuo(post.find('div', class_='ft'), driver)
            time.sleep(1)

        # Read the current page number; page 1 is the last one to process.
        try:
            cur_page = int(soup.find('span', class_='current').find('span').get_text())
        except Exception:
            fp.write(get_time() + '获取当前页码失败\n')
            quit(driver)
        if cur_page == 1:
            # Leave the loop outside any try block, so that finishing is not
            # caught and logged as a page-number failure.
            fp.write(get_time() + '删除完成\n')
            break

        # Otherwise go to the previous page and parse it again.
        try:
            driver.find_element_by_id('pager_previous_%d' % (page)).click()
            time.sleep(2)
            pages = get_page(driver)
            soup = BeautifulSoup(pages, 'lxml')
            ol = soup.find('ol', id='msgList')
        except Exception:
            fp.write(get_time() + '翻页失败%d\n' % page)
            quit(driver)
        page += 1
        time.sleep(1)

    # Normal completion: reset flag.txt, close the log and the browser, exit.
    quit(driver)