<input type="hidden" name="lsd" value="AVpI6yeG" autocomplete="off" /><input type="hidden" name="lgnrnd" value="171524_eEMy" /><input type="hidden" id="lgnjs" name="lgnjs" value="n" />
使用正则表达式将上述各个 input 的 value 值提取出来即可;Cookie 值使用 cookiejar 存储,方便下一步使用。代码如下:
def login_first_step(self): sent_url = 'https://www.facebook.com' request = urllib2.Request(url=sent_url,headers=self.facebook_header) content=self.opener.open(request) html = content.read() lsd=lgndim=lgnjs=lgnrnd='' # 查找lsd reg = r'<input type="hidden" name="lsd" value="([A-Za-z0-9]*)" autocomplete="off" />' m = re.compile(reg) search = re.search(m,html) if search: lsd = search.group(1) # 查找lgndim lgndim可能为空 reg = r'<input type="hidden" autocomplete="off" name="lgndim" value="([A-Za-z0-9]*)"' m = re.compile(reg) search = re.search(m,html) if search: lgndim = search.group(1) # 查找lgnrnd reg = r'<input type="hidden" name="lgnrnd" value="([A-Za-z0-9]*_[A-Za-z0-9]*)" />' m = re.compile(reg) search = re.search(m,html) if search: lgnrnd = search.group(1) # 查找lgnjs reg = r'<input type="hidden" id="lgnjs" name="lgnjs" value="([A-Za-z0-9]*)" />' m = re.compile(reg) search = re.search(m,html) if search: lgnjs = search.group(1) #设置第二步中要post的值 self.login_post_values = 'lsd='+lsd+'&email='+self.email+'&pass='+self.password+ '&persistent=&default_persistent=1&timezone=&lgndim=&lgnrnd=' +lgnrnd+'&lgnjs='+lgnjs+'&locale=zh_CN&next=https%3A%2F%2Fwww.facebook.com%2F' print '-------------------------------------------' print 'lsd:',lsd print 'lgndim:',lgndim print 'lgnjs:',lgnjs print 'lgnrnd:',lgnrnd print self.cj for key in self.cj: print key.name,':',key.value print '-------------------------------------------'</span>
def login_second_step(self): sent_url = 'https://www.facebook.com/login.php?login_attempt=1&lwv=110' request = urllib2.Request(url=sent_url,headers=self.facebook_header,data=self.login_post_values) content=self.opener.open(request) print '-------------------------------------------' for key in self.cj: print key.name,':',key.value print '-------------------------------------------'</span>这一步的response是302重定向报文,python会自动向新目标https://www.facebook.com/发送一个新的请求,但是这个请求没有带上我们获取的cookie值。为了使得重定向报文得到新的cookie值,需要自己编写http_error_302()方法,参考了下面这篇博客http://www.hawkwithwind.net/blog/2013/08/13/python-urllib2-%E9%87%8D%E5%AE%9A%E5%90%91%E6%97%B6%E8%8E%B7%E5%8F%96cookie/
class RedirectHandler(urllib2.HTTPRedirectHandler): def http_error_302(self,req,fp,code,msg,headers): print '############ GOT 302 ###############' cookiemap = {} setcookie = str(headers["Set-Cookie"]) cookieTokens = ["Domain","Expires", "Path", "Max-Age",'path','domain'] tokens = setcookie.split(";") for cookie in tokens: cookie = cookie.strip() if cookie.startswith("Expires="): cookies = cookie.split(",", 2) if len(cookies) > 2: cookie = cookies[2] cookie = cookie.strip() else : cookies = cookie.split(",", 1) if len(cookies) > 1: cookie = cookies[1] cookie = cookie.strip() namevalue = cookie.split("=", 1) if len(namevalue) > 1: name = namevalue[0] value = namevalue[1] if name not in cookieTokens: cookiemap[name] = value print cookiemap str_cookie = '' for key in cookiemap: str_cookie = str_cookie + key + '=' + cookiemap[key] + '; ' str_cookie = str_cookie[:-2] print str_cookie req.add_header("Cookie", str_cookie)<span style="white-space:pre"> </span>#设置新的cookie值 return urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)</span>然后,需要在urllib2包中提交请求的opener中添加我们的302处理方法,代码如下:
# Build the opener from the debug HTTP/HTTPS handlers, the custom redirect
# handler (passed as a class) and a cookie processor bound to self.cj.
cookie_processor = urllib2.HTTPCookieProcessor(self.cj)
opener = urllib2.build_opener(
    httpHandler,
    httpsHandler,
    RedirectHandler,
    cookie_processor,
)

# With this, the 302 redirect problem is solved: when Python detects that a
# redirect page has been received, it runs the error-handling method we
# wrote.
def login_third_step(self): sent_url = 'https://www.facebook.com' request = urllib2.Request(url=sent_url,headers=self.facebook_header) content=self.opener.open(request) print '-------------------------------------------' print content.read() print '-------------------------------------------'</span>到这里,我们已经登录到了facebook,但是只是到了欢迎页面,并查看不了“朋友圈”,因为此时,cookie中少了一个值,“datr”,这个值在step 3的response的数据中,使用相同的办法找到并提交即可,所以需要对step 3进行修改:
def login_third_step(self): sent_url = 'https://www.facebook.com' request = urllib2.Request(url=sent_url,headers=self.facebook_header) content=self.opener.open(request) # print content.read() tmp_html = content.read() #查找datr reg = r'"_js_datr","([A-Za-z0-9]*)"' m = re.compile(reg) search = re.search(m,tmp_html) datr = '' if search: datr = search.group(1) print '-------------------------------------------' print 'datr: ',datr self.cj.set_cookie(cookielib.Cookie( version=0, name='datr', value=datr, port=None, port_specified=False, domain=".facebook.com", domain_specified=True, domain_initial_dot=False, path="/", path_specified=True, secure=False, expires=None, discard=False, comment=None, comment_url=None, rest=None ))然后把数据再次提交即可:
def login_fourth_step(self): sent_url = 'https://www.facebook.com' request = urllib2.Request(url=sent_url,headers=self.facebook_header) content=self.opener.open(request) print '-------------------------------------------' print ' getting html ' # print content.read() self.html = content.read() print '-------------------------------------------'
# -*- coding:gb2312 -*-__author__ = 'HYDT'import urllib2import reimport cookielibclass RedirectHandler(urllib2.HTTPRedirectHandler): def http_error_302(self,req,fp,code,msg,headers): print '############ GOT 302 ###############' cookiemap = {} setcookie = str(headers["Set-Cookie"]) cookieTokens = ["Domain","Expires", "Path", "Max-Age",'path','domain'] tokens = setcookie.split(";") for cookie in tokens: cookie = cookie.strip() if cookie.startswith("Expires="): cookies = cookie.split(",", 2) if len(cookies) > 2: cookie = cookies[2] cookie = cookie.strip() else : cookies = cookie.split(",", 1) if len(cookies) > 1: cookie = cookies[1] cookie = cookie.strip() namevalue = cookie.split("=", 1) if len(namevalue) > 1: name = namevalue[0] value = namevalue[1] if name not in cookieTokens: cookiemap[name] = value print cookiemap str_cookie = '' for key in cookiemap: str_cookie = str_cookie + key + '=' + cookiemap[key] + '; ' str_cookie = str_cookie[:-2] print str_cookie req.add_header("Cookie", str_cookie) return urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)class get_html(): email = '%2B<span style="font-family: Arial, Helvetica, sans-serif;">86185xxxxxxxx</span><span style="font-family: Arial, Helvetica, sans-serif;">' #用户名 +86185xxxxxxxx 注意加号改成url编码 %2B</span> password = '' #密码 cj = cookielib.CookieJar() login_post_values = '' html = '' facebook_header = { "Connection":"close", "Cache-Control":"max-age=0", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Origin":"https://www.facebook.com", "Upgrade-Insecure-Requests":" 1", "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/45.0.2454.101 Safari/537.36", "Content-Type":"application/x-www-form-urlencoded", "Referer":"https://www.facebook.com/", "Accept-Language":"zh-CN,zh;q=0.8" } opener = urllib2.build_opener() def get_opener(self): # self.cj = self.cj.clear() httpHandler = urllib2.HTTPHandler(debuglevel=1) 
httpsHandler = urllib2.HTTPSHandler(debuglevel=1) opener = urllib2.build_opener(httpHandler, httpsHandler,RedirectHandler, urllib2.HTTPCookieProcessor(self.cj)) return opener def login_first_step(self): sent_url = 'https://www.facebook.com' request = urllib2.Request(url=sent_url,headers=self.facebook_header) content=self.opener.open(request) html = content.read() lsd=lgndim=lgnjs=lgnrnd='' # 查找lsd reg = r'<input type="hidden" name="lsd" value="([A-Za-z0-9]*)" autocomplete="off" />' m = re.compile(reg) search = re.search(m,html) if search: lsd = search.group(1) # 查找lgndim lgndim可能为空 reg = r'<input type="hidden" autocomplete="off" name="lgndim" value="([A-Za-z0-9]*)"' m = re.compile(reg) search = re.search(m,html) if search: lgndim = search.group(1) # 查找lgnrnd reg = r'<input type="hidden" name="lgnrnd" value="([A-Za-z0-9]*_[A-Za-z0-9]*)" />' m = re.compile(reg) search = re.search(m,html) if search: lgnrnd = search.group(1) # 查找lgnjs reg = r'<input type="hidden" id="lgnjs" name="lgnjs" value="([A-Za-z0-9]*)" />' m = re.compile(reg) search = re.search(m,html) if search: lgnjs = search.group(1) #设置第二步中要post的值 self.login_post_values = 'lsd='+lsd+'&email='+self.email+'&pass='+self.password+ '&persistent=&default_persistent=1&timezone=&lgndim=&lgnrnd=' +lgnrnd+'&lgnjs='+lgnjs+'&locale=zh_CN&next=https%3A%2F%2Fwww.facebook.com%2F' print '-------------------------------------------' print 'lsd:',lsd print 'lgndim:',lgndim print 'lgnjs:',lgnjs print 'lgnrnd:',lgnrnd print self.cj for key in self.cj: print key.name,':',key.value print '-------------------------------------------' def login_second_step(self): sent_url = 'https://www.facebook.com/login.php?login_attempt=1&lwv=110' request = urllib2.Request(url=sent_url,headers=self.facebook_header,data=self.login_post_values) content=self.opener.open(request) print '-------------------------------------------' for key in self.cj: print key.name,':',key.value print '-------------------------------------------' def 
login_third_step(self): sent_url = 'https://www.facebook.com' request = urllib2.Request(url=sent_url,headers=self.facebook_header) content=self.opener.open(request) # print content.read() tmp_html = content.read() #查找datr reg = r'"_js_datr","([A-Za-z0-9]*)"' m = re.compile(reg) search = re.search(m,tmp_html) datr = '' if search: datr = search.group(1) print '-------------------------------------------' print 'datr: ',datr self.cj.set_cookie(cookielib.Cookie( version=0, name='datr', value=datr, port=None, port_specified=False, domain=".facebook.com", domain_specified=True, domain_initial_dot=False, path="/", path_specified=True, secure=False, expires=None, discard=False, comment=None, comment_url=None, rest=None )) def login_fourth_step(self): sent_url = 'https://www.facebook.com' request = urllib2.Request(url=sent_url,headers=self.facebook_header) content=self.opener.open(request) print '-------------------------------------------' print ' getting html ' # print content.read() self.html = content.read() print '-------------------------------------------' def get_proxy(self): proxy = {'http':'http://127.0.0.1:1080'} proxy_support = urllib2.ProxyHandler(proxy) opener = urllib2.build_opener(proxy_support) urllib2.install_opener(opener) print "#########Open Proxy!##########" def __init__(self): self.get_proxy() self.cj = cookielib.CookieJar() self.opener = self.get_opener() self.login_first_step() self.login_second_step() self.login_third_step() self.login_fourth_step()# get_html()
def get_cookies(self):
    """Log in to Facebook in a Firefox window and return its cookies.

    Types ``self.email`` and ``self.password`` into the login form,
    submits it, follows the logo link and then collects the browser's
    cookies.  The browser window is closed even when a step fails.

    Returns:
        The value of ``browser.get_cookies()``.
    """
    print("==============================")
    print(" Geting Cookies! ")
    print("==============================")
    # Choose the browser.
    browser = webdriver.Firefox()
    try:
        # Open the Facebook page.
        browser.get('https://www.facebook.com/')

        # Fill in account name and password.
        email_box = browser.find_element_by_id('email')
        email_box.clear()
        email_box.send_keys(self.email)
        password_box = browser.find_element_by_id('pass')
        password_box.clear()
        password_box.send_keys(self.password)

        # Simulate pressing the login button; the page comes in two
        # variants, so fall back to the second locator when the first
        # one fails.
        try:
            browser.find_element_by_xpath(
                '//button[@id="loginbutton"]').send_keys(Keys.ENTER)
        except Exception:
            browser.find_element_by_xpath(
                '//input[@tabindex="4"]').send_keys(Keys.ENTER)

        # NOTE: the original had implicit/explicit waits here, disabled.
        browser.find_element_by_xpath(
            '//a[@href="https://www.facebook.com/?ref=logo"]'
        ).send_keys(Keys.ENTER)

        # Collect the cookies.
        cookies = browser.get_cookies()
    finally:
        # Close the browser, also on failure, so no window is leaked.
        browser.close()
    return cookies
联系客服