Python3 爬虫的基本语法及异常处理（万能爬虫）

标题：Python3 爬虫的基本语法及异常处理（万能爬虫）

-------------------------------------------------------------------------------------------------------------------------------

时间：2018/5/16 16:04:13

-------------------------------------------------------------------------------------------------------------------------------

内容：

def urltohtml(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address to request; it is passed unchanged to ``urlopen``.

    Returns:
        The decoded body, or ``""`` when the final URL reported by the
        response differs from ``url`` (a redirect happened) or when
        ``urllib`` raises ``URLError`` / ``HTTPError``. A short diagnostic
        is printed in each of those cases.
    """
    # Local import: the function needs urllib's ``request`` and ``error``
    # modules under exactly these names and must not depend on the caller
    # having imported them.
    from urllib import error, request

    # Without the try block a 404 page or an unresolvable domain would raise
    # and stop the whole program.
    try:
        # ``with`` closes the response on every path, including early return.
        with request.urlopen(url) as response:
            if url != response.geturl():  # redirected: report and return empty
                print('URL发生跳转返回空')
                return ""
            return response.read().decode('utf-8')
    except error.URLError as e:
        if hasattr(e, 'code'):  # HTTPError: the server answered with an error status
            print("HTTPError")
            print(e.code)
            print("URL无法打开页面被删除")
        elif hasattr(e, 'reason'):  # plain URLError: e.g. the host could not be reached
            print("URLError")
            print(e.reason)
            print("URL无法打开域名无法解析")
        return ""

'''

# Sample URLs, each named after the failure it is meant to exercise.
# NOTE(review): the actual server responses are not visible here — verify.
urldomainerror = 'https://www.doub11an.com/group/topic/117023251/'  # misspelled host name
url301 = 'https://www.douban.com/group/topic/117023251/'  # presumably answers with a redirect
url404 = 'https://www.douban.com/gro11up/topic/116903069/'  # presumably a missing page

# Only the domain-error sample is actually fetched.
print(urltohtml(urldomainerror))

def urltohtml(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address to request; it is passed unchanged to ``urlopen``.

    Returns:
        The decoded body, or ``""`` when the final URL reported by the
        response differs from ``url`` (a redirect happened) or when
        ``urllib`` raises ``URLError`` / ``HTTPError``. A short diagnostic
        is printed in each of those cases.
    """
    # Local import: the function needs urllib's ``request`` and ``error``
    # modules under exactly these names and must not depend on the caller
    # having imported them.
    from urllib import error, request

    # Without the try block a 404 page or an unresolvable domain would raise
    # and stop the whole program.
    try:
        # ``with`` closes the response on every path, including early return.
        with request.urlopen(url) as response:
            if url != response.geturl():  # redirected: report and return empty
                print('URL发生跳转返回空')
                return ""
            return response.read().decode('utf-8')
    except error.URLError as e:
        if hasattr(e, 'code'):  # HTTPError: the server answered with an error status
            print("HTTPError")
            print(e.code)
            print("URL无法打开页面被删除")
        elif hasattr(e, 'reason'):  # plain URLError: e.g. the host could not be reached
            print("URLError")
            print(e.reason)
            print("URL无法打开域名无法解析")
        return ""

'''

# Sample URLs, each named after the failure it is meant to exercise.
# NOTE(review): the actual server responses are not visible here — verify.
urldomainerror = 'https://www.doub11an.com/group/topic/117023251/'  # misspelled host name
url301 = 'https://www.douban.com/group/topic/117023251/'  # presumably answers with a redirect
url404 = 'https://www.douban.com/gro11up/topic/116903069/'  # presumably a missing page

# Only the domain-error sample is actually fetched.
print(urltohtml(urldomainerror))