# -*- coding: utf-8 -*-
import urllib
import urllib2
import random
import re
import bs4
import chardet
def getRequest(url,header):
request = urllib2.Request(url,headers=header)
response = urllib2.urlopen(request)
return response.read()
HEADER={
'Host': 'px.h2o-china.com',
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.8.0.12)',
'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
'Accept-Language': 'en-us,en;q=0.5',
'Accept-Encoding': 'gzip,deflate',
'Accept-Charset': 'gbk;q=0.7,*;q=0.7',
'Referer': 'http://px.h2o-china.com/2014/view?key=YzBkZFgxdHRncEhMN0lKNlVDUmlkamloNU8xaGlJb204bnJXbXJkWWJhOVFzdGtQRnFWVnpsZDBKb3RoeWFMVzhoUmgxUlF1MElJZExualBaWEZPYUpz&from=singlemessage&isappinstalled=0',
'Cookie': 'PHPSESSID=mofsmmnds17rqueqcjih30k971'
}
html = getRequest('http://px.h2o-china.com/2014/view?key=MjgxMW1pYmhPR1oxZVFKSytEUjBnSHVkY3FEUnJiU0tWWXBWMWpUYWlxYk4zRFJycGZxVDhQKzd1a1ZBVVNaZHdMQ3NRbXR2T2Zxd1VNTVVYamdXcTcw',HEADER)
encodeofhtml= chardet.detect(html)
content = bs4.BeautifulSoup(html)
print encodeofhtml
print content
网上搜到的各种方法都试了,还是不行
请求了压缩的内容, 但是没有解压. 可以去解压, 当然也可以删掉下面这一行请求压缩:
'Accept-Encoding': 'gzip,deflate',