北京、天津工厂自动监测数据爬取

发布时间:2021-08-02 23:25 来源:https://blog.51cto.com/u_93993 阅读:190 作者:自然如来 栏目: web开发

&&&&&

# -*- coding: utf-8 -*-
"""Scrape factory automatic-monitoring data into an .xls workbook.

For every date in a range the script asks the site how many result pages
exist, downloads each page, extracts the table rows and finally writes all
rows to ``<company>_<start>_<end>.xls``.

The code runs on both Python 2 and Python 3.
"""
import datetime
import re
import sys
import time

# The third-party packages (requests, bs4 with the lxml parser, xlwt) are
# imported inside the functions that use them, so the pure helpers in this
# module can be imported without the scraping stack being installed.

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str/unicode conversions use UTF-8, as the
    # original script did.  Python 3 has neither the need nor the API.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

_DATE_FORMAT = '%Y-%m-%d'

# Header row of the output sheet, in column order.
_COLUMN_TITLES = [
    u'序号',      # sequence number
    u'监测点位',  # monitoring point
    u'监测时间',  # monitoring time
    u'监测项目',  # monitored item
    u'监测结果',  # result
    u'标准限值',  # standard limit
    u'单位',      # unit
    u'是否达标',  # within the standard?
    u'超标倍数',  # exceedance multiple
    u'评价标准',  # evaluation standard
    u'排放去向',  # discharge destination
    u'排放方式',  # discharge mode
    u'备注',      # remarks
]
_COLUMN_COUNT = len(_COLUMN_TITLES)
# Columns that may be empty in a valid row (exceedance multiple, remarks).
_OPTIONAL_COLUMNS = frozenset([8, 12])
# The last column may be missing from the HTML altogether; all others must
# be present as <td> cells.
_REQUIRED_CELLS = _COLUMN_COUNT - 1


def _post_for_soup(url, form, retry_delay):
    """POST *form* to *url* and return the response parsed with lxml.

    A failed request is retried exactly once after *retry_delay* seconds;
    if the retry fails too, its ``requests.RequestException`` propagates.
    """
    import requests
    from bs4 import BeautifulSoup

    try:
        response = requests.post(url, data=form)
    except requests.RequestException:
        time.sleep(retry_delay)
        response = requests.post(url, data=form)
    return BeautifulSoup(response.text, 'lxml')


def _row_values(cell_texts):
    """Turn the texts of one table row's cells into a 13-value record.

    Returns ``None`` for rows that are not data rows: fewer than 12 cells
    (e.g. header rows without ``<td>``) or a blank mandatory cell.  Optional
    columns become ``''`` when blank or missing.
    """
    if len(cell_texts) < _REQUIRED_CELLS:
        return None
    values = []
    for index in range(_COLUMN_COUNT):
        # NOTE(review): only the first whitespace-separated token of a cell
        # is kept (original behaviour); a value such as "date time" loses
        # its second part -- confirm this is intended.
        tokens = cell_texts[index].split() if index < len(cell_texts) else []
        if tokens:
            values.append(tokens[0])
        elif index in _OPTIONAL_COLUMNS:
            values.append('')
        else:
            return None
    return values


def _total_pages(pager_text):
    """Return the page count (string of digits) found after the '/'.

    Assumes the pager text looks like ``"<current>/<total>..."`` -- TODO
    confirm against the live page.  All leading digits are read, so counts
    of 10 and above are no longer truncated to their first digit.

    Raises:
        IndexError: *pager_text* contains no '/'.
        ValueError: no digits follow the '/'.
    """
    match = re.match(r'\s*(\d+)', pager_text.split('/')[1])
    if match is None:
        raise ValueError('cannot read page count from %r' % (pager_text,))
    return match.group(1)


def shuju(url, date, page):
    """Fetch one result page and return its data rows.

    Args:
        url: address of the company's monitoring-data page.
        date: day to query, sent as the ``startTime`` form field.
        page: page index, sent as the ``pageIndex`` form field.

    Returns:
        A list of 13-element lists of strings, one per table row that has
        all mandatory cells; other ``<tr>`` rows are skipped.

    Raises:
        requests.RequestException: the request failed twice.
    """
    soup = _post_for_soup(url, {"startTime": date, "pageIndex": page}, 2)
    datalist = []
    for row in soup.find_all('tr'):
        values = _row_values([cell.text for cell in row.find_all('td')])
        if values is not None:
            datalist.append(values)
    return datalist


def pageNumber(url, date):
    """Return ``(page_count, company_name)`` for *date*.

    ``page_count`` is a string of digits (callers convert it with ``int``),
    which lets the caller loop over all pages.

    Raises:
        requests.RequestException: the request failed twice.
        AttributeError: the pager ``<span>`` or the company-name ``<div>``
            is not present in the returned page.
    """
    soup = _post_for_soup(url, {"startTime": date, "pageIndex": ""}, 1)
    pager_text = soup.find('span', class_="clr_b ver_mid").string
    number = _total_pages(pager_text)
    compname = soup.find('div', class_="com_tit_new f_22 clr_3").string
    return number, compname


def Date_list_generation(start, end):
    """Return the days after *start* up to and including *end*.

    Both arguments are ``YYYY-MM-DD`` values (anything whose ``str()`` has
    that form); the result is a list of ``YYYY-MM-DD`` strings.

    NOTE(review): *start* itself is not part of the result while *end* is.
    This is the original behaviour and is kept as-is; confirm whether the
    start day was meant to be included.
    """
    current = datetime.datetime.strptime(str(start), _DATE_FORMAT)
    last = datetime.datetime.strptime(str(end), _DATE_FORMAT)
    datelist = []
    while current < last:
        current += datetime.timedelta(days=1)
        datelist.append(current.strftime(_DATE_FORMAT))
    return datelist


def pao(start, end, url):
    """Download all rows for the date range and save them as an .xls file.

    The workbook is saved in the current directory as
    ``<company>_<start>_<end>.xls``, where the company name is taken from
    the last page-count request.

    Raises:
        ValueError: the range yields no dates (``end`` not after ``start``).
        requests.RequestException: a page-count request failed twice.
    """
    dates = Date_list_generation(start, end)
    if not dates:
        # Without at least one request there is no company name to build
        # the file name from; fail before doing any work.
        raise ValueError('no dates to fetch between %s and %s' % (start, end))

    import requests
    from xlwt import Workbook

    book = Workbook(encoding='utf-8')
    sheet1 = book.add_sheet('Sheet 1')
    for column, title in enumerate(_COLUMN_TITLES):
        sheet1.write(0, column, title)

    records = []
    compname = None
    for date in dates:
        pagenumber, compname = pageNumber(url, date)
        for page in range(1, int(pagenumber) + 1):
            try:
                rows = shuju(url, date, page)
            except requests.RequestException as exc:
                # Best effort: report the page and move on.  Nothing is
                # appended, so an earlier page's rows are never duplicated.
                print('%s %s failed: %s' % (date, page, exc))
                continue
            print('%s %s' % (date, page))
            records.extend(rows)
            time.sleep(0.8)  # be gentle with the server
        time.sleep(0.8)

    for row_index, record in enumerate(records, 1):
        for column, value in enumerate(record):
            sheet1.write(row_index, column, value)

    tablename = "%s_%s_%s.xls" % (compname, start, end)
    book.save(tablename)


if __name__ == "__main__":
    start = "2017-05-01"
    end = "2017-06-01"
    url = ""  # fill in the company's monitoring-data page address
    pao(start, end, url)


&&&&&

免责声明:本站发布的内容(图片、视频和文字)以原创、来自互联网转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:ts@56dr.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。