これの続きです kiito.hatenablog.com
"""Scrape daily past-weather data from Yahoo! Weather Japan into a CSV.

Builds one URL per calendar day from 2010-10-12 through 2015-08-31 for each
configured prefectural-capital location code, fetches the daily-summary table
from each page, and writes one CSV row per day (URL first, then 13 values).
"""
import csv
import datetime
import time
import urllib.request

from bs4 import BeautifulSoup

# First and last dates available for the scrape (inclusive).
START_DATE = datetime.date(2010, 10, 12)
END_DATE = datetime.date(2015, 8, 31)

BASE_URL = 'http://weather.yahoo.co.jp/weather/jp/past/'

# Prefectural-capital location codes on Yahoo! Weather.
# NOTE: some placements are debatable (e.g. Saitama, Gifu).
# Only Tokyo for now — scraping all 47 prefectures in one run would put too
# much load on the server, so the full list is kept disabled below.
PREF_CODES = ['13/4410']
"""
pref_codes= ['1b/1400', '2/3110', '3/3310', '4/3410', '5/3210', '6/3510', '7/3610', '8/4010', '9/4110', '10/4210', '11/4330', '12/4510', '13/4410', '14/4610', '15/5410', '16/5510', '17/5610', '18/5710', '19/4910', '20/4810', '21/5220', '22/5040', '23/5110', '24/5310', '25/6020', '26/6110', '27/6200', '28/6310', '29/6410', '30/6510', '31/6910', '32/6810', '33/6610', '34/6710', '35/8120', '36/7110', '37/7200', '38/7310', '39/7410', '40/8210', '41/8510', '42/8410', '43/8610', '44/8310', '45/8710', '46/8810', '47/9110']
"""


def iter_dates(start, end):
    """Yield [year, month, day] for every calendar day from start to end.

    Both endpoints are inclusive. Using datetime.date arithmetic replaces the
    original hand-written month-length/leap-year filters and is guaranteed to
    produce only valid dates, in chronological order.
    """
    day = start
    step = datetime.timedelta(days=1)
    while day <= end:
        yield [day.year, day.month, day.day]
        day += step


def make_query(date):
    """Return the URL query string '?c=YEAR&m=MONTH&d=DAY' for [y, m, d]."""
    year, month, day = [str(x) for x in date]
    return '?c=' + year + '&m=' + month + '&d=' + day


def html_parser(url):
    """Fetch *url* and return the 13 data values from its daily-summary table.

    Parses the fourth <table> of the page body and keeps only the <td> cells
    that hold values (the fixed index list below); the sample output suggests
    these are weather, temperatures, humidity, wind, pressure, precipitation,
    sunrise/sunset — NOTE(review): labels inferred from sample rows, confirm
    against the live page layout.
    """
    with urllib.request.urlopen(url) as response:
        html = response.read()
    # Explicit parser avoids bs4's GuessedAtParserWarning and makes parsing
    # deterministic regardless of which parsers are installed.
    soup = BeautifulSoup(html, 'html.parser')
    cells = soup.body.findAll('table')[3].findAll('td')
    values = [cell.small.string for cell in cells]
    # Only these cell positions contain data; the rest are row labels.
    return [values[i] for i in [1, 4, 6, 8, 10, 13, 15, 18, 20, 22, 24, 26, 28]]


def main():
    """Build all URLs, scrape each one, and append rows to output.csv."""
    queries = [make_query(d) for d in iter_dates(START_DATE, END_DATE)]
    urls = [BASE_URL + code + '/detail.html' + query
            for code in PREF_CODES
            for query in queries]

    with open('output.csv', 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='|',
                               quoting=csv.QUOTE_MINIMAL)
        for url in urls:
            row = [url]
            row.extend(html_parser(url))
            csvwriter.writerow(row)
            # Scraping etiquette: keep at least one second between requests
            # so we don't overload the server.
            time.sleep(1)


if __name__ == '__main__':
    main()
こんな感じのcsvが返ってきます
http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=12,晴れ,25.8℃,19.6℃,18.4℃,70%,東南東,5m/s,1008.8hPa,1013hPa,0.0mm,---,5:44,17:10 http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=13,曇り,26.1℃,20.8℃,16.9℃,59%,東北東,3m/s,1005.4hPa,1009.6hPa,0.0mm,---,5:45,17:09 http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=14,晴れ,23.4℃,19.5℃,14.8℃,60%,東,2m/s,1005.5hPa,1009.7hPa,0.0mm,---,5:46,17:08 http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=15,晴れ,23.6℃,19.3℃,15.3℃,61%,南,2m/s,1004.1hPa,1008.3hPa,0.0mm,---,5:47,17:06 http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=16,晴れ,24.8℃,18.6℃,13.2℃,51%,南,4m/s,1012.4hPa,1016.6hPa,0.0mm,---,5:48,17:05 http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=17,晴れ,23.3℃,18.5℃,11.9℃,49%,北東,1m/s,1014.7hPa,1018.9hPa,0.0mm,---,5:49,17:04 http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=18,晴れ,22.7℃,17.1℃,9.8℃,46%,東北東,2m/s,1019.4hPa,1023.7hPa,0.0mm,---,5:50,17:03 http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=19,曇り,20.6℃,16.7℃,12.4℃,61%,北東,3m/s,1021.1hPa,1025.4hPa,0.0mm,---,5:50,17:01 http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=20,曇り,19.8℃,17.9℃,13.0℃,65%,北北東,2m/s,1015hPa,1019.3hPa,0.0mm,---,5:51,17:00 http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=21,曇り,19.4℃,17.6℃,15.3℃,81%,北,2m/s,1009.9hPa,1014.2hPa,0.0mm,---,5:52,16:59