歩いたら休め

If the implementation is easy to explain, it may be a good idea.

【Python】Yahoo!天気をクロールして東京の過去のお天気データを取れました

これの続きです kiito.hatenablog.com

import calendar
import csv
import time
import urllib.request

from bs4 import BeautifulSoup

# Candidate year / month / day components; impossible or out-of-range
# combinations are discarded when `dates` is built below.
years = range(2010, 2016)
months = range(1, 13)
days = range(1, 32)

# Inclusive (year, month, day) bounds of the period to crawl.
first_date = (2010, 10, 12)
last_date = (2015, 8, 31)

# Keep only real calendar days that fall inside the crawl period.
# calendar.monthrange() returns (weekday of the 1st, number of days in the
# month), so month lengths and leap years are correct for any `years` range
# instead of relying on a hard-coded leap year.
# The result is a list of [year, month, day] lists (not a one-shot filter
# iterator), so it can be inspected or iterated more than once.
dates = [
    [y, m, d]
    for y in years
    for m in months
    for d in days
    if d <= calendar.monthrange(y, m)[1]
    and first_date <= (y, m, d) <= last_date
]
# Build the query string for one date.
def make_query(date):
    """Return the URL query string for a [year, month, day] sequence.

    Each element is converted with ``str()``; the first three are placed
    into the ``c``, ``m`` and ``d`` parameters respectively.
    """
    parts = list(map(str, date))
    return '?c={}&m={}&d={}'.format(parts[0], parts[1], parts[2])
queries = [make_query(date) for date in dates]

# Build the URL path segment for each prefecture's capital city.
# For some prefectures (e.g. Saitama, Gifu) the chosen location may be
# slightly off.
pref_codes = ['13/4410']
# NOTE: scraping every prefecture in one go seemed like a bad idea, so only
# the single code above is used; the full list is kept below for reference.
"""
pref_codes= ['1b/1400', '2/3110', '3/3310', '4/3410', '5/3210',
             '6/3510', '7/3610', '8/4010', '9/4110', '10/4210',
             '11/4330', '12/4510', '13/4410', '14/4610', '15/5410',
             '16/5510', '17/5610', '18/5710', '19/4910', '20/4810', 
             '21/5220', '22/5040', '23/5110', '24/5310', '25/6020', 
             '26/6110', '27/6200', '28/6310', '29/6410', '30/6510',
             '31/6910', '32/6810', '33/6610', '34/6710', '35/8120',
             '36/7110', '37/7200', '38/7310', '39/7410', '40/8210',
             '41/8510', '42/8410', '43/8610', '44/8310', '45/8710',
             '46/8810', '47/9110']
"""

# Combine every prefecture code with every date query into the part of the
# URL that follows the base address.
params = [
    pref + '/detail.html' + query
    for pref in pref_codes
    for query in queries
]

# Lazily prefix each parameter string with the base address to get the
# full URLs (consumed once by the crawl loop).
base_url = 'http://weather.yahoo.co.jp/weather/jp/past/'
urls = (base_url + str(param) for param in params)

# Given a URL, fetch the page and return the interesting cells as a list.
def html_parser(url, parser='html.parser'):
    """Download ``url`` and extract 13 cell values from its fourth table.

    Args:
        url: Address of the page to fetch with ``urllib.request.urlopen``.
        parser: Name of the Beautiful Soup parser to use. Naming it
            explicitly keeps the parse tree (and therefore the table and
            cell positions used below) independent of which optional
            parser libraries happen to be installed. Defaults to the
            standard-library ``'html.parser'``.

    Returns:
        A list of 13 values: the ``.small.string`` of the ``<td>`` cells at
        fixed positions inside the fourth ``<table>`` of the page body.

    Raises:
        IndexError: If the body has fewer than four tables, or the table
            has fewer cells than the positions selected below.
        AttributeError: If a selected cell has no ``<small>`` child.
    """
    # Positions, within the table's <td> list, of the cells to keep.
    wanted = (1, 4, 6, 8, 10, 13, 15, 18, 20, 22, 24, 26, 28)

    with urllib.request.urlopen(url) as response:
        html = response.read()

    soup = BeautifulSoup(html, parser)
    cells = soup.body.find_all('table')[3].find_all('td')
    # Only the selected cells are dereferenced, so unrelated cells without a
    # <small> child cannot make the extraction fail.
    return [cells[i].small.string for i in wanted]

# Write one CSV row per URL: the URL itself followed by the values scraped
# from that page.
with open('output.csv', 'w', newline='') as out_file:
    writer = csv.writer(
        out_file,
        delimiter=',',
        quotechar='|',
        quoting=csv.QUOTE_MINIMAL,
    )
    for url in urls:
        writer.writerow([url] + list(html_parser(url)))
        # Scraping etiquette reportedly calls for at most one request per
        # second (hammering the site all at once puts load on the server).
        time.sleep(1)

こんな感じのcsvが返ってきます

http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=12,晴れ,25.8℃,19.6℃,18.4℃,70%,東南東,5m/s,1008.8hPa,1013hPa,0.0mm,---,5:44,17:10
http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=13,曇り,26.1℃,20.8℃,16.9℃,59%,東北東,3m/s,1005.4hPa,1009.6hPa,0.0mm,---,5:45,17:09
http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=14,晴れ,23.4℃,19.5℃,14.8℃,60%,東,2m/s,1005.5hPa,1009.7hPa,0.0mm,---,5:46,17:08
http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=15,晴れ,23.6℃,19.3℃,15.3℃,61%,南,2m/s,1004.1hPa,1008.3hPa,0.0mm,---,5:47,17:06
http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=16,晴れ,24.8℃,18.6℃,13.2℃,51%,南,4m/s,1012.4hPa,1016.6hPa,0.0mm,---,5:48,17:05
http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=17,晴れ,23.3℃,18.5℃,11.9℃,49%,北東,1m/s,1014.7hPa,1018.9hPa,0.0mm,---,5:49,17:04
http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=18,晴れ,22.7℃,17.1℃,9.8℃,46%,東北東,2m/s,1019.4hPa,1023.7hPa,0.0mm,---,5:50,17:03
http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=19,曇り,20.6℃,16.7℃,12.4℃,61%,北東,3m/s,1021.1hPa,1025.4hPa,0.0mm,---,5:50,17:01
http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=20,曇り,19.8℃,17.9℃,13.0℃,65%,北北東,2m/s,1015hPa,1019.3hPa,0.0mm,---,5:51,17:00
http://weather.yahoo.co.jp/weather/jp/past/13/4410/detail.html?c=2010&m=10&d=21,曇り,19.4℃,17.6℃,15.3℃,81%,北,2m/s,1009.9hPa,1014.2hPa,0.0mm,---,5:52,16:59