# scraper_functions.py
from bs4 import BeautifulSoup
import io
import os
import pickle
import sys
import urllib.request

import pandas as pd

# BeautifulSoup trees are deeply nested, so raise the recursion limit or
# pickle will fail to serialize the soup objects.
sys.setrecursionlimit(100000)


def documentDownloader(urlString, alwaysDownload=False, storeData=False, dataPath="raw_data"):
    """Return the BeautifulSoup for a document, with optional disk caching.

    If a pickled copy already exists under dataPath it is loaded instead of
    re-downloading, unless alwaysDownload is True. When storeData is True,
    the freshly parsed soup is pickled into dataPath for later runs.
    """
    # Encode the URL into a filename that is safe to store on disk.
    documentPath = os.path.join(dataPath, urlString.replace("/", "|"))
    soup = None
    if not os.path.exists(dataPath):
        os.mkdir(dataPath)
    if alwaysDownload or not os.path.exists(documentPath):
        print("Downloading data from", urlString)
        html = urllib.request.urlopen(urlString)
        soup = BeautifulSoup(html, "html.parser")
        print("Data downloaded")
        if storeData:
            with open(documentPath, "wb") as dataFile:
                pickle.dump(soup, dataFile)
    else:
        print("Data already cached, loading from disk. URL:", urlString)
        with open(documentPath, "rb") as dataFile:
            soup = pickle.load(dataFile)
    return soup
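

# Example usage of documentDownloader (a minimal sketch; the URL below is a
# hypothetical placeholder, not one taken from this project):
#
#     soup = documentDownloader("https://example.org/station/X1H001.aspx",
#                               storeData=True)
#     print(soup.title.get_text())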


def getStationData(stationInterfaceURL, alwaysDownload=False, storeData=False, dataPath="raw_data"):
    """Return a dictionary describing a station, with keys:

    dfPrimary: pandas DataFrame of the station's point surface-water-level data
    dfFlow:    pandas DataFrame of the station's daily flow data
    lat:       the latitude of the station
    long:      the longitude of the station
    station:   the name of the station
    river:     the river that the station is on
    """
    outputDict = {}
    stationInterface = documentDownloader(stationInterfaceURL, alwaysDownload=alwaysDownload,
                                          storeData=storeData, dataPath=dataPath)
    baseURL = stationInterfaceURL[:stationInterfaceURL.rfind("/") + 1]
    outputDict["lat"] = stationInterface.find(id="tbLat").get("value")
    outputDict["long"] = stationInterface.find(id="tbLong").get("value")
    outputDict["station"] = stationInterface.find(id="tbStation").get("value")
    outputDict["river"] = stationInterface.find(id="labPlace").get_text().strip()
    # The query string below reproduces what the station page's form submits;
    # it was derived by analysing the form's output.
    stationDataURLPrimary = baseURL + "HyData.aspx?Station=" + stationInterface.find(id="tbStation").get("value") \
        + stationInterface.find("input", {"name": "ctl05"}).get("value") \
        + "&DataType=Point&StartDT=" + stationInterface.find("input", {"name": "ctl06"}).get("value") \
        + "&EndDT=" + stationInterface.find("input", {"name": "tbEnd_0"}).get("value") \
        + "&SiteType=" + stationInterface.find("input", {"name": "tbSiteType"}).get("value")
    stationDataPrimary = documentDownloader(stationDataURLPrimary, alwaysDownload=alwaysDownload,
                                            storeData=storeData, dataPath=dataPath)
    # The data table lives inside a <pre> tag; ignore everything else on the page.
    foundPre = stationDataPrimary.find("pre")
    if foundPre is not None:
        stationDataStrPrimary = foundPre.get_text()
        # Drop the preamble: the table proper starts after this heading.
        substrStartKeyword = "Surface Water Level\n"
        stationDataStrPrimary = stationDataStrPrimary[stationDataStrPrimary.find(substrStartKeyword) + len(substrStartKeyword):]
        # Wrap the text in a StringIO buffer so pandas' read_fwf can turn the
        # fixed-width text table into a DataFrame.
        strBuffer = io.StringIO(stationDataStrPrimary)
        outputDict["dfPrimary"] = pd.read_fwf(strBuffer)
        strBuffer.close()
    else:
        outputDict["dfPrimary"] = None
    # Same form-derived URL, but requesting the daily flow data instead.
    stationDataURLFlow = baseURL + "HyData.aspx?Station=" + stationInterface.find(id="tbStation").get("value") \
        + stationInterface.find("input", {"name": "ctl05"}).get("value") \
        + "&DataType=Daily&StartDT=" + stationInterface.find("input", {"name": "ctl06"}).get("value") \
        + "&EndDT=" + stationInterface.find("input", {"name": "tbEnd_0"}).get("value") \
        + "&SiteType=" + stationInterface.find("input", {"name": "tbSiteType"}).get("value")
    stationDataFlow = documentDownloader(stationDataURLFlow, alwaysDownload=alwaysDownload,
                                         storeData=storeData, dataPath=dataPath)
    # Again, only the <pre> tag holds the table.
    foundPre = stationDataFlow.find("pre")
    if foundPre is not None:
        stationDataStrFlow = foundPre.get_text()
        # The flow table starts at its header row.
        substrStartKeyword = "DATE D AVG F/R QUAL"
        stationDataStrFlow = stationDataStrFlow[stationDataStrFlow.find(substrStartKeyword):]
        strBuffer = io.StringIO(stationDataStrFlow)
        outputDict["dfFlow"] = pd.read_fwf(strBuffer)
        strBuffer.close()
    else:
        outputDict["dfFlow"] = None
    return outputDict
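

# Minimal demonstration of getStationData. A sketch under assumptions: the
# station-interface URL below is a hypothetical placeholder shaped like the
# pages this scraper targets, not a URL confirmed by the project.
if __name__ == "__main__":
    stationURL = "https://example.org/Hydrology/HyDataSets.aspx?Station=X1H001"  # hypothetical
    data = getStationData(stationURL, storeData=True)
    print("Station", data["station"], "on the", data["river"],
          "at lat", data["lat"], "long", data["long"])
    if data["dfPrimary"] is not None:
        print(data["dfPrimary"].head())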