"""
Utility functions for OGC services management.

Functions:
    remove_identical_lines(input_file, output_file):
        Removes identical lines from a CSV file.
    get_ogc_services_for_canton(canton_organization, csv_file, canton_short, rows=1000):
        Retrieves a list of all WMS, WFS, and WMTS services for a specified canton from
        the Open Data Swiss API and writes the results to a CSV file.
    extract_urls_with_getcapabilities(source_url, csv_file, canton):
        Extracts URLs containing "GetCapabilities" from a given source URL and stores
        the results in a CSV file.
    delete_matching_lines(file1_path, file2_path):
        Deletes lines in the first file that are also present in the second file.
"""
import csv

import requests
from bs4 import BeautifulSoup


def remove_identical_lines(input_file, output_file):
    """
    Removes identical lines from a CSV file, keeping the first occurrence of each row.

    Parameters:
    input_file (str): The path to the input CSV file.
    output_file (str): The path to the output CSV file without identical lines.

    Returns:
    None

    Example usage:
    input_file = "input.csv"
    output_file = "output.csv"
    remove_identical_lines(input_file, output_file)
    """
    # Set of rows seen so far, used for O(1) duplicate lookups
    seen = set()
    with open(input_file, mode='r', newline='', encoding='utf-8') as infile, \
            open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        for row in reader:
            key = tuple(row)  # rows are lists, which are unhashable; use a tuple as key
            if key not in seen:
                seen.add(key)
                writer.writerow(row)  # write unique rows in their original order
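

# A minimal, self-contained demonstration of the dedup behaviour: the header row
# and first occurrences survive, later duplicates are dropped. The helper name,
# file names, and sample rows are illustrative assumptions, not part of the module.
def _demo_remove_identical_lines(tmp_in="demo_in.csv", tmp_out="demo_out.csv"):
    with open(tmp_in, mode='w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows([
            ["Canton", "URL"],
            ["KT_FR", "https://example.org/wms?request=GetCapabilities"],
            ["KT_FR", "https://example.org/wms?request=GetCapabilities"],  # duplicate
        ])
    remove_identical_lines(tmp_in, tmp_out)
    with open(tmp_out, newline='', encoding='utf-8') as f:
        print(list(csv.reader(f)))  # header plus one unique row remain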


def get_ogc_services_for_canton(canton_organization, csv_file, canton_short, rows=1000):
    """
    Retrieves a list of all WMS, WFS, and WMTS services for a specified canton from the
    Open Data Swiss API and writes the canton abbreviation and service URL of each
    result to a CSV file.

    Parameters:
    canton_organization (str): The organization name of the canton to filter datasets
        (e.g., 'geoinformation_kanton_freiburg').
    csv_file (str): The path to the CSV file to store the results.
    canton_short (str): The abbreviation of the canton (e.g., "KT_FR").
    rows (int): The maximum number of datasets to return. Defaults to 1000.

    Returns:
    None

    Example usage:
    canton_organization = 'geoinformation_kanton_freiburg'
    csv_file = "ogc_services.csv"
    canton_short = "KT_FR"
    get_ogc_services_for_canton(canton_organization, csv_file, canton_short, rows=1000)
    """
    url = "https://ckan.opendata.swiss/api/3/action/package_search"
    # Filter datasets by publishing organization and restrict results to OGC formats
    params = {
        'fq': f'organization:{canton_organization} AND (res_format:WMS OR res_format:WFS OR res_format:WMTS)',
        'rows': rows
    }
    response = requests.get(url, params=params)
    if response.status_code != 200:
        raise Exception(f"Error {response.status_code}: {response.text}")
    data = response.json()

    # Collect every WMS/WFS/WMTS resource from the matching datasets
    ogc_services = []
    for result in data['result']['results']:
        for resource in result['resources']:
            if resource['format'].lower() in ['wms', 'wfs', 'wmts']:
                ogc_services.append({
                    'title': result['title'],
                    'url': resource['url'],
                    'format': resource['format'],
                    'description': result.get('notes', ''),
                })

    # Write the canton abbreviation and service URL of each resource to the CSV file
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, quoting=csv.QUOTE_ALL)
        writer.writerow(["Canton", "URL"])  # Write the header row
        for service in ogc_services:
            writer.writerow([canton_short, service["url"]])
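

# If a canton publishes more datasets than fit in a single response, CKAN's
# package_search endpoint can be paged with its 'start' parameter. This is a
# minimal sketch under that assumption; the helper name and the page size of 100
# are illustrative choices, not part of the module's API.
def _iter_package_search_results(fq, page_size=100):
    """Yield each matching dataset dict, fetching one page at a time."""
    url = "https://ckan.opendata.swiss/api/3/action/package_search"
    start = 0
    while True:
        response = requests.get(url, params={'fq': fq, 'rows': page_size, 'start': start})
        response.raise_for_status()
        results = response.json()['result']['results']
        if not results:  # an empty page means all matches have been fetched
            return
        yield from results
        start += page_size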


def extract_urls_with_getcapabilities(source_url, csv_file, canton):
    """
    Extracts URLs containing "GetCapabilities" from a given source URL and stores the
    results in a CSV file.

    Parameters:
    source_url (str): The URL to parse and extract URLs from.
    csv_file (str): The path to the CSV file to store the results.
    canton (str): The canton abbreviation (e.g., "KT_ZG").

    Returns:
    None

    Example usage:
    source_url = "https://zg.ch/de/planen-bauen/geoinformation/geoinformationen-nutzen/geoinformationen-von-a-bis-z#WThemenkatalog"
    csv_file = "ogc_services.csv"
    canton = "KT_ZG"
    extract_urls_with_getcapabilities(source_url, csv_file, canton)
    """
    # Fetch the HTML content of the page and fail early on HTTP errors
    response = requests.get(source_url)
    response.raise_for_status()

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all link targets containing "GetCapabilities"
    urls_with_get_capabilities = [
        link.get('href')
        for link in soup.find_all('a', href=True)
        if 'GetCapabilities' in link.get('href')
    ]

    # Write the data to the CSV file
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, quoting=csv.QUOTE_ALL)
        writer.writerow(["Canton", "URL"])  # Write the header row
        for url in urls_with_get_capabilities:
            writer.writerow([canton, url])
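

# Pages sometimes link to GetCapabilities endpoints with relative hrefs, which the
# extraction above would store as-is. A small variant that resolves them against
# the source page; urljoin leaves absolute URLs untouched, so it is safe for both
# cases. The helper name is an illustrative assumption.
def _absolute_getcapabilities_urls(source_url, html_text):
    """Return absolute URLs for every link whose href contains "GetCapabilities"."""
    from urllib.parse import urljoin  # stdlib; imported locally to keep the sketch self-contained
    soup = BeautifulSoup(html_text, 'html.parser')
    return [
        urljoin(source_url, link['href'])
        for link in soup.find_all('a', href=True)
        if 'GetCapabilities' in link['href']
    ]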


def delete_matching_lines(file1_path, file2_path):
    """
    Deletes lines in file1 that are also present in file2.

    Parameters:
    file1_path (str): Path to the first file (modified in place).
    file2_path (str): Path to the second file.

    Returns:
    None

    Example usage:
    delete_matching_lines('path/to/file1.txt', 'path/to/file2.txt')
    """
    # Read file2 and store its lines in a set for quick lookup
    with open(file2_path, 'r', encoding='utf-8') as file2:
        file2_lines = set(file2.readlines())

    # Read file1 and keep only the lines that are not in file2
    with open(file1_path, 'r', encoding='utf-8') as file1:
        file1_lines = file1.readlines()
    new_file1_lines = [line for line in file1_lines if line not in file2_lines]

    # Write the filtered content back to file1
    with open(file1_path, 'w', encoding='utf-8') as file1:
        file1.writelines(new_file1_lines)
    print(f"Lines from {file1_path} that are also in {file2_path} have been deleted.")