From b7cd0fcd4782656bbd2d8d71f2defa4eda715d10 Mon Sep 17 00:00:00 2001 From: RisingGeek Date: Mon, 30 Sep 2019 21:18:54 +0530 Subject: [PATCH 1/2] organization name fetched for previous years --- Scraping Code/gsoc_yearly_data_generator.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Scraping Code/gsoc_yearly_data_generator.py b/Scraping Code/gsoc_yearly_data_generator.py index 9018b64..396da93 100644 --- a/Scraping Code/gsoc_yearly_data_generator.py +++ b/Scraping Code/gsoc_yearly_data_generator.py @@ -24,13 +24,18 @@ def extraction(year): file = open("Data/GSOC_"+str(year)+"_Data.ods", "w") with requests.Session() as c: - if(year!=2019): page = c.get("https://summerofcode.withgoogle.com/archive/" + str(year) + "/organizations/") - else: page = c.get("https://summerofcode.withgoogle.com/organizations/#6230025286713344") + if(year==2019): page = c.get("https://summerofcode.withgoogle.com/organizations/#6230025286713344") + elif(year>=2016): page = c.get("https://summerofcode.withgoogle.com/archive/" + str(year) + "/organizations/") + else: page = c.get("https://www.google-melange.com/archive/gsoc/" + str(year)) plain_text = page.text soup = BeautifulSoup(plain_text, "lxml") dict_year = {} gsoc_year_organizations = [] - for name in soup.findAll('h4',{'class': 'organization-card__name font-black-54'}): + gsoc_organizations = soup.findAll('h4', {'class': 'organization-card__name font-black-54'}) + if(year<2016): + gsoc_organizations = soup.find('ul', {'class': 'mdl-list'}).findChildren('a') + + for name in gsoc_organizations: title = name.string gsoc_year_organizations.append(title) dict_year[title] = [] From 9e04c5a844e2670b8bf8f396073602a582ca5ae4 Mon Sep 17 00:00:00 2001 From: RisingGeek Date: Mon, 30 Sep 2019 23:50:02 +0530 Subject: [PATCH 2/2] organization links for previous year --- Scraping Code/gsoc_yearly_data_generator.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Scraping Code/gsoc_yearly_data_generator.py b/Scraping Code/gsoc_yearly_data_generator.py index 396da93..b75f516 100644 --- a/Scraping Code/gsoc_yearly_data_generator.py +++ b/Scraping Code/gsoc_yearly_data_generator.py @@ -39,11 +39,16 @@ def extraction(year): title = name.string gsoc_year_organizations.append(title) dict_year[title] = [] - + i=0 - for link in soup.findAll('a',{'class': 'organization-card__link'}): + links = soup.findAll('a',{'class': 'organization-card__link'}) + domain = "https://summerofcode.withgoogle.com" + if(year<2016): + links = soup.find('ul', {'class': 'mdl-list'}).findChildren('a') + domain = "https://www.google-melange.com" + for link in links: hrefs = link.get('href') - dict_year[gsoc_year_organizations[i]].append('https://summerofcode.withgoogle.com'+hrefs) + dict_year[gsoc_year_organizations[i]].append(domain+hrefs) i+=1 count = i