From 1246dc373ae657733854621d77647653248a1a56 Mon Sep 17 00:00:00 2001 From: colttaine Date: Sun, 5 Mar 2023 18:33:51 +1100 Subject: [PATCH] Added macrotrend scraping --- conf/macrotrends.txt | 6 + masterscraper/__init__.py | 169 ++++++++++++++++++++++---- masterscraper/macrotrends/__init__.py | 118 ++++++++++++++++++ masterscraper/wikipedia/__init__.py | 9 +- scrape_all.py | 12 ++ scraper.py => scrape_list.py | 0 scrape_single.py | 11 ++ 7 files changed, 299 insertions(+), 26 deletions(-) create mode 100644 conf/macrotrends.txt create mode 100644 masterscraper/macrotrends/__init__.py create mode 100644 scrape_all.py rename scraper.py => scrape_list.py (100%) create mode 100644 scrape_single.py diff --git a/conf/macrotrends.txt b/conf/macrotrends.txt new file mode 100644 index 0000000..886ac44 --- /dev/null +++ b/conf/macrotrends.txt @@ -0,0 +1,6 @@ +https://www.macrotrends.net/countries/CHN/china/population +https://www.macrotrends.net/countries/CHN/china/healthcare-spending +https://www.macrotrends.net/countries/CHN/china/life-expectancy +https://www.macrotrends.net/countries/CHN/china/unemployment-rate +https://www.macrotrends.net/countries/CHN/china/gdp-per-capita +https://www.macrotrends.net/countries/BRA/brazil/gdp-per-capita diff --git a/masterscraper/__init__.py b/masterscraper/__init__.py index 828062a..f0222c6 100644 --- a/masterscraper/__init__.py +++ b/masterscraper/__init__.py @@ -43,12 +43,16 @@ class scrape: #--------[ Scrape Constructor Object ]--------# def __init__(self, url): + + print('\n[{0}]'.format(url)) + self.meta = { "name" : None, "description" : None, "units" : None, "year" : None, "notes" : [], + "id" : None, "type" : None, "scope" : None, "category" : None, @@ -83,6 +87,9 @@ class scrape: #--------[ Get Metadata ]--------# def get_meta(self): + # Break if scrape contains no data + if len(self.data) <= 1: return(1) + # Process Name self.meta['name'] = self.meta['name'].lower() self.meta['name'] = re.sub('and\ dependencies ','',self.meta['name']) @@ -103,10 +110,10 @@ class scrape: key.lower().find('dependency') >=0 ): key_name.append('country.name') elif(key.lower().find('year') >=0): - key_name.append('country.name') + key_name.append('year') else: - tmp_key = self.meta['name'] - tmp_key = tmp_key + ' ' + key + + tmp_key = key tmp_key = tmp_key.lower() tmp_key = re.sub('\[.*\]', '', tmp_key) @@ -120,6 +127,10 @@ class scrape: tmp_key = tmp_key.strip() tmp_key = tmp_key.replace(' ','.') + + if tmp_key != self.meta['name'].lower().replace(' ','.'): + tmp_key = self.meta['name'].lower().replace(' ','.') + '.' + tmp_key + key_name.append( tmp_key ) self.data_info.append( key_name ) @@ -170,6 +181,11 @@ class scrape: for key in self.data_info[0]: if re.match('\d\d\d\d', key): key_year.append( key ) + elif 'year' in self.data_info[1]: + y1 = self.data[1][self.data_info[1].index('year')] + y2 = self.data[-1][self.data_info[1].index('year')] + if y1 <= y2: key_year.append( '{0}-{1}'.format(y1,y2) ) + if y1 > y2: key_year.append( '{0}-{1}'.format(y2,y1) ) else: key_year.append( date.today().strftime('%Y') ) self.data_info.append( key_year ) @@ -201,22 +217,99 @@ class scrape: # Get Category search = self.meta['name'].join(self.data_info[0]).lower().strip() + + #--------[ Geographic ]--------# if( search.find('area') >=0 or search.find('km2') >=0): self.meta['category'] = 'geographic' self.meta['subcategory'] = 'area' - elif( search.find('depression') >= 0 or + + #--------[ Demographic ]-------- + elif( search.find('population') >=0 ): + self.meta['category'] = 'demogrpahic' + + elif( search.find('birth') >=0 or + search.find('fertility') >=0 ): + self.meta['category'] = 'demogrpahic' + self.meta['subcategory'] = 'fertility' + + #--------[ Health ]--------# + elif( search.find('life expectancy') >=0 or + search.find('death') >=0 or + search.find('suicide') >=0 or + search.find('mortality') >=0 ): + self.meta['category'] = 'health' + self.meta['subcategory'] = 'mortality' + + elif( search.find('depression') >=0 or search.find('anxiety') >=0 ): self.meta['category'] = 'health' self.meta['subcategory'] = 'psychology' - elif( search.find('economic') >= 0 or - search.find('gdp') >=0 ): + + elif( search.find('smoking') >= 0 or + search.find('alcohol') >=0 ): + self.meta['category'] = 'health' + self.meta['subcategory'] = 'drugs' + + #--------[ Economic ]--------# + elif( search.find('gdp') >=0 and + search.find('trade') <0 and + search.find('health') <0 ): self.meta['category'] = 'economic' self.meta['subcategory'] = 'gdp' - elif( search.find('development') >= 0 or + + elif( search.find('gni') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'gni' + + elif( search.find('debt') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'debt' + + elif( search.find('inflation') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'inflation' + + elif( search.find('health') >=0 and + search.find('spend') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'welfare' + + elif( search.find('manufature') >=0 or + search.find('business') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'business' + + elif( search.find('import') >=0 or + search.find('export') >=0 or + search.find('invest') >=0 or + search.find('tarrif') >=0 or + search.find('trade') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'trade' + + elif( search.find('unemployment') >=0 or + search.find('labor') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'labor-force' + + #--------[ Education ]--------# + elif( search.find('education') >=0 or + search.find('literacy') >=0 ): + self.meta['category'] = 'education' + + #--------[ Development ]--------# + elif( search.find('development') >=0 or search.find('competitive') >=0 ): - self.meta['category'] = 'technology' - self.meta['subcategory'] = 'development' + self.meta['category'] = 'development' + + #--------[ Crime ]--------# + elif( search.find('crime') >=0 or + search.find('homocide') >=0 or + search.find('murder') >=0 ): + self.meta['category'] = 'development' + + #--------[ Uncategorised ]--------# else: self.meta['category'] = 'uncategorised' @@ -232,6 +325,10 @@ class scrape: #--------[ Clean Scrape Data ]--------# def clean(self): + + # Break if scrape contains no data + if len(self.data) <= 1: return(1) + for x in range(1, len(self.data)): for y in range(0, len(self.data[x])): self.data[x][y] = self.data[x][y] @@ -259,7 +356,6 @@ class scrape: if self.data[x][y].is_integer(): self.data[x][y] = int(self.data[x][y]) - # Convert non-entries to null if isinstance(self.data[x][y], str): if( self.data[x][y].lower().find('not determined') >= 0 or @@ -272,11 +368,12 @@ class scrape: self.data[x][y] = None + #--------[ Save Scrape Data ]--------# def save(self): - print('\n', self.meta['sources']) - + # Break if scrape contains no data + if len(self.data) <= 1: return(1) key_main = 0 for i in range(0, len(self.data_info[1])): @@ -291,17 +388,22 @@ class scrape: #--------[ Generate Filename ]--------# filename = self.data_info[1][key_data].replace('.','-') - filepath = 'data/{0}/{1}'.format(self.meta['type'], self.meta['category']) - if self.meta['subcategory'] != None: filepath = filepath + '/' + self.meta['subcategory'] - if len(self.data[0]) > 4: - filepath = filepath + '/' + self.meta['name'].lower().replace(' ','-') - if not os.path.exists(filepath): - os.makedirs(filepath) + filepath = 'data/{0}'.format(self.meta['type']) + if self.meta['type'] == 'historical': filepath += '/' + self.meta['scope'].lower().replace(' ','-') + filepath += '/{0}'.format(self.meta['category']) + if self.meta['subcategory'] != None: filepath += '/' + self.meta['subcategory'] + if len(self.data[0]) > 4: + filepath += '/' + self.meta['name'].lower().replace(' ','-') fullpath = filepath + '/' + filename + '.json' + #--------[ Check File Directory ]--------# + if not os.path.exists(filepath): + os.makedirs(filepath) + + #--------[ Open File ]--------# f = open(fullpath, "w") f.write('{\n') @@ -310,7 +412,9 @@ class scrape: #--------[ Update Metadata ]--------# self.meta['units'] = self.data_info[2][key_data] self.meta['year'] = self.data_info[4][key_data] - self.meta['scope'] = self.data_info[5][key_data] + + if self.meta['scope'] == None: + self.meta['scope'] = self.data_info[5][key_data] #--------[ Write Metadata ] f.write(' "metadata" : {\n') @@ -343,13 +447,30 @@ class scrape: #--------[ Write Actual Data ]--------# f.write(' "data" : [\n') - f.write(' ["{0}","{1}"],\n'.format(self.data_info[1][key_main], self.data_info[1][key_data])) + + if self.meta['type'] == 'historical': + f.write(' ["{0}","{1}"],\n'.format( + self.data_info[1][key_main], + self.meta['id'] + '.' + self.data_info[1][key_data]) + ) + else: + f.write(' ["{0}","{1}"],\n'.format( + self.data_info[1][key_main], + self.data_info[1][key_data]) + ) for row in self.data[1:]: - if row[key_data] != None: - f.write(' ["{0}",{1}]'.format(row[key_main], row[key_data])) - else: - f.write(' ["{0}",null]'.format(row[key_main])) + col_a = row[key_main] + col_b = row[key_data] + + if isinstance(col_a, str): col_a = '"{0}"'.format(col_a) + if isinstance(col_b, str): col_b = '"{0}"'.format(col_b) + + if col_a == None: col_a = 'null' + if col_b == None: col_b = 'null' + + f.write(' [{0},{1}]'.format(col_a, col_b)) + if row != self.data[-1]: f.write(',\n') else: f.write('\n') f.write(' ]\n') diff --git a/masterscraper/macrotrends/__init__.py b/masterscraper/macrotrends/__init__.py new file mode 100644 index 0000000..a7df950 --- /dev/null +++ b/masterscraper/macrotrends/__init__.py @@ -0,0 +1,118 @@ +#!/usr/bin/python3 + +import requests +import pandas as pd +import re +from bs4 import BeautifulSoup +from datetime import date + + + +def getpage(url): + #--------[ Get Page From URL ]--------# + headers = { + 'Access-Control-Allow-Origin': '*', + 'Access-Control-Allow-Methods': 'GET', + 'Access-Control-Allow-Headers': 'Content-Type', + 'Access-Control-Max-Age': '3600', + 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0' + } + page = requests.get(url, headers) + soup = BeautifulSoup(page.content, 'html.parser') + return soup + + + +def scrapelist(): + soup = getpage('https://www.macrotrends.net/countries/topic-overview') + + # Get URL list of global metrics + links = [] + for table in soup.find_all('div', class_='col-xs-3'): + for link in table.find_all('a'): + links.append('https://www.macrotrends.net' + link['href']) + + # Get full country list for each global metric + full_list = [] + for link in links: + soup = getpage(link) + table = soup.find('div', class_='col-xs-12') + for url in table.find_all('a'): + full_list.append('https://www.macrotrends.net/' + url['href']) + print(url['href']) + #break + + print('\nScraping {0} datasets from MacroTrends\n'.format( len(full_list) )) + + return full_list + + + + +def scrape(url, meta, data): + #--------[ Get Page From URL ]--------# + soup = getpage(url) + + + #--------[ Get Metadata ]--------# + url_parts = url.split('/') + + meta['name'] = url_parts[-1].replace('-',' ').title() + + meta['description'] = soup.find('h1').text + + meta['authors'].append( soup.find('span', string='Data Source: ').next_sibling.text ) + + meta['sources'].append( url ) + + meta['scope'] = url_parts[-2].replace('-',' ').title() + + meta['id'] = url_parts[-3].lower() + + + + #--------[ Extract Table ]--------# + table = soup.find('div', class_='col-xs-6') + table = table.find('table', class_='historical_data_table') + + # Get Table Headings + for tr in table.find_all('tr'): + row = [ th.text.strip() for th in tr.find_all('th')] + if len(row) > 1: + data.append( row ) + + # Get Table Data + for tr in table.find_all('tr'): + row = [ td.text.strip() for td in tr.find_all('td')] + if len(row) > 1: + data.append( row ) + + #--------[ Process Table ]-------- + + # Delete rows with incorrect number of variables + key = 0 + key_len = len(data) + while key < key_len: + if len(data[key]) != len(data[0]): + data.pop(key) + key = key-1 + key = key+1 + key_len = len(data) + + # Delete unwanted table columns + key = 0 + key_len = len(data[0]) + while key < key_len: + flag = False + if data[0][key].lower().find('rank') >=0: flag = True + if data[0][key].lower().find('change') >=0: flag = True + if data[0][key].lower().find('notes') >=0: flag = True + if data[0][key].lower().find('gap') >=0: flag = True + if data[0][key].lower().find('Δ') >=0: flag = True + if data[0][key].lower().find('growth') >=0: flag = True + if flag: + for i in range(0, len(data)): + data[i].pop(key) + key = key-1 + key = key+1 + key_len = len(data[0]) diff --git a/masterscraper/wikipedia/__init__.py b/masterscraper/wikipedia/__init__.py index 2f6b896..c4877c2 100644 --- a/masterscraper/wikipedia/__init__.py +++ b/masterscraper/wikipedia/__init__.py @@ -7,9 +7,8 @@ from bs4 import BeautifulSoup from datetime import date -def scrape(url, meta, data): - +def getpage(url): #--------[ Get Page From URL ]--------# headers = { 'Access-Control-Allow-Origin': '*', @@ -20,6 +19,12 @@ def scrape(url, meta, data): } page = requests.get(url, headers) soup = BeautifulSoup(page.content, 'html.parser') + return soup + + +def scrape(url, meta, data): + #--------[ Get Page From URL ]--------# + soup = getpage(url) #--------[ Get Metadata ]--------# diff --git a/scrape_all.py b/scrape_all.py new file mode 100644 index 0000000..dc48172 --- /dev/null +++ b/scrape_all.py @@ -0,0 +1,12 @@ +#!/usr/bin/python3 + + +import masterscraper as ms + + +scrapelist = ms.macrotrends.scrapelist() +for url in scrapelist: + scrape = ms.scrape(url) + scrape.get_meta() + scrape.clean() + scrape.save() diff --git a/scraper.py b/scrape_list.py similarity index 100% rename from scraper.py rename to scrape_list.py diff --git a/scrape_single.py b/scrape_single.py new file mode 100644 index 0000000..1851576 --- /dev/null +++ b/scrape_single.py @@ -0,0 +1,11 @@ +#!/usr/bin/python3 + + +import masterscraper as ms + + + +scrape = ms.scrape('https://www.macrotrends.net/countries/TUR/turkey/population') +scrape.get_meta() +scrape.clean() +scrape.save()