Broke program apart into smaller files
This commit is contained in:
parent
2cf406bdaa
commit
8cd30b40da
18 changed files with 786 additions and 720 deletions
|
@ -1,44 +1,26 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
import re
|
||||
import os
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import date
|
||||
|
||||
from . import wikipedia
|
||||
from . import macrotrends
|
||||
|
||||
|
||||
|
||||
# Check If String Is Number
|
||||
def isfloat(num):
    """Return True when *num* can be converted with ``float()``, else False.

    Only the errors ``float()`` raises for unsuitable input are treated as
    "not a number"; anything else (for example ``KeyboardInterrupt``)
    propagates to the caller instead of being silently swallowed.
    """
    try:
        float(num)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
|
||||
|
||||
|
||||
|
||||
# Load URL Scrape List
|
||||
def scrapelist(filename):
    """Return the lines of *filename*, each stripped of surrounding whitespace.

    Blank lines are kept (as empty strings) so the result has one entry per
    line of the file.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(filename, 'r') as f:
        return [line.strip() for line in f]
|
||||
|
||||
|
||||
|
||||
class scrape:
|
||||
#--------[ Global Variables ]--------#
|
||||
meta = {} # Metadata
|
||||
data = [] # Actual Data
|
||||
data_info = [] # Variable information to split data table to seperate JSON files
|
||||
#--------[ Import Module Parts ]--------#
|
||||
from . import wikipedia
|
||||
from . import macrotrends
|
||||
|
||||
from .core.meta_name import meta_name
|
||||
from .core.meta_search import meta_search
|
||||
from .core.meta_keys import meta_keys
|
||||
from .core.meta_year import meta_year
|
||||
from .core.meta_units import meta_units
|
||||
from .core.meta_multiplyer import meta_multiplyer
|
||||
from .core.meta_scope import meta_scope
|
||||
from .core.meta_category import meta_category
|
||||
from .core.meta_type import meta_type
|
||||
from .core.meta_tags import meta_tags
|
||||
|
||||
from .core.get_list import get_list
|
||||
from .core.show import show
|
||||
from .core.clean import clean
|
||||
from .core.save import save
|
||||
|
||||
|
||||
#--------[ Scrape Constructor Object ]--------#
|
||||
|
@ -47,28 +29,25 @@ class scrape:
|
|||
print('\n[{0}]'.format(url))
|
||||
|
||||
self.meta = {
|
||||
"name" : None,
|
||||
"description" : None,
|
||||
"units" : None,
|
||||
"year" : None,
|
||||
"notes" : [],
|
||||
"id" : None,
|
||||
"type" : None,
|
||||
"scope" : None,
|
||||
"category" : None,
|
||||
"subcategory" : None,
|
||||
"tags" : [],
|
||||
"authors" : [],
|
||||
"sources" : []
|
||||
"name" : None, # Variable/Set name
|
||||
"description" : None, # Description of variable/set
|
||||
"units" : None, # Units of variable
|
||||
"year" : None, # Year(s) of variable
|
||||
"notes" : [], # Any notes related to the variable/set
|
||||
"id" : None, # Official ID of applicable
|
||||
"type" : None, # Type of variable/set
|
||||
"scope" : None, # Scope of the variable/set
|
||||
"category" : None, # Main category of the variable/set
|
||||
"subcategory" : None, # Subcategory of the variable/set
|
||||
"tags" : [], # Search tags applicable to the variable/set
|
||||
"authors" : [], # Person or organisation responsible for the data
|
||||
"sources" : [] # URL Sources for the data
|
||||
}
|
||||
self.data = []
|
||||
self.data_info = []
|
||||
|
||||
if url.find('wikipedia.org') >=0:
|
||||
wikipedia.scrape(url, self.meta, self.data)
|
||||
if url.find('macrotrends.net') >= 0:
|
||||
macrotrends.scrape(url, self.meta, self.data)
|
||||
self.data = [] # The actual data set
|
||||
self.info = {} # Temoporary metadata extracted from the data set
|
||||
|
||||
if url.find('wikipedia.org') >=0: self.wikipedia.scrape(self, url )
|
||||
if url.find('macrotrends.net') >=0: self.macrotrends.scrape(self, url )
|
||||
|
||||
|
||||
#--------[ Scrape Deconstructor ]--------#
|
||||
|
@ -76,623 +55,17 @@ class scrape:
|
|||
pass
|
||||
|
||||
|
||||
|
||||
#--------[ Show Scrape Data ]--------#
|
||||
def show(self):
    """Print the metadata dictionary, then every data row on its own line."""
    for entry in (self.meta, *self.data):
        print(entry)
|
||||
|
||||
|
||||
#--------[ Get Metadata ]--------#
|
||||
def get_meta(self):
|
||||
|
||||
# Break if scrape contains no data
|
||||
if len(self.data) <= 1: return(1)
|
||||
|
||||
# Process Name
|
||||
self.meta['name'] = self.meta['name'].lower()
|
||||
self.meta['name'] = re.sub('and\ dependencies ','',self.meta['name'])
|
||||
self.meta['name'] = re.sub('list\ of\ ','',self.meta['name'])
|
||||
self.meta['name'] = re.sub(',','',self.meta['name'])
|
||||
self.meta['name'] = self.meta['name'].strip()
|
||||
self.meta['name'] = self.meta['name'].title()
|
||||
|
||||
self.meta['name'] = self.meta['name'].replace('Gdp', 'GDP')
|
||||
self.meta['name'] = self.meta['name'].replace('Gni', 'GNI')
|
||||
self.meta['name'] = self.meta['name'].replace('Gnp', 'GNP')
|
||||
|
||||
|
||||
# Get Key Names Search Spaces
|
||||
#self.data_info.append( [key for key in self.data[0]])
|
||||
key_search = []
|
||||
for i in range(0, len(self.data[0])):
|
||||
key_search.append(
|
||||
self.meta['name'].lower() + ' ' +
|
||||
self.data[0][i].lower() + ' ' +
|
||||
self.data[1][i].lower()
|
||||
)
|
||||
|
||||
self.data_info.append( key_search )
|
||||
|
||||
|
||||
# Process Variable Key Names
|
||||
key_name = []
|
||||
for key in self.data[0]:
|
||||
if(key.lower().find('country') >=0 or
|
||||
key.lower().find('countries') >=0 or
|
||||
key.lower().find('dependency') >=0 ):
|
||||
key_name.append('country.name')
|
||||
elif(key.lower().find('year') >=0):
|
||||
key_name.append('year')
|
||||
elif(key.lower().find('date') >=0):
|
||||
key_name.append('date')
|
||||
else:
|
||||
|
||||
tmp_key = key
|
||||
tmp_key = tmp_key.lower()
|
||||
|
||||
tmp_key = re.sub(',', '', tmp_key)
|
||||
tmp_key = re.sub('\[.*\]', '', tmp_key)
|
||||
tmp_key = re.sub('\(.*\)', '', tmp_key)
|
||||
tmp_key = re.sub('km2', '', tmp_key)
|
||||
tmp_key = re.sub('km', '', tmp_key)
|
||||
tmp_key = re.sub('mi2', '', tmp_key)
|
||||
tmp_key = re.sub('hectares', '', tmp_key)
|
||||
tmp_key = re.sub('\ in\ ', '', tmp_key)
|
||||
tmp_key = re.sub('US\ \$', '', tmp_key)
|
||||
tmp_key = re.sub('\$', 'dollars', tmp_key)
|
||||
tmp_key = re.sub('\%', 'percent', tmp_key)
|
||||
|
||||
tmp_key = re.sub('and\ dependencies ', '', tmp_key)
|
||||
tmp_key = re.sub('list\ of\ countries\ by\ ', '', tmp_key)
|
||||
|
||||
tmp_key = re.sub('thousands\ of', '' ,tmp_key)
|
||||
tmp_key = re.sub('millions\ of', '' ,tmp_key)
|
||||
tmp_key = re.sub('billions\ of', '' ,tmp_key)
|
||||
|
||||
tmp_key = re.sub('per\ 100k\ live\ births', '', tmp_key)
|
||||
tmp_key = re.sub('per\ 100k\ population', '', tmp_key)
|
||||
|
||||
tmp_key = tmp_key.strip()
|
||||
tmp_key = tmp_key.replace(' ','.')
|
||||
|
||||
if tmp_key.find(self.meta['name'].lower().replace(' ','.')) <0:
|
||||
if tmp_key != '':
|
||||
tmp_key = self.meta['name'].lower().replace(' ','.') + '.' + tmp_key
|
||||
else:
|
||||
tmp_key = self.meta['name'].lower().replace(' ','.')
|
||||
|
||||
key_name.append( tmp_key )
|
||||
self.data_info.append( key_name )
|
||||
|
||||
|
||||
# Process Variable Unit Type
|
||||
key_unit = []
|
||||
for key in self.data_info[0]:
|
||||
|
||||
if( key.find('percent') >=0 or
|
||||
key.find('perc') >=0 or
|
||||
key.find('%') >=0 ):
|
||||
key_unit.append('%')
|
||||
|
||||
elif( key.find('dollar') >=0 or
|
||||
key.find('$') >=0 ):
|
||||
key_unit.append('$')
|
||||
|
||||
elif( key.find('euro') >=0 or
|
||||
key.find('€') >=0 ):
|
||||
key_unit.append('€')
|
||||
|
||||
elif( key.find('area') >=0 or
|
||||
key.find('land') >=0 or
|
||||
key.find('km2') >=0 or
|
||||
key.find('km²') >=0 or
|
||||
key.find('mi2') >=0 or
|
||||
key.find('mi²') >=0 or
|
||||
key.find('ha') >=0 or
|
||||
key.find('hectares') >=0 ):
|
||||
key_unit.append('km²')
|
||||
|
||||
elif( key.find('country') >=0 or
|
||||
key.find('countries') >=0 or
|
||||
key.find('dependencies') >=0 ):
|
||||
key_unit.append('countries')
|
||||
|
||||
elif( key.find('index') >=0 or
|
||||
key.find('score') >=0 or
|
||||
key.find('report') >=0 ):
|
||||
key_unit.append('index')
|
||||
|
||||
elif( key.find('population') >=0 and
|
||||
key.find('density') <0 and
|
||||
key.find('access') <0 and
|
||||
key.find('crime') <0 and
|
||||
key.find('murder') <0 ):
|
||||
key_unit.append('people')
|
||||
|
||||
elif( key.find('population') >=0 and
|
||||
key.find('density') >=0 ):
|
||||
key_unit.append('people/km²')
|
||||
|
||||
elif( (key.find('death') >=0 or
|
||||
key.find('mortality') >=0) and
|
||||
key.find('rate') >=0 and
|
||||
key.find('infant') <0 and
|
||||
key.find('maternal') <0 ):
|
||||
key_unit.append('deaths/1k population')
|
||||
|
||||
elif( key.find('mortality') >=0 and
|
||||
key.find('rate') >=0 and
|
||||
key.find('infant') >=0 ):
|
||||
key_unit.append('deaths/1k live births')
|
||||
|
||||
elif( key.find('mortality') >=0 and
|
||||
key.find('rate') >=0 and
|
||||
key.find('maternal') >=0 ):
|
||||
key_unit.append('deaths/100k live births')
|
||||
|
||||
elif( key.find('suicide') >=0 and
|
||||
key.find('rate') >=0 ):
|
||||
key_unit.append('deaths/100k population')
|
||||
|
||||
elif( key.find('life') >=0 and
|
||||
key.find('expectancy') >=0 ):
|
||||
key_unit.append('years')
|
||||
|
||||
elif( key.find('birth') >=0 and
|
||||
key.find('rate') >=0 ):
|
||||
key_unit.append('births/1k population')
|
||||
|
||||
elif( key.find('fertility') >=0 and
|
||||
key.find('rate') >=0 ):
|
||||
key_unit.append('children/women')
|
||||
|
||||
elif( key.find('marriage') >=0 and
|
||||
key.find('rate') >=0 ):
|
||||
key_unit.append('marriages/1k population')
|
||||
|
||||
elif( key.find('divorce') >=0 and
|
||||
key.find('rate') >=0 ):
|
||||
key_unit.append('divorces/1k population')
|
||||
|
||||
elif( key.find('crime') >=0 and
|
||||
key.find('rate') >=0 ):
|
||||
key_unit.append('crimes/100k population')
|
||||
|
||||
elif( key.find('murder') >=0 and
|
||||
key.find('rate') >=0 ):
|
||||
key_unit.append('murders/100k population')
|
||||
|
||||
elif( key.find('military') >=0 and
|
||||
key.find('size') >=0 ):
|
||||
key_unit.append('personel')
|
||||
|
||||
elif( key.find('immigration') >=0 or
|
||||
key.find('migration') >=0 or
|
||||
key.find('refugee') >=0 ):
|
||||
key_unit.append('people')
|
||||
|
||||
elif( key.find('emissions') >=0 ):
|
||||
key_unit.append('tonnes')
|
||||
|
||||
else:
|
||||
key_unit.append('unkown')
|
||||
self.data_info.append( key_unit )
|
||||
|
||||
# Process Variable Multiplyer
|
||||
key_multiplyer = []
|
||||
for key in self.data_info[0]:
|
||||
|
||||
if( key.find('%') >=0 or key.find('percent') >=0 ):
|
||||
key_multiplyer.append( 0.01 )
|
||||
|
||||
elif( re.search('\$.*k', key) ): key_multiplyer.append(1000)
|
||||
elif( re.search('\$.*m', key) ): key_multiplyer.append(1000000)
|
||||
elif( re.search('\$.*b', key) ): key_multiplyer.append(1000000000)
|
||||
|
||||
elif( key.find('thousands of') >=0 ):
|
||||
key_multiplyer.append(1000)
|
||||
elif( key.find('millions of') >=0 ):
|
||||
key_multiplyer.append(1000000)
|
||||
elif( key.find('bilions of') >=0 ):
|
||||
key_multiplyer.append(1000000000)
|
||||
|
||||
elif( key.find('mi2') >=0 or key.find('mi²') >=0 ):
|
||||
key_multiplyer.append(2.59)
|
||||
elif( key.find('hectare') >=0 ):
|
||||
key_multiplyer.append(0.01)
|
||||
|
||||
else:
|
||||
key_multiplyer.append( 1.0 )
|
||||
self.data_info.append( key_multiplyer )
|
||||
|
||||
|
||||
# Get Variable Year
|
||||
key_year = []
|
||||
for key in self.data[0]:
|
||||
if re.match('\d\d\d\d', key):
|
||||
key_year.append( key )
|
||||
elif 'year' in self.data_info[1]:
|
||||
y1 = self.data[1][self.data_info[1].index('year')]
|
||||
y2 = self.data[-1][self.data_info[1].index('year')]
|
||||
if y1 <= y2: key_year.append( '{0}-{1}'.format(y1,y2) )
|
||||
if y1 > y2: key_year.append( '{0}-{1}'.format(y2,y1) )
|
||||
elif 'date' in self.data_info[1]:
|
||||
y1 = self.data[1][self.data_info[1].index('date')].split('-')[0]
|
||||
y2 = self.data[-1][self.data_info[1].index('date')].split('-')[0]
|
||||
if y1 <= y2: key_year.append( '{0}-{1}'.format(y1,y2) )
|
||||
if y1 > y2: key_year.append( '{0}-{1}'.format(y2,y1) )
|
||||
|
||||
else:
|
||||
key_year.append( date.today().strftime('%Y') )
|
||||
self.data_info.append( key_year )
|
||||
|
||||
|
||||
# Get Variable Type
|
||||
for key in self.data_info[1]:
|
||||
if key == 'country.name': self.meta['type'] = 'global'
|
||||
elif key == 'year': self.meta['type'] = 'historical'
|
||||
elif key == 'date': self.meta['type'] = 'historical'
|
||||
elif key == 'us.county.fips': self.meta['type'] = 'regional'
|
||||
elif key == 'uk.constituency.name': self.meta['type'] = 'regional'
|
||||
if self.meta['type'] == None: self.meta['type'] = 'unkown'
|
||||
|
||||
|
||||
# Get Variable Scope
|
||||
key_scope = []
|
||||
for key in self.data_info[1]:
|
||||
if key.find('male') >=0: key_scope.append( 'male' )
|
||||
elif key.find('female') >=0: key_scope.append( 'female' )
|
||||
elif key.find('black') >=0: key_scope.append( 'black' )
|
||||
elif key.find('white') >=0: key_scope.append( 'white' )
|
||||
elif key.find('asian') >=0: key_scope.append( 'asian' )
|
||||
elif key.find('native') >=0: key_scope.append( 'native' )
|
||||
elif key.find('urban') >=0: key_scope.append( 'urban' )
|
||||
elif key.find('rural') >=0: key_scope.append( 'rural' )
|
||||
else: key_scope.append( self.meta['type'] )
|
||||
self.data_info.append( key_scope )
|
||||
|
||||
|
||||
# Get Variable Category
|
||||
search = self.meta['name'].join(self.data_info[0]).lower().strip()
|
||||
|
||||
|
||||
#--------[ Geographic ]--------#
|
||||
if( search.find('area') >=0 or
|
||||
search.find('km2') >=0 ):
|
||||
self.meta['category'] = 'geographic'
|
||||
self.meta['subcategory'] = 'area'
|
||||
|
||||
elif( (search.find('arable') >=0 or
|
||||
search.find('farm') >=0 or
|
||||
search.find('forrested') >=0) and
|
||||
search.find('land') >=0 ):
|
||||
self.meta['category'] = 'geographic'
|
||||
self.meta['subcategory'] = 'land'
|
||||
|
||||
|
||||
#--------[ Demographic ]--------
|
||||
elif( search.find('population') >=0 and
|
||||
search.find('access') <0 and
|
||||
search.find('murder') <0 and
|
||||
search.find('crime') <0 and
|
||||
search.find('hunger') <0 and
|
||||
search.find('migrat') <0 and
|
||||
search.find('migrant') <0 ):
|
||||
self.meta['category'] = 'demogrpahic'
|
||||
self.meta['subcategory'] = 'population'
|
||||
|
||||
elif( (search.find('birth') >=0 or
|
||||
search.find('fertility') >=0) and
|
||||
search.find('mortality') <0 ):
|
||||
self.meta['category'] = 'demogrpahic'
|
||||
self.meta['subcategory'] = 'fertility'
|
||||
|
||||
elif( search.find('immigrat') >=0 or
|
||||
search.find('migrat') >=0 or
|
||||
search.find('migrant') >=0 or
|
||||
search.find('refugee') >=0 or
|
||||
search.find('asylum') >=0 ):
|
||||
self.meta['category'] = 'demogrpahic'
|
||||
self.meta['subcategory'] = 'migration'
|
||||
|
||||
|
||||
#--------[ Health ]--------#
|
||||
elif( search.find('life expectancy') >=0 or
|
||||
search.find('death') >=0 or
|
||||
search.find('suicide') >=0 or
|
||||
search.find('mortality') >=0 ):
|
||||
self.meta['category'] = 'health'
|
||||
self.meta['subcategory'] = 'mortality'
|
||||
|
||||
elif( search.find('depression') >=0 or
|
||||
search.find('anxiety') >=0 ):
|
||||
self.meta['category'] = 'health'
|
||||
self.meta['subcategory'] = 'psychology'
|
||||
|
||||
elif( search.find('smoking') >= 0 or
|
||||
search.find('alcohol') >=0 ):
|
||||
self.meta['category'] = 'health'
|
||||
self.meta['subcategory'] = 'drugs'
|
||||
|
||||
|
||||
#--------[ Economic ]--------#
|
||||
elif( search.find('gdp') >=0 and
|
||||
search.find('trade') <0 and
|
||||
search.find('import') <0 and
|
||||
search.find('export') <0 and
|
||||
search.find('invest') <0 and
|
||||
search.find('spending') <0 and
|
||||
search.find('manufactur') <0 and
|
||||
search.find('military') <0 and
|
||||
search.find('education') <0 and
|
||||
search.find('health') <0 ):
|
||||
self.meta['category'] = 'economic'
|
||||
self.meta['subcategory'] = 'gdp'
|
||||
|
||||
elif( search.find('gni') >=0 or
|
||||
search.find('gnp') >=0 ):
|
||||
self.meta['category'] = 'economic'
|
||||
self.meta['subcategory'] = 'gni'
|
||||
|
||||
elif( search.find('debt') >=0 ):
|
||||
self.meta['category'] = 'economic'
|
||||
self.meta['subcategory'] = 'debt'
|
||||
|
||||
elif( search.find('inflation') >=0 ):
|
||||
self.meta['category'] = 'economic'
|
||||
self.meta['subcategory'] = 'inflation'
|
||||
|
||||
elif( search.find('health') >=0 and
|
||||
search.find('spend') >=0 ):
|
||||
self.meta['category'] = 'economic'
|
||||
self.meta['subcategory'] = 'welfare'
|
||||
|
||||
elif( search.find('manufactur') >=0 or
|
||||
search.find('business') >=0 or
|
||||
search.find('tourism') >=0 ):
|
||||
self.meta['category'] = 'economic'
|
||||
self.meta['subcategory'] = 'business'
|
||||
|
||||
elif( search.find('import') >=0 or
|
||||
search.find('export') >=0 or
|
||||
search.find('invest') >=0 or
|
||||
search.find('tariff') >=0 or
|
||||
search.find('trade') >=0 ):
|
||||
self.meta['category'] = 'economic'
|
||||
self.meta['subcategory'] = 'trade'
|
||||
|
||||
elif( search.find('unemployment') >=0 or
|
||||
search.find('labor') >=0 ):
|
||||
self.meta['category'] = 'economic'
|
||||
self.meta['subcategory'] = 'labor-force'
|
||||
|
||||
|
||||
#--------[ Development ]--------#
|
||||
elif( search.find('education') >=0 or
|
||||
search.find('literacy') >=0 ):
|
||||
self.meta['category'] = 'development'
|
||||
self.meta['subcategory'] = 'education'
|
||||
|
||||
elif( search.find('electricity access') >=0 or
|
||||
search.find('water access') >=0 ):
|
||||
self.meta['category'] = 'development'
|
||||
self.meta['subcategory'] = 'infrastructure'
|
||||
|
||||
elif( search.find('development') >=0 or
|
||||
search.find('competitive') >=0 ):
|
||||
self.meta['category'] = 'development'
|
||||
self.meta['subcategory'] = 'technology'
|
||||
|
||||
elif( search.find('hunger') >=0 or
|
||||
search.find('poverty') >=0 ):
|
||||
self.meta['category'] = 'development'
|
||||
self.meta['subcategory'] = 'quality-of-life'
|
||||
|
||||
elif( search.find('co2') >=0 or
|
||||
search.find('ghg') >=0 or
|
||||
search.find('emissions') >=0 ):
|
||||
self.meta['category'] = 'development'
|
||||
self.meta['subcategory'] = 'emissions'
|
||||
|
||||
elif( search.find('fuel') >=0 or
|
||||
search.find('coal') >=0 or
|
||||
search.find('energy') >=0 or
|
||||
search.find('renewable') >=0 ):
|
||||
self.meta['category'] = 'development'
|
||||
self.meta['subcategory'] = 'energy'
|
||||
|
||||
|
||||
#--------[ Crime ]--------#
|
||||
elif( search.find('crime') >=0 or
|
||||
search.find('homocide') >=0 or
|
||||
search.find('murder') >=0 ):
|
||||
self.meta['category'] = 'crime'
|
||||
|
||||
|
||||
#--------[ Military ]--------#
|
||||
elif( search.find('military') >=0 ):
|
||||
self.meta['category'] = 'military'
|
||||
|
||||
|
||||
#--------[ Uncategorised ]--------#
|
||||
else:
|
||||
self.meta['category'] = 'uncategorised'
|
||||
|
||||
|
||||
# Get Tags
|
||||
if not self.meta['type'] in self.meta['tags']: self.meta['tags'].append(self.meta['type'])
|
||||
if not self.meta['category'] in self.meta['tags']: self.meta['tags'].append(self.meta['category'])
|
||||
if not self.meta['subcategory'] in self.meta['tags']: self.meta['tags'].append(self.meta['subcategory'])
|
||||
for scope in key_scope:
|
||||
if not scope in self.meta['tags']:
|
||||
self.meta['tags'].append(scope)
|
||||
if scope == 'female' or scope == 'male':
|
||||
self.meta['tags'].append('gender')
|
||||
if scope == 'black' or scope == 'white' or scope == 'asian' or scope == 'native':
|
||||
self.meta['tags'].append('race')
|
||||
|
||||
if 'None' in self.meta['tags']:
|
||||
self.meta['tags'].pop( self.meta['tags'].index('None') )
|
||||
|
||||
|
||||
|
||||
#--------[ Clean Scrape Data ]--------#
|
||||
def clean(self):
    """Normalise every string cell below the header row of ``self.data`` in place.

    For each string cell:
      * inline notes in ``[...]`` / ``(...)`` and thousands separators are removed;
      * if the cell contains a digit, everything except digits, ``.`` and ``-``
        is dropped and, when the result parses as a number, it is multiplied
        by the column multiplier ``self.data_info[3][column]`` and stored as an
        ``int`` (whole values) or ``float``;
      * placeholders such as ``-``, an empty string or "unknown" become ``None``.

    Cells that are not strings are left untouched, as is the header row.

    Returns:
        1 when there is no data besides the header row, otherwise None.
    """
    # Break if scrape contains no data
    if len(self.data) <= 1:
        return 1

    # Lower-case markers meaning "no value". The historical misspelling
    # 'unkown' is kept next to the correct one for backward compatibility.
    non_entries = ('not determined', 'negligible', 'unknown', 'unkown')

    for row in self.data[1:]:
        for y, cell in enumerate(row):
            if not isinstance(cell, str):
                continue

            # Remove any inline notes from data (greedy on purpose: everything
            # from the first opening to the last closing bracket is a note).
            cell = re.sub(r'\[.*\]', '', cell)
            cell = re.sub(r'\(.*\)', '', cell)
            cell = cell.replace(',', '').strip()

            # Convert numerical strings to numbers. Requiring a digit keeps
            # words such as "nan" or "inf" from being parsed as floats.
            if any(ch.isdigit() for ch in cell):
                cell = ''.join(ch for ch in cell if ch.isdigit() or ch in '.-')
                try:
                    number = float(cell)
                except ValueError:
                    pass  # e.g. "1990-2000": keep the filtered text
                else:
                    # Apply variable multiplyer, then collapse whole floats
                    number *= self.data_info[3][y]
                    row[y] = int(number) if number.is_integer() else number
                    continue

            # Convert non-entries to null
            lowered = cell.lower()
            if cell in ('', '-') or any(marker in lowered for marker in non_entries):
                cell = None
            row[y] = cell
|
||||
|
||||
|
||||
|
||||
#--------[ Save Scrape Data ]--------#
|
||||
def save(self):
    """Write one JSON file per data column, pairing it with the main key column.

    The main key column is the first column whose processed key name
    (``self.data_info[1]``) is ``country.name`` or ``year``; column 0 is used
    when neither is present. Every other column is written to::

        data/<type>[/<scope>]/<category>[/<subcategory>][/<name>]/<key>.json

    where ``<scope>`` is only added for ``historical`` sets and ``<name>``
    only when the table has more than four columns.

    Side effects: ``self.meta['units']`` and ``self.meta['year']`` are updated
    for each column, ``self.meta['scope']`` is filled in when still unset,
    missing directories are created and a summary line is printed per file.

    Returns:
        1 when there is no data besides the header row, otherwise None.
    """
    # Local import: the module does not import json at file level.
    import json

    # Break if scrape contains no data
    if len(self.data) <= 1:
        return 1

    def encode(value, compact=False):
        """Serialise *value* as JSON so quotes/backslashes are escaped."""
        if compact:
            return json.dumps(value, ensure_ascii=False, separators=(',', ':'))
        return json.dumps(value, ensure_ascii=False)

    key_names = self.data_info[1]

    # Locate the main key column among the processed key names.
    key_main = 0
    for i, key in enumerate(key_names):
        if key in ('country.name', 'year'):
            key_main = i
            break

    for key_data in range(len(self.data[0])):
        if key_data == key_main:
            continue

        #--------[ Update Metadata ]--------#
        # Done before building the path because historical sets use the scope
        # as a directory name.
        self.meta['units'] = self.data_info[2][key_data]
        self.meta['year'] = self.data_info[4][key_data]
        if self.meta['scope'] is None:
            self.meta['scope'] = self.data_info[5][key_data]

        #--------[ Generate Filename ]--------#
        filename = key_names[key_data].replace('.', '-')

        filepath = 'data/{0}'.format(self.meta['type'])
        if self.meta['type'] == 'historical':
            filepath += '/' + self.meta['scope'].lower().replace(' ', '-')
        filepath += '/{0}'.format(self.meta['category'])
        if self.meta['subcategory'] is not None:
            filepath += '/' + self.meta['subcategory']
        if len(self.data[0]) > 4:
            filepath += '/' + self.meta['name'].lower().replace(' ', '-')

        fullpath = filepath + '/' + filename + '.json'

        #--------[ Check File Directory ]--------#
        os.makedirs(filepath, exist_ok=True)

        #--------[ Serialise Metadata ]--------#
        meta_lines = []
        for name, value in self.meta.items():
            if isinstance(value, list) and value and name != 'tags':
                # Non-empty lists other than tags: one item per line.
                items = ',\n'.join(' ' + encode(item) for item in value)
                meta_lines.append(' {0} : [\n{1}\n ]'.format(encode(name), items))
            elif isinstance(value, list):
                # Tags and empty lists stay on a single line.
                meta_lines.append(' {0} : {1}'.format(encode(name), encode(value, compact=True)))
            else:
                meta_lines.append(' {0} : {1}'.format(encode(name), encode(value)))

        #--------[ Serialise Actual Data ]--------#
        value_key = key_names[key_data]
        if self.meta['type'] == 'historical' and self.meta['id'] is not None:
            value_key = self.meta['id'] + '.' + value_key

        rows = [[key_names[key_main], value_key]]
        rows.extend([row[key_main], row[key_data]] for row in self.data[1:])
        row_lines = [' ' + encode(row, compact=True) for row in rows]

        document = (
            '{\n'
            ' "metadata" : {\n' + ',\n'.join(meta_lines) + '\n },\n'
            ' "data" : [\n' + ',\n'.join(row_lines) + '\n ]\n'
            '}\n'
        )

        #--------[ Final Result ]--------#
        # JSON text is UTF-8 by specification; units such as 'km²' need it.
        with open(fullpath, 'w', encoding='utf-8') as f:
            f.write(document)

        print(' [{0} data points] -> {1}'.format(len(self.data) - 1, fullpath))
|
||||
if len(self.data) <= 1: return(-1) # Break if no data
|
||||
|
||||
self.meta_name() # Clean set name
|
||||
self.meta_search() # Create search-space
|
||||
self.meta_keys() # Extract variable key-name
|
||||
self.meta_year() # Extract variable year
|
||||
self.meta_units() # Extract variable unit
|
||||
self.meta_multiplyer() # Extract variable multiplyer
|
||||
self.meta_scope() # Extract variable scope
|
||||
self.meta_category() # Extract set category
|
||||
self.meta_type() # Extract set type
|
||||
self.meta_tags() # Extract set tag
|
||||
|
|
56
masterscraper/core/clean.py
Normal file
56
masterscraper/core/clean.py
Normal file
|
@ -0,0 +1,56 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
import re
|
||||
|
||||
# Check If String Is Number
|
||||
def isfloat(num):
    """Return True when *num* can be converted with ``float()``, else False.

    Only the errors ``float()`` raises for unsuitable input are treated as
    "not a number"; anything else (for example ``KeyboardInterrupt``)
    propagates to the caller instead of being silently swallowed.
    """
    try:
        float(num)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
|
||||
|
||||
|
||||
#--------[ Clean Scrape Data ]--------#
|
||||
def clean(self):
    """Normalise every string cell below the header row of ``self.data`` in place.

    For each string cell:
      * inline notes in ``[...]`` / ``(...)`` and thousands separators are removed;
      * if the cell contains a digit, everything except digits, ``.`` and ``-``
        is dropped and, when the result parses as a number, it is multiplied
        by the column multiplier ``self.info['multiplyer'][column]`` and stored
        as an ``int`` (whole values) or ``float``;
      * placeholders such as ``-``, an empty string or "unknown" become ``None``.

    Cells that are not strings are left untouched, as is the header row.

    Returns:
        -1 when there is no data besides the header row, otherwise None.
    """
    if len(self.data) <= 1:
        return -1  # Break if no data

    # Lower-case markers meaning "no value". The historical misspelling
    # 'unkown' is kept next to the correct one for backward compatibility.
    non_entries = ('not determined', 'negligible', 'unknown', 'unkown')

    for row in self.data[1:]:
        for y, cell in enumerate(row):
            if not isinstance(cell, str):
                continue

            # Remove any inline notes from data (greedy on purpose: everything
            # from the first opening to the last closing bracket is a note).
            cell = re.sub(r'\[.*\]', '', cell)
            cell = re.sub(r'\(.*\)', '', cell)
            cell = cell.replace(',', '').strip()

            # Convert numerical strings to numbers. Requiring a digit keeps
            # words such as "nan" or "inf" from being parsed as floats.
            if any(ch.isdigit() for ch in cell):
                cell = ''.join(ch for ch in cell if ch.isdigit() or ch in '.-')
                try:
                    number = float(cell)
                except ValueError:
                    pass  # e.g. "1990-2000": keep the filtered text
                else:
                    # Apply variable multiplyer, then collapse whole floats
                    number *= self.info['multiplyer'][y]
                    row[y] = int(number) if number.is_integer() else number
                    continue

            # Convert non-entries to null
            lowered = cell.lower()
            if cell in ('', '-') or any(marker in lowered for marker in non_entries):
                cell = None
            row[y] = cell
|
10
masterscraper/core/get_list.py
Normal file
10
masterscraper/core/get_list.py
Normal file
|
@ -0,0 +1,10 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
|
||||
#--------[ Load URL Scrape List ]--------#
|
||||
def get_list(filename):
    """Return the lines of *filename*, each stripped of surrounding whitespace.

    Blank lines are kept (as empty strings) so the result has one entry per
    line of the file.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(filename, 'r') as f:
        return [line.strip() for line in f]
|
169
masterscraper/core/meta_category.py
Normal file
169
masterscraper/core/meta_category.py
Normal file
|
@ -0,0 +1,169 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
#--------[ Extract Category Information ]--------#
|
||||
def meta_category(self):
    """Derive ``self.meta['category']`` / ``['subcategory']`` from keywords.

    The search text is built from ``self.info['search']`` joined with
    ``self.meta['name']`` and lower-cased. Rules are tried top to bottom and
    the first match wins, so their order is significant. The crime, military
    and uncategorised rules set only the category and leave the subcategory
    as it was.
    """
    search = self.meta['name'].join(self.info['search']).lower().strip()

    def has(*terms):
        """Return True when any of *terms* occurs in the search text."""
        return any(term in search for term in terms)

    def assign(category, subcategory=None):
        """Store the category; store the subcategory only when one is given."""
        self.meta['category'] = category
        if subcategory is not None:
            self.meta['subcategory'] = subcategory

    #--------[ Geographic ]--------#
    if has('area', 'km2'):
        assign('geographic', 'area')

    # 'forrested' is the historical misspelling, kept for compatibility.
    elif has('arable', 'farm', 'forested', 'forrested') and has('land'):
        assign('geographic', 'land')

    #--------[ Demographic ]--------#
    elif has('population') and not has('access', 'murder', 'crime',
                                       'hunger', 'migrat', 'migrant'):
        assign('demographic', 'population')

    elif has('birth', 'fertility') and not has('mortality'):
        assign('demographic', 'fertility')

    elif has('immigrat', 'migrat', 'migrant', 'refugee', 'asylum'):
        assign('demographic', 'migration')

    #--------[ Health ]--------#
    elif has('life expectancy', 'death', 'suicide', 'mortality'):
        assign('health', 'mortality')

    elif has('depression', 'anxiety'):
        assign('health', 'psychology')

    elif has('smoking', 'alcohol'):
        assign('health', 'drugs')

    #--------[ Economic ]--------#
    elif has('gdp') and not has('trade', 'import', 'export', 'invest',
                                'spending', 'manufactur', 'military',
                                'education', 'health'):
        assign('economic', 'gdp')

    elif has('gni', 'gnp'):
        assign('economic', 'gni')

    elif has('debt'):
        assign('economic', 'debt')

    elif has('inflation'):
        assign('economic', 'inflation')

    elif has('health') and has('spend'):
        assign('economic', 'welfare')

    elif has('manufactur', 'business', 'tourism'):
        assign('economic', 'business')

    elif has('import', 'export', 'invest', 'tariff', 'trade'):
        assign('economic', 'trade')

    elif has('unemployment', 'labor'):
        assign('economic', 'labor-force')

    #--------[ Development ]--------#
    elif has('education', 'literacy'):
        assign('development', 'education')

    elif has('electricity access', 'water access'):
        assign('development', 'infrastructure')

    elif has('development', 'competitive'):
        assign('development', 'technology')

    elif has('hunger', 'poverty'):
        assign('development', 'quality-of-life')

    elif has('co2', 'ghg', 'emissions'):
        assign('development', 'emissions')

    elif has('fuel', 'coal', 'energy', 'renewable'):
        assign('development', 'energy')

    #--------[ Crime ]--------#
    # 'homocide' is the historical misspelling, kept for compatibility.
    elif has('crime', 'homicide', 'homocide', 'murder'):
        assign('crime')

    #--------[ Military ]--------#
    elif has('military'):
        assign('military')

    #--------[ Uncategorised ]--------#
    else:
        assign('uncategorised')
|
58
masterscraper/core/meta_keys.py
Normal file
58
masterscraper/core/meta_keys.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
import re
|
||||
|
||||
|
||||
#--------[ Process Variable Key Names ]--------#
|
||||
# Ordered (pattern, replacement) pairs applied to a lower-cased column header.
# Order matters: e.g. 'km2' must be stripped before 'km', and 'us $' before
# the generic '$' -> 'dollars' rewrite.
_KEY_SUBSTITUTIONS = (
    (r',', ''),
    (r'\[.*\]', ''),
    (r'\(.*\)', ''),
    (r'km2', ''),
    (r'km', ''),
    (r'mi2', ''),
    (r'hectares', ''),
    (r' in ', ''),
    # The header is lower-cased before substitution, so this pattern has to
    # be lower-case as well (the upper-case 'US $' could never match).
    (r'us \$', ''),
    (r'\$', 'dollars'),
    (r'%', 'percent'),
    (r'and dependencies ', ''),
    (r'list of countries by ', ''),
    (r'thousands of', ''),
    (r'millions of', ''),
    (r'billions of', ''),
    (r'per 100k live births', ''),
    (r'per 100k population', ''),
)


def meta_keys(self):
    """Derive a normalised key name for every column header in ``self.data[0]``.

    Headers mentioning a country/dependency, a year or a date are mapped to
    the fixed keys ``'country.name'``, ``'year'`` and ``'date'``.  Every other
    header is lower-cased, stripped of footnotes, parenthesised text, units
    and filler phrases, dot-separated, and prefixed with the dotted data-set
    name (``self.meta['name']``) unless it already contains that name.

    The resulting list (one entry per column) is stored in
    ``self.info['keys']``.
    """
    key_name = []

    for key in self.data[0]:
        lowered = key.lower()

        if ('country' in lowered or
                'countries' in lowered or
                'dependency' in lowered):
            key_name.append('country.name')
        elif 'year' in lowered:
            key_name.append('year')
        elif 'date' in lowered:
            key_name.append('date')
        else:
            tmp_key = lowered
            for pattern, replacement in _KEY_SUBSTITUTIONS:
                tmp_key = re.sub(pattern, replacement, tmp_key)
            tmp_key = tmp_key.strip().replace(' ', '.')

            # Namespace the key with the data-set name unless it is already
            # part of the key; an empty key collapses to the name itself.
            prefix = self.meta['name'].lower().replace(' ', '.')
            if prefix not in tmp_key:
                if tmp_key != '':
                    tmp_key = prefix + '.' + tmp_key
                else:
                    tmp_key = prefix

            #--------[ Add Name To Info Array ]--------#
            key_name.append(tmp_key)

    self.info['keys'] = key_name
|
36
masterscraper/core/meta_multiplyer.py
Normal file
36
masterscraper/core/meta_multiplyer.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
|
||||
import re
|
||||
|
||||
|
||||
|
||||
#--------[ Process Variable Multiplier ]--------#
|
||||
def meta_multiplyer(self):
    """Choose a scaling factor for every column's search string.

    Iterates ``self.info['search']`` and, for each entry, picks the first
    matching rule:

    * percentages            -> 0.01
    * ``$`` followed by k/m/b -> 1e3 / 1e6 / 1e9
    * "thousands/millions/billions of" -> 1e3 / 1e6 / 1e9
    * square miles           -> 2.59  (to km²)
    * hectares               -> 0.01  (to km²)
    * anything else          -> 1.0

    The resulting list is stored in ``self.info['multiplyer']``.
    """
    key_multiplyer = []
    for key in self.info['search']:

        if '%' in key or 'percent' in key:
            key_multiplyer.append(0.01)

        # A dollar sign followed (anywhere later) by k / m / b.
        elif re.search(r'\$.*k', key):
            key_multiplyer.append(1000)
        elif re.search(r'\$.*m', key):
            key_multiplyer.append(1000000)
        elif re.search(r'\$.*b', key):
            key_multiplyer.append(1000000000)

        elif 'thousands of' in key:
            key_multiplyer.append(1000)
        elif 'millions of' in key:
            key_multiplyer.append(1000000)
        # Previously spelled 'bilions of', which never matched real input.
        elif 'billions of' in key:
            key_multiplyer.append(1000000000)

        elif 'mi2' in key or 'mi²' in key:
            key_multiplyer.append(2.59)
        elif 'hectare' in key:
            key_multiplyer.append(0.01)

        else:
            key_multiplyer.append(1.0)

    self.info['multiplyer'] = key_multiplyer
|
22
masterscraper/core/meta_name.py
Normal file
22
masterscraper/core/meta_name.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
|
||||
import re
|
||||
|
||||
|
||||
|
||||
#--------[ Process Variable Set Names ]--------#
|
||||
def meta_name(self):
    """Normalise ``self.meta['name']`` into a clean, title-cased data-set name.

    The name is lower-cased, the filler phrases ``'and dependencies '`` and
    ``'list of '`` and all commas are removed, surrounding whitespace is
    stripped, and the result is title-cased.  The acronyms GDP, GNI and GNP
    are then restored to upper case.  The value is updated in place.
    """
    name = self.meta['name'].lower()

    # The removed fragments are plain literals, so str.replace is enough and
    # avoids regex escape sequences altogether.
    for filler in ('and dependencies ', 'list of ', ','):
        name = name.replace(filler, '')

    name = name.strip().title()

    # str.title() lower-cases the tail of acronyms; put them back.
    for mangled, acronym in (('Gdp', 'GDP'), ('Gni', 'GNI'), ('Gnp', 'GNP')):
        name = name.replace(mangled, acronym)

    self.meta['name'] = name
|
||||
|
||||
|
17
masterscraper/core/meta_scope.py
Normal file
17
masterscraper/core/meta_scope.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
#--------[ Get Variable Scope ]--------#
|
||||
def meta_scope(self):
    """Assign a scope label to every entry of ``self.info['search']``.

    The first keyword (in priority order) found inside the search string
    becomes the scope; ``'female'`` is tested before ``'male'`` because the
    latter is a substring of the former.  When no keyword matches, the
    data set's own ``self.meta['type']`` is used.  The resulting list is
    stored in ``self.info['scope']``.
    """
    priority = (
        'female', 'male',
        'black', 'white', 'asian', 'native',
        'urban', 'rural',
    )

    resolved = []
    for text in self.info['search']:
        hit = next((word for word in priority if word in text), None)
        resolved.append(self.meta['type'] if hit is None else hit)

    self.info['scope'] = resolved
|
15
masterscraper/core/meta_search.py
Normal file
15
masterscraper/core/meta_search.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
#--------[ Create Variable Search Space ]--------#
|
||||
def meta_search(self):
    """Build one lower-case search string per column.

    Each string is the data-set name, the column header (``self.data[0]``)
    and the column's value in the first data row (``self.data[1]``), joined
    by single spaces.  The list is stored in ``self.info['search']``.
    """
    self.info['search'] = [
        ' '.join((
            self.meta['name'].lower(),
            header.lower(),
            self.data[1][column].lower(),
        ))
        for column, header in enumerate(self.data[0])
    ]
|
24
masterscraper/core/meta_tags.py
Normal file
24
masterscraper/core/meta_tags.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
#--------[ Extract Tag Information ]--------#
|
||||
def meta_tags(self):
    """Extend ``self.meta['tags']`` with tags derived from the metadata.

    Adds, in order and without duplicates: the data-set type, category,
    subcategory, the lower-cased scope (when set), and every per-column
    scope from ``self.info['scope']``.  Gender scopes additionally add
    ``'gender'`` and race scopes add ``'race'``.  Placeholder entries
    (``None`` and the string ``'None'``) are removed afterwards.

    The tag list is modified in place.
    """
    tags = self.meta['tags']

    def add(tag):
        # Tags must stay unique: duplicates are meaningless and would also
        # confuse writers that detect the last element by value.
        if tag not in tags:
            tags.append(tag)

    add(self.meta['type'])
    add(self.meta['category'])
    add(self.meta['subcategory'])

    if self.meta['scope'] is not None:
        add(self.meta['scope'].lower())

    for scope in self.info['scope']:
        add(scope)
        if scope in ('female', 'male'):
            add('gender')
        if scope in ('black', 'white', 'asian', 'native'):
            add('race')

    # Drop every placeholder (not just the first occurrence), keeping the
    # same list object so other references to it stay valid.
    tags[:] = [tag for tag in tags if tag is not None and tag != 'None']
|
12
masterscraper/core/meta_type.py
Normal file
12
masterscraper/core/meta_type.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
#--------[ Get Variable Type ]--------#
|
||||
def meta_type(self):
    """Infer ``self.meta['type']`` from the normalised column keys.

    Each key in ``self.info['keys']`` that identifies a known index column
    sets the type; when several are present the last one wins.  If the type
    is still ``None`` afterwards it falls back to ``'unkown'``.
    """
    type_by_key = {
        'country.name': 'global',
        'year': 'historical',
        'date': 'historical',
        'us.county.fips': 'regional',
        'uk.constituency.name': 'regional',
    }

    for column_key in self.info['keys']:
        if column_key in type_by_key:
            self.meta['type'] = type_by_key[column_key]

    if self.meta['type'] is None:
        self.meta['type'] = 'unkown'
|
117
masterscraper/core/meta_units.py
Normal file
117
masterscraper/core/meta_units.py
Normal file
|
@ -0,0 +1,117 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
#--------[ Process Variable Unit Type ]--------#
|
||||
|
||||
def _has_any(text, *words):
    """Return True when at least one of *words* occurs as a substring of *text*."""
    return any(word in text for word in words)


def meta_units(self):
    """Guess the unit of every column from its search string.

    Iterates ``self.info['search']`` and applies an ordered list of keyword
    rules; the first rule that matches decides the unit.  Columns no rule
    recognises get ``'unkown'``.  The resulting list is stored in
    ``self.info['units']``.
    """
    import re  # local import: this module has no file-level imports

    key_unit = []
    for key in self.info['search']:
        rate = 'rate' in key

        if _has_any(key, 'perc', '%'):  # 'perc' also covers 'percent'
            unit = '%'

        elif _has_any(key, 'dollar', '$'):
            unit = '$'

        elif _has_any(key, 'euro', '€'):
            unit = '€'

        # 'ha' is only accepted as a standalone token: as a bare substring
        # it also matched words such as "afghanistan" or "ghana".
        # 'hectare' (singular) mirrors the test used by meta_multiplyer.
        elif (_has_any(key, 'area', 'land', 'km2', 'km²', 'mi2', 'mi²', 'hectare')
                or re.search(r'\bha\b', key)):
            unit = 'km²'

        elif _has_any(key, 'country', 'countries', 'dependencies'):
            unit = 'countries'

        elif _has_any(key, 'index', 'score', 'report'):
            unit = 'index'

        elif ('population' in key
                and not _has_any(key, 'density', 'access', 'crime', 'murder')):
            unit = 'people'

        elif 'population' in key and 'density' in key:
            unit = 'people/km²'

        elif (_has_any(key, 'death', 'mortality') and rate
                and not _has_any(key, 'infant', 'maternal')):
            unit = 'deaths/1k population'

        elif 'mortality' in key and rate and 'infant' in key:
            unit = 'deaths/1k live births'

        elif 'mortality' in key and rate and 'maternal' in key:
            unit = 'deaths/100k live births'

        elif 'suicide' in key and rate:
            unit = 'deaths/100k population'

        elif 'life' in key and 'expectancy' in key:
            unit = 'years'

        elif 'birth' in key and rate:
            unit = 'births/1k population'

        elif 'fertility' in key and rate:
            unit = 'children/women'

        elif 'marriage' in key and rate:
            unit = 'marriages/1k population'

        elif 'divorce' in key and rate:
            unit = 'divorces/1k population'

        elif 'crime' in key and rate:
            unit = 'crimes/100k population'

        elif 'murder' in key and rate:
            unit = 'murders/100k population'

        elif 'military' in key and 'size' in key:
            unit = 'personel'

        elif _has_any(key, 'immigration', 'migration', 'refugee'):
            unit = 'people'

        elif 'emissions' in key:
            unit = 'tonnes'

        else:
            unit = 'unkown'

        key_unit.append(unit)

    self.info['units'] = key_unit
|
31
masterscraper/core/meta_year.py
Normal file
31
masterscraper/core/meta_year.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
from datetime import date
|
||||
import re
|
||||
|
||||
|
||||
#--------[ Get Variable Year ]--------#
|
||||
def _year_span(first, last):
    """Format two year values as ``'<earlier>-<later>'``."""
    if first <= last:
        return '{0}-{1}'.format(first, last)
    return '{0}-{1}'.format(last, first)


def meta_year(self):
    """Determine the year (or year range) each column refers to.

    For every header in ``self.data[0]``:

    * a header starting with four digits is itself taken as the year;
    * otherwise, if the table has a ``'year'`` key, the range spanned by the
      first and last data rows of that column is used;
    * otherwise, if it has a ``'date'`` key, the same range is built from the
      part of the date before the first ``'-'``;
    * otherwise the current year is used.

    Exactly one entry per column is stored in ``self.info['year']``.
    """
    keys = self.info['keys']
    key_year = []

    for key in self.data[0]:

        if re.match(r'\d\d\d\d', key):
            key_year.append(key)

        elif 'year' in keys:
            column = keys.index('year')
            key_year.append(_year_span(self.data[1][column],
                                       self.data[-1][column]))

        elif 'date' in keys:
            column = keys.index('date')
            key_year.append(_year_span(self.data[1][column].split('-')[0],
                                       self.data[-1][column].split('-')[0]))

        else:
            key_year.append(date.today().strftime('%Y'))

    self.info['year'] = key_year
|
117
masterscraper/core/save.py
Normal file
117
masterscraper/core/save.py
Normal file
|
@ -0,0 +1,117 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
import os
|
||||
|
||||
|
||||
#--------[ Save Scrape Data ]--------#
|
||||
def _json_scalar(value):
    """Return *value* rendered as a single JSON token.

    ``None`` becomes ``null``, booleans become ``true``/``false``, strings are
    quoted and escaped, and anything else is written via ``str.format``.
    """
    import json  # local import: this module only imports ``os`` at file level

    if value is None:
        return 'null'
    if isinstance(value, bool):
        return 'true' if value else 'false'
    if isinstance(value, str):
        # json.dumps escapes quotes, backslashes and control characters;
        # ensure_ascii=False keeps characters such as '²' readable.
        return json.dumps(value, ensure_ascii=False)
    return '{0}'.format(value)


def save(self):
    """Write every data column to its own JSON file.

    One column acts as the index ("main") column: the last column whose
    normalised key is ``'country.name'`` or ``'year'``, defaulting to column
    0.  Each remaining column is written, paired with the index column, to::

        data/<type>[/<scope>]/<category>[/<subcategory>][/<name>]/<key>.json

    where ``<scope>`` is only used for historical data and ``<name>`` only
    for tables with more than four columns.  Each file holds a ``metadata``
    object (a dump of ``self.meta``) and a ``data`` array whose first row is
    the pair of key names.

    ``self.meta['units']`` and ``self.meta['year']`` are overwritten for each
    column; ``self.meta['scope']`` is filled in from the column scope when it
    is ``None``.

    Returns:
        ``-1`` when there are no data rows, otherwise ``None``.
    """
    if len(self.data) <= 1:
        return -1  # Break if no data

    # Locate the index column from the normalised keys.  (The previous
    # chained comparison ``header == 'country.name' >= 0`` tested the raw
    # header and raised TypeError whenever the equality held.)
    key_main = 0
    for i, key in enumerate(self.info['keys']):
        if key in ('country.name', 'year'):
            key_main = i

    for key_data in range(len(self.data[0])):
        if key_data == key_main:
            continue

        #--------[ Generate Filename ]--------#
        filename = self.info['keys'][key_data].replace('.', '-')

        filepath = 'data/{0}'.format(self.meta['type'])
        if self.meta['type'] == 'historical' and self.meta['scope'] is not None:
            filepath += '/' + self.meta['scope'].lower().replace(' ', '-')
        filepath += '/{0}'.format(self.meta['category'])
        if self.meta['subcategory'] is not None:
            filepath += '/' + self.meta['subcategory']
        if len(self.data[0]) > 4:
            filepath += '/' + self.meta['name'].lower().replace(' ', '-')

        fullpath = filepath + '/' + filename + '.json'

        #--------[ Check File Directory ]--------#
        os.makedirs(filepath, exist_ok=True)

        #--------[ Update Metadata ]--------#
        self.meta['units'] = self.info['units'][key_data]
        self.meta['year'] = self.info['year'][key_data]

        # NOTE(review): once set here the scope is kept for all later
        # columns of the same table - confirm that this is intended.
        if self.meta['scope'] is None:
            self.meta['scope'] = self.info['scope'][key_data]

        #--------[ Open File ]--------#
        # The context manager closes the file even if writing fails.
        with open(fullpath, 'w', encoding='utf-8') as f:
            f.write('{\n')

            #--------[ Write Metadata ]--------#
            f.write(' "metadata" : {\n')
            names = list(self.meta)
            for position, name in enumerate(names):
                value = self.meta[name]

                if isinstance(value, list):
                    # List items are always written as JSON strings.
                    items = [_json_scalar('{0}'.format(j)) for j in value]
                    if not items:
                        f.write(' "{0}" : []'.format(name))
                    elif name == 'tags':
                        f.write(' "{0}" : ['.format(name))
                        f.write(','.join(items))
                        f.write(']')
                    else:
                        f.write(' "{0}" : [\n'.format(name))
                        f.write(',\n'.join(' ' + item for item in items))
                        f.write('\n')
                        f.write(' ]')
                else:
                    f.write(' "{0}" : {1}'.format(name, _json_scalar(value)))

                # Separators are chosen by position, not by comparing values,
                # so repeated values cannot drop a comma.
                f.write(',\n' if position < len(names) - 1 else '\n')
            f.write(' },\n')

            #--------[ Write Actual Data ]--------#
            f.write(' "data" : [\n')

            data_key = self.info['keys'][key_data]
            if self.meta['type'] == 'historical':
                data_key = self.meta['id'] + '.' + data_key
            f.write(' [{0},{1}],\n'.format(
                _json_scalar(self.info['keys'][key_main]),
                _json_scalar(data_key))
            )

            f.write(',\n'.join(
                ' [{0},{1}]'.format(_json_scalar(row[key_main]),
                                    _json_scalar(row[key_data]))
                for row in self.data[1:]
            ))
            f.write('\n')
            f.write(' ]\n')

            #--------[ Final Result ]--------#
            f.write('}\n')

        print(' [{0} data points] -> {1}'.format(len(self.data)-1, fullpath))
|
9
masterscraper/core/show.py
Normal file
9
masterscraper/core/show.py
Normal file
|
@ -0,0 +1,9 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
|
||||
#--------[ Show Scrape Data ]--------#
|
||||
def show(self):
    """Print the metadata dictionary followed by every row of ``self.data``."""
    for item in (self.meta, *self.data):
        print(item)
|
|
@ -42,33 +42,32 @@ def scrapelist():
|
|||
print(url['href'])
|
||||
#break
|
||||
|
||||
print('\nScraping {0} datasets from MacroTrends\n'.format( len(full_list) ))
|
||||
print('\nScraping {0} self.datasets from MacroTrends\n'.format( len(full_list) ))
|
||||
|
||||
return full_list
|
||||
|
||||
|
||||
|
||||
|
||||
def scrape(url, meta, data):
|
||||
#--------[ Get Page From URL ]--------#
|
||||
def scrape(self, url):
|
||||
#--------[ Get Page From URL ]--------#
|
||||
soup = getpage(url)
|
||||
|
||||
|
||||
#--------[ Get Metadata ]--------#
|
||||
url_parts = url.split('/')
|
||||
|
||||
meta['name'] = url_parts[-1].replace('-',' ').title()
|
||||
self.meta['name'] = url_parts[-1].replace('-',' ').title()
|
||||
|
||||
soup_desc = getpage( 'https://www.macrotrends.net/countries/ranking/' + url.split('/')[-1] )
|
||||
meta['description'] = soup_desc.find('div',class_='navigation_tabs').find('span').text
|
||||
self.meta['description'] = soup_desc.find('div',class_='navigation_tabs').find('span').text
|
||||
|
||||
meta['authors'].append( soup.find('span', string='Data Source: ').next_sibling.text )
|
||||
self.meta['authors'].append( soup.find('span', string='Data Source: ').next_sibling.text )
|
||||
|
||||
meta['sources'].append( url )
|
||||
self.meta['sources'].append( url )
|
||||
|
||||
meta['scope'] = url_parts[-2].replace('-',' ').title()
|
||||
self.meta['scope'] = url_parts[-2].replace('-',' ').title()
|
||||
|
||||
meta['id'] = url_parts[-3].lower()
|
||||
self.meta['id'] = url_parts[-3].lower()
|
||||
|
||||
|
||||
|
||||
|
@ -80,40 +79,40 @@ def scrape(url, meta, data):
|
|||
for tr in table.find_all('tr'):
|
||||
row = [ th.text.strip() for th in tr.find_all('th')]
|
||||
if len(row) > 1:
|
||||
data.append( row )
|
||||
self.data.append( row )
|
||||
|
||||
# Get Table Data
|
||||
for tr in table.find_all('tr'):
|
||||
row = [ td.text.strip() for td in tr.find_all('td')]
|
||||
if len(row) > 1:
|
||||
data.append( row )
|
||||
self.data.append( row )
|
||||
|
||||
#--------[ Process Table ]--------
|
||||
|
||||
# Delete rows with incorrect number of variables
|
||||
key = 0
|
||||
key_len = len(data)
|
||||
key_len = len(self.data)
|
||||
while key < key_len:
|
||||
if len(data[key]) != len(data[0]):
|
||||
data.pop(key)
|
||||
if len(self.data[key]) != len(self.data[0]):
|
||||
self.data.pop(key)
|
||||
key = key-1
|
||||
key = key+1
|
||||
key_len = len(data)
|
||||
key_len = len(self.data)
|
||||
|
||||
# Delete unwanted table columns
|
||||
key = 0
|
||||
key_len = len(data[0])
|
||||
key_len = len(self.data[0])
|
||||
while key < key_len:
|
||||
flag = False
|
||||
if data[0][key].lower().find('rank') >=0: flag = True
|
||||
if data[0][key].lower().find('change') >=0: flag = True
|
||||
if data[0][key].lower().find('notes') >=0: flag = True
|
||||
if data[0][key].lower().find('gap') >=0: flag = True
|
||||
if data[0][key].lower().find('Δ') >=0: flag = True
|
||||
if data[0][key].lower().find('growth') >=0: flag = True
|
||||
if self.data[0][key].lower().find('rank') >=0: flag = True
|
||||
if self.data[0][key].lower().find('change') >=0: flag = True
|
||||
if self.data[0][key].lower().find('notes') >=0: flag = True
|
||||
if self.data[0][key].lower().find('gap') >=0: flag = True
|
||||
if self.data[0][key].lower().find('Δ') >=0: flag = True
|
||||
if self.data[0][key].lower().find('growth') >=0: flag = True
|
||||
if flag:
|
||||
for i in range(0, len(data)):
|
||||
data[i].pop(key)
|
||||
for i in range(0, len(self.data)):
|
||||
self.data[i].pop(key)
|
||||
key = key-1
|
||||
key = key+1
|
||||
key_len = len(data[0])
|
||||
key_len = len(self.data[0])
|
||||
|
|
|
@ -22,15 +22,15 @@ def getpage(url):
|
|||
return soup
|
||||
|
||||
|
||||
def scrape(url, meta, data):
|
||||
def scrape(self, url):
|
||||
#--------[ Get Page From URL ]--------#
|
||||
soup = getpage(url)
|
||||
|
||||
|
||||
#--------[ Get Metadata ]--------#
|
||||
meta['name'] = soup.find('span', class_='mw-page-title-main').text
|
||||
meta['description'] = re.sub('\[.*?\]', '', soup.select('p')[0].getText().strip()).replace('\n',' ')
|
||||
meta['sources'].append(url)
|
||||
self.meta['name'] = soup.find('span', class_='mw-page-title-main').text
|
||||
self.meta['description'] = re.sub('\[.*?\]', '', soup.select('p')[0].getText().strip()).replace('\n',' ')
|
||||
self.meta['sources'].append(url)
|
||||
|
||||
|
||||
|
||||
|
@ -41,39 +41,39 @@ def scrape(url, meta, data):
|
|||
for tr in table.find_all('tr'):
|
||||
row = [ th.text.strip() for th in tr.find_all('th')]
|
||||
if len(row) > 1:
|
||||
data.append( row )
|
||||
self.data.append( row )
|
||||
|
||||
# Get Table Data
|
||||
for tr in table.find_all('tr'):
|
||||
row = [ td.text.strip() for td in tr.find_all('td')]
|
||||
if len(row) > 1:
|
||||
data.append( row )
|
||||
self.data.append( row )
|
||||
|
||||
#--------[ Process Table ]--------
|
||||
|
||||
# Delete rows with incorrect number of variables
|
||||
key = 0
|
||||
key_len = len(data)
|
||||
key_len = len(self.data)
|
||||
while key < key_len:
|
||||
if len(data[key]) != len(data[0]):
|
||||
data.pop(key)
|
||||
if len(self.data[key]) != len(self.data[0]):
|
||||
self.data.pop(key)
|
||||
key = key-1
|
||||
key = key+1
|
||||
key_len = len(data)
|
||||
key_len = len(self.data)
|
||||
|
||||
# Delete unwanted table columns
|
||||
key = 0
|
||||
key_len = len(data[0])
|
||||
key_len = len(self.data[0])
|
||||
while key < key_len:
|
||||
flag = False
|
||||
if data[0][key].lower().find('rank') >=0: flag = True
|
||||
if data[0][key].lower().find('change') >=0: flag = True
|
||||
if data[0][key].lower().find('notes') >=0: flag = True
|
||||
if data[0][key].lower().find('gap') >=0: flag = True
|
||||
if data[0][key].lower().find('Δ') >=0: flag = True
|
||||
if self.data[0][key].lower().find('rank') >=0: flag = True
|
||||
if self.data[0][key].lower().find('change') >=0: flag = True
|
||||
if self.data[0][key].lower().find('notes') >=0: flag = True
|
||||
if self.data[0][key].lower().find('gap') >=0: flag = True
|
||||
if self.data[0][key].lower().find('Δ') >=0: flag = True
|
||||
if flag:
|
||||
for i in range(0, len(data)):
|
||||
data[i].pop(key)
|
||||
for i in range(0, len(self.data)):
|
||||
self.data[i].pop(key)
|
||||
key = key-1
|
||||
key = key+1
|
||||
key_len = len(data[0])
|
||||
key_len = len(self.data[0])
|
||||
|
|
|
@ -6,6 +6,7 @@ import masterscraper as ms
|
|||
|
||||
|
||||
scrape = ms.scrape('https://www.macrotrends.net/countries/CHN/china/net-migration')
|
||||
#scrape = ms.scrape('https://www.macrotrends.net/countries/CHN/china/electricity-access-statistics')
|
||||
scrape.get_meta()
|
||||
scrape.clean()
|
||||
scrape.save()
|
||||
|
|
Loading…
Reference in a new issue