Broke program apart into smaller files

This commit is contained in:
colttaine 2023-03-08 14:50:40 +11:00
parent 2cf406bdaa
commit 8cd30b40da
18 changed files with 786 additions and 720 deletions

View file

@ -1,44 +1,26 @@
#!/usr/bin/python3
import pandas as pd
import requests
import re
import os
from bs4 import BeautifulSoup
from datetime import date
from . import wikipedia
from . import macrotrends
# Check If String Is Number
def isfloat(num):
    """Return True when ``float(num)`` succeeds, False otherwise.

    Only the exceptions ``float()`` raises for unusable input are treated as
    "not a number"; anything else (e.g. KeyboardInterrupt) propagates instead
    of being swallowed by a bare ``except``.
    """
    try:
        float(num)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
# Load URL Scrape List
def scrapelist(filename):
    """Return the lines of ``filename`` with surrounding whitespace stripped.

    Blank lines are kept (as empty strings). The file is opened in a ``with``
    block so the handle is closed even if reading fails.
    """
    with open(filename, 'r') as handle:
        return [line.strip() for line in handle]
class scrape:
#--------[ Global Variables ]--------#
meta = {} # Metadata
data = [] # Actual Data
data_info = [] # Variable information to split data table to seperate JSON files
#--------[ Import Module Parts ]--------#
from . import wikipedia
from . import macrotrends
from .core.meta_name import meta_name
from .core.meta_search import meta_search
from .core.meta_keys import meta_keys
from .core.meta_year import meta_year
from .core.meta_units import meta_units
from .core.meta_multiplyer import meta_multiplyer
from .core.meta_scope import meta_scope
from .core.meta_category import meta_category
from .core.meta_type import meta_type
from .core.meta_tags import meta_tags
from .core.get_list import get_list
from .core.show import show
from .core.clean import clean
from .core.save import save
#--------[ Scrape Constructor Object ]--------#
@ -47,28 +29,25 @@ class scrape:
print('\n[{0}]'.format(url))
self.meta = {
"name" : None,
"description" : None,
"units" : None,
"year" : None,
"notes" : [],
"id" : None,
"type" : None,
"scope" : None,
"category" : None,
"subcategory" : None,
"tags" : [],
"authors" : [],
"sources" : []
"name" : None, # Variable/Set name
"description" : None, # Description of variable/set
"units" : None, # Units of variable
"year" : None, # Year(s) of variable
"notes" : [], # Any notes related to the variable/set
"id" : None, # Official ID of applicable
"type" : None, # Type of variable/set
"scope" : None, # Scope of the variable/set
"category" : None, # Main category of the variable/set
"subcategory" : None, # Subcategory of the variable/set
"tags" : [], # Search tags applicable to the variable/set
"authors" : [], # Person or organisation responsible for the data
"sources" : [] # URL Sources for the data
}
self.data = []
self.data_info = []
if url.find('wikipedia.org') >=0:
wikipedia.scrape(url, self.meta, self.data)
if url.find('macrotrends.net') >= 0:
macrotrends.scrape(url, self.meta, self.data)
self.data = [] # The actual data set
self.info = {} # Temoporary metadata extracted from the data set
if url.find('wikipedia.org') >=0: self.wikipedia.scrape(self, url )
if url.find('macrotrends.net') >=0: self.macrotrends.scrape(self, url )
#--------[ Scrape Deconstructor ]--------#
@ -76,623 +55,17 @@ class scrape:
pass
#--------[ Show Scrape Data ]--------#
def show(self):
    """Print the metadata dict followed by every data row, one per line."""
    printable = [self.meta]
    printable.extend(self.data)
    print('\n'.join(str(item) for item in printable))
#--------[ Get Metadata ]--------#
def get_meta(self):
# Derive metadata for the scraped table held in self.data (row 0 = header).
# Results are appended to self.data_info in this order:
#   [0] search strings, [1] key names, [2] units, [3] multipliers,
#   [4] years, [5] scopes
# and self.meta 'name', 'type', 'category', 'subcategory' and 'tags' are set.
# Returns 1 when there are no data rows; otherwise falls off the end.
# Break if scrape contains no data
if len(self.data) <= 1: return(1)
# Process Name
# NOTE(review): the regex patterns below are non-raw strings containing
# escapes such as '\ ' and '\[', which Python reports as invalid escape
# sequences; raw strings (r'...') avoid the warning.
self.meta['name'] = self.meta['name'].lower()
self.meta['name'] = re.sub('and\ dependencies ','',self.meta['name'])
self.meta['name'] = re.sub('list\ of\ ','',self.meta['name'])
self.meta['name'] = re.sub(',','',self.meta['name'])
self.meta['name'] = self.meta['name'].strip()
self.meta['name'] = self.meta['name'].title()
# title() turns the acronyms into 'Gdp'/'Gni'/'Gnp'; restore upper case
self.meta['name'] = self.meta['name'].replace('Gdp', 'GDP')
self.meta['name'] = self.meta['name'].replace('Gni', 'GNI')
self.meta['name'] = self.meta['name'].replace('Gnp', 'GNP')
# Get Key Names Search Spaces
# One lower-cased string per column: set name + header + first data row value
#self.data_info.append( [key for key in self.data[0]])
key_search = []
for i in range(0, len(self.data[0])):
key_search.append(
self.meta['name'].lower() + ' ' +
self.data[0][i].lower() + ' ' +
self.data[1][i].lower()
)
self.data_info.append( key_search )
# Process Variable Key Names
key_name = []
for key in self.data[0]:
if(key.lower().find('country') >=0 or
key.lower().find('countries') >=0 or
key.lower().find('dependency') >=0 ):
key_name.append('country.name')
elif(key.lower().find('year') >=0):
key_name.append('year')
elif(key.lower().find('date') >=0):
key_name.append('date')
else:
# Strip notes, units and filler words from the header, then dot-join it
tmp_key = key
tmp_key = tmp_key.lower()
tmp_key = re.sub(',', '', tmp_key)
tmp_key = re.sub('\[.*\]', '', tmp_key)
tmp_key = re.sub('\(.*\)', '', tmp_key)
tmp_key = re.sub('km2', '', tmp_key)
tmp_key = re.sub('km', '', tmp_key)
tmp_key = re.sub('mi2', '', tmp_key)
tmp_key = re.sub('hectares', '', tmp_key)
tmp_key = re.sub('\ in\ ', '', tmp_key)
# NOTE(review): tmp_key was lower-cased above, so the upper-case 'US'
# pattern on the next line can never match.
tmp_key = re.sub('US\ \$', '', tmp_key)
tmp_key = re.sub('\$', 'dollars', tmp_key)
tmp_key = re.sub('\%', 'percent', tmp_key)
tmp_key = re.sub('and\ dependencies ', '', tmp_key)
tmp_key = re.sub('list\ of\ countries\ by\ ', '', tmp_key)
tmp_key = re.sub('thousands\ of', '' ,tmp_key)
tmp_key = re.sub('millions\ of', '' ,tmp_key)
tmp_key = re.sub('billions\ of', '' ,tmp_key)
tmp_key = re.sub('per\ 100k\ live\ births', '', tmp_key)
tmp_key = re.sub('per\ 100k\ population', '', tmp_key)
tmp_key = tmp_key.strip()
tmp_key = tmp_key.replace(' ','.')
# Prefix the key with the dotted set name unless it already contains it
if tmp_key.find(self.meta['name'].lower().replace(' ','.')) <0:
if tmp_key != '':
tmp_key = self.meta['name'].lower().replace(' ','.') + '.' + tmp_key
else:
tmp_key = self.meta['name'].lower().replace(' ','.')
key_name.append( tmp_key )
self.data_info.append( key_name )
# Process Variable Unit Type
# First matching rule wins; the tests run on the search strings (data_info[0])
key_unit = []
for key in self.data_info[0]:
if( key.find('percent') >=0 or
key.find('perc') >=0 or
key.find('%') >=0 ):
key_unit.append('%')
elif( key.find('dollar') >=0 or
key.find('$') >=0 ):
key_unit.append('$')
# NOTE(review): as written, key.find('') is always 0, so this branch matches
# every remaining key and all branches below it are unreachable. Presumably a
# euro sign was lost from these two literals -- TODO confirm.
elif( key.find('euro') >=0 or
key.find('') >=0 ):
key_unit.append('')
elif( key.find('area') >=0 or
key.find('land') >=0 or
key.find('km2') >=0 or
key.find('km²') >=0 or
key.find('mi2') >=0 or
key.find('mi²') >=0 or
key.find('ha') >=0 or
key.find('hectares') >=0 ):
key_unit.append('km²')
elif( key.find('country') >=0 or
key.find('countries') >=0 or
key.find('dependencies') >=0 ):
key_unit.append('countries')
elif( key.find('index') >=0 or
key.find('score') >=0 or
key.find('report') >=0 ):
key_unit.append('index')
elif( key.find('population') >=0 and
key.find('density') <0 and
key.find('access') <0 and
key.find('crime') <0 and
key.find('murder') <0 ):
key_unit.append('people')
elif( key.find('population') >=0 and
key.find('density') >=0 ):
key_unit.append('people/km²')
elif( (key.find('death') >=0 or
key.find('mortality') >=0) and
key.find('rate') >=0 and
key.find('infant') <0 and
key.find('maternal') <0 ):
key_unit.append('deaths/1k population')
elif( key.find('mortality') >=0 and
key.find('rate') >=0 and
key.find('infant') >=0 ):
key_unit.append('deaths/1k live births')
elif( key.find('mortality') >=0 and
key.find('rate') >=0 and
key.find('maternal') >=0 ):
key_unit.append('deaths/100k live births')
elif( key.find('suicide') >=0 and
key.find('rate') >=0 ):
key_unit.append('deaths/100k population')
elif( key.find('life') >=0 and
key.find('expectancy') >=0 ):
key_unit.append('years')
elif( key.find('birth') >=0 and
key.find('rate') >=0 ):
key_unit.append('births/1k population')
elif( key.find('fertility') >=0 and
key.find('rate') >=0 ):
key_unit.append('children/women')
elif( key.find('marriage') >=0 and
key.find('rate') >=0 ):
key_unit.append('marriages/1k population')
elif( key.find('divorce') >=0 and
key.find('rate') >=0 ):
key_unit.append('divorces/1k population')
elif( key.find('crime') >=0 and
key.find('rate') >=0 ):
key_unit.append('crimes/100k population')
elif( key.find('murder') >=0 and
key.find('rate') >=0 ):
key_unit.append('murders/100k population')
elif( key.find('military') >=0 and
key.find('size') >=0 ):
key_unit.append('personel')
elif( key.find('immigration') >=0 or
key.find('migration') >=0 or
key.find('refugee') >=0 ):
key_unit.append('people')
elif( key.find('emissions') >=0 ):
key_unit.append('tonnes')
else:
key_unit.append('unkown')
self.data_info.append( key_unit )
# Process Variable Multiplyer
# Factor applied to numeric cells by clean() via self.data_info[3]
key_multiplyer = []
for key in self.data_info[0]:
if( key.find('%') >=0 or key.find('percent') >=0 ):
key_multiplyer.append( 0.01 )
elif( re.search('\$.*k', key) ): key_multiplyer.append(1000)
elif( re.search('\$.*m', key) ): key_multiplyer.append(1000000)
elif( re.search('\$.*b', key) ): key_multiplyer.append(1000000000)
elif( key.find('thousands of') >=0 ):
key_multiplyer.append(1000)
elif( key.find('millions of') >=0 ):
key_multiplyer.append(1000000)
# NOTE(review): 'bilions of' is misspelled, so text containing
# 'billions of' does not reach the 1000000000 multiplier below.
elif( key.find('bilions of') >=0 ):
key_multiplyer.append(1000000000)
elif( key.find('mi2') >=0 or key.find('mi²') >=0 ):
key_multiplyer.append(2.59)
elif( key.find('hectare') >=0 ):
key_multiplyer.append(0.01)
else:
key_multiplyer.append( 1.0 )
self.data_info.append( key_multiplyer )
# Get Variable Year
# Per column: a header starting with four digits is used as the year;
# otherwise the span between the first and last row of a 'year'/'date'
# column; otherwise the current year.
key_year = []
for key in self.data[0]:
if re.match('\d\d\d\d', key):
key_year.append( key )
elif 'year' in self.data_info[1]:
y1 = self.data[1][self.data_info[1].index('year')]
y2 = self.data[-1][self.data_info[1].index('year')]
if y1 <= y2: key_year.append( '{0}-{1}'.format(y1,y2) )
if y1 > y2: key_year.append( '{0}-{1}'.format(y2,y1) )
elif 'date' in self.data_info[1]:
y1 = self.data[1][self.data_info[1].index('date')].split('-')[0]
y2 = self.data[-1][self.data_info[1].index('date')].split('-')[0]
if y1 <= y2: key_year.append( '{0}-{1}'.format(y1,y2) )
if y1 > y2: key_year.append( '{0}-{1}'.format(y2,y1) )
else:
key_year.append( date.today().strftime('%Y') )
self.data_info.append( key_year )
# Get Variable Type
# The last matching key name decides the set type
for key in self.data_info[1]:
if key == 'country.name': self.meta['type'] = 'global'
elif key == 'year': self.meta['type'] = 'historical'
elif key == 'date': self.meta['type'] = 'historical'
elif key == 'us.county.fips': self.meta['type'] = 'regional'
elif key == 'uk.constituency.name': self.meta['type'] = 'regional'
if self.meta['type'] == None: self.meta['type'] = 'unkown'
# Get Variable Scope
# NOTE(review): 'male' is tested before 'female'; because 'female' contains
# 'male', the 'female' branch below can never be taken.
key_scope = []
for key in self.data_info[1]:
if key.find('male') >=0: key_scope.append( 'male' )
elif key.find('female') >=0: key_scope.append( 'female' )
elif key.find('black') >=0: key_scope.append( 'black' )
elif key.find('white') >=0: key_scope.append( 'white' )
elif key.find('asian') >=0: key_scope.append( 'asian' )
elif key.find('native') >=0: key_scope.append( 'native' )
elif key.find('urban') >=0: key_scope.append( 'urban' )
elif key.find('rural') >=0: key_scope.append( 'rural' )
else: key_scope.append( self.meta['type'] )
self.data_info.append( key_scope )
# Get Variable Category
# All search strings are concatenated (str.join uses the set name as the
# separator) and the first matching rule below sets category/subcategory.
search = self.meta['name'].join(self.data_info[0]).lower().strip()
#--------[ Geographic ]--------#
if( search.find('area') >=0 or
search.find('km2') >=0 ):
self.meta['category'] = 'geographic'
self.meta['subcategory'] = 'area'
elif( (search.find('arable') >=0 or
search.find('farm') >=0 or
search.find('forrested') >=0) and
search.find('land') >=0 ):
self.meta['category'] = 'geographic'
self.meta['subcategory'] = 'land'
#--------[ Demographic ]--------
elif( search.find('population') >=0 and
search.find('access') <0 and
search.find('murder') <0 and
search.find('crime') <0 and
search.find('hunger') <0 and
search.find('migrat') <0 and
search.find('migrant') <0 ):
self.meta['category'] = 'demogrpahic'
self.meta['subcategory'] = 'population'
elif( (search.find('birth') >=0 or
search.find('fertility') >=0) and
search.find('mortality') <0 ):
self.meta['category'] = 'demogrpahic'
self.meta['subcategory'] = 'fertility'
elif( search.find('immigrat') >=0 or
search.find('migrat') >=0 or
search.find('migrant') >=0 or
search.find('refugee') >=0 or
search.find('asylum') >=0 ):
self.meta['category'] = 'demogrpahic'
self.meta['subcategory'] = 'migration'
#--------[ Health ]--------#
elif( search.find('life expectancy') >=0 or
search.find('death') >=0 or
search.find('suicide') >=0 or
search.find('mortality') >=0 ):
self.meta['category'] = 'health'
self.meta['subcategory'] = 'mortality'
elif( search.find('depression') >=0 or
search.find('anxiety') >=0 ):
self.meta['category'] = 'health'
self.meta['subcategory'] = 'psychology'
elif( search.find('smoking') >= 0 or
search.find('alcohol') >=0 ):
self.meta['category'] = 'health'
self.meta['subcategory'] = 'drugs'
#--------[ Economic ]--------#
elif( search.find('gdp') >=0 and
search.find('trade') <0 and
search.find('import') <0 and
search.find('export') <0 and
search.find('invest') <0 and
search.find('spending') <0 and
search.find('manufactur') <0 and
search.find('military') <0 and
search.find('education') <0 and
search.find('health') <0 ):
self.meta['category'] = 'economic'
self.meta['subcategory'] = 'gdp'
elif( search.find('gni') >=0 or
search.find('gnp') >=0 ):
self.meta['category'] = 'economic'
self.meta['subcategory'] = 'gni'
elif( search.find('debt') >=0 ):
self.meta['category'] = 'economic'
self.meta['subcategory'] = 'debt'
elif( search.find('inflation') >=0 ):
self.meta['category'] = 'economic'
self.meta['subcategory'] = 'inflation'
elif( search.find('health') >=0 and
search.find('spend') >=0 ):
self.meta['category'] = 'economic'
self.meta['subcategory'] = 'welfare'
elif( search.find('manufactur') >=0 or
search.find('business') >=0 or
search.find('tourism') >=0 ):
self.meta['category'] = 'economic'
self.meta['subcategory'] = 'business'
elif( search.find('import') >=0 or
search.find('export') >=0 or
search.find('invest') >=0 or
search.find('tariff') >=0 or
search.find('trade') >=0 ):
self.meta['category'] = 'economic'
self.meta['subcategory'] = 'trade'
elif( search.find('unemployment') >=0 or
search.find('labor') >=0 ):
self.meta['category'] = 'economic'
self.meta['subcategory'] = 'labor-force'
#--------[ Development ]--------#
elif( search.find('education') >=0 or
search.find('literacy') >=0 ):
self.meta['category'] = 'development'
self.meta['subcategory'] = 'education'
elif( search.find('electricity access') >=0 or
search.find('water access') >=0 ):
self.meta['category'] = 'development'
self.meta['subcategory'] = 'infrastructure'
elif( search.find('development') >=0 or
search.find('competitive') >=0 ):
self.meta['category'] = 'development'
self.meta['subcategory'] = 'technology'
elif( search.find('hunger') >=0 or
search.find('poverty') >=0 ):
self.meta['category'] = 'development'
self.meta['subcategory'] = 'quality-of-life'
elif( search.find('co2') >=0 or
search.find('ghg') >=0 or
search.find('emissions') >=0 ):
self.meta['category'] = 'development'
self.meta['subcategory'] = 'emissions'
elif( search.find('fuel') >=0 or
search.find('coal') >=0 or
search.find('energy') >=0 or
search.find('renewable') >=0 ):
self.meta['category'] = 'development'
self.meta['subcategory'] = 'energy'
#--------[ Crime ]--------#
# The crime, military and uncategorised branches leave 'subcategory' untouched
elif( search.find('crime') >=0 or
search.find('homocide') >=0 or
search.find('murder') >=0 ):
self.meta['category'] = 'crime'
#--------[ Military ]--------#
elif( search.find('military') >=0 ):
self.meta['category'] = 'military'
#--------[ Uncategorised ]--------#
else:
self.meta['category'] = 'uncategorised'
# Get Tags
# Tags collect the type, category, subcategory and each column scope once
if not self.meta['type'] in self.meta['tags']: self.meta['tags'].append(self.meta['type'])
if not self.meta['category'] in self.meta['tags']: self.meta['tags'].append(self.meta['category'])
if not self.meta['subcategory'] in self.meta['tags']: self.meta['tags'].append(self.meta['subcategory'])
for scope in key_scope:
if not scope in self.meta['tags']:
self.meta['tags'].append(scope)
if scope == 'female' or scope == 'male':
self.meta['tags'].append('gender')
if scope == 'black' or scope == 'white' or scope == 'asian' or scope == 'native':
self.meta['tags'].append('race')
# NOTE(review): only the string 'None' is removed here; a None object
# appended above (e.g. an unset subcategory) stays in the tag list.
if 'None' in self.meta['tags']:
self.meta['tags'].pop( self.meta['tags'].index('None') )
#--------[ Clean Scrape Data ]--------#
def clean(self):
    """Normalise every data cell of ``self.data`` in place (header row excluded).

    String cells lose bracketed/parenthesised notes and thousands separators.
    A cell that contains a digit is reduced to digits, ``.`` and ``-``; if that
    parses as a number it is multiplied by the column multiplier in
    ``self.data_info[3]`` and stored as ``int`` when whole, else ``float``.
    Remaining strings that mark a missing value become ``None``.
    Non-string cells are left untouched.

    Returns 1 when there are no data rows, otherwise ``None``.
    """
    # Break if scrape contains no data
    if len(self.data) <= 1:
        return 1

    # 'unkown' is kept alongside the correct spelling for backward compatibility
    null_markers = ('not determined', 'negligible', 'unknown', 'unkown')

    for row in self.data[1:]:
        for col, cell in enumerate(row):
            if not isinstance(cell, str):
                continue

            # Remove any inline notes from data
            cell = re.sub(r'\[.*\]', '', cell)
            cell = re.sub(r'\(.*\)', '', cell)
            cell = cell.replace(',', '').strip()

            # Convert numerical strings to numbers
            if any(ch.isdigit() for ch in cell):
                cell = ''.join(ch for ch in cell if ch.isdigit() or ch in '.-')
                try:
                    number = float(cell)
                except ValueError:
                    pass  # e.g. '1-2': keep the filtered string
                else:
                    # Apply Variable Multiplyer
                    number = number * self.data_info[3][col]
                    # Convert Whole Floats To Integers
                    row[col] = int(number) if number.is_integer() else number
                    continue

            # Convert non-entries to null
            lowered = cell.lower()
            if cell in ('', '-') or any(marker in lowered for marker in null_markers):
                cell = None
            row[col] = cell
#--------[ Save Scrape Data ]--------#
def save(self):
    """Write each data column to its own JSON file below ``data/``.

    One column is the "main" column: the last column whose derived key
    (``self.data_info[1]``) is ``'country.name'`` or ``'year'``, else column 0.
    Every other column is paired with it and written to
    ``data/<type>[/<scope>]/<category>[/<subcategory>][/<name>]/<key>.json``
    as an object holding ``"metadata"`` (a dump of ``self.meta``) and
    ``"data"`` (a header pair followed by one pair per row).

    ``self.meta['units']`` and ``['year']`` are updated per column;
    ``['scope']`` is filled from the first written column when unset.

    Returns 1 when there are no data rows, otherwise ``None``.
    """
    import json  # local import: used only for safe string quoting here

    # Break if scrape contains no data
    if len(self.data) <= 1:
        return 1

    def as_json(value):
        # Escapes quotes/backslashes/control chars; keeps non-ASCII readable
        return json.dumps(value, ensure_ascii=False)

    def as_cell(value):
        # Strings and None go through JSON; numbers keep their plain format
        if value is None or isinstance(value, str):
            return as_json(value)
        return '{0}'.format(value)

    keys = self.data_info[1]

    # The original compared raw headers with a chained `== ... >= 0`, which
    # either raised TypeError or never matched; compare the derived keys.
    key_main = 0
    for i, key in enumerate(keys):
        if key == 'country.name' or key == 'year':
            key_main = i

    for key_data in range(len(self.data[0])):
        if key_data == key_main:
            continue

        #--------[ Update Metadata ]--------#
        # Done before the path is built because the path may need the scope
        self.meta['units'] = self.data_info[2][key_data]
        self.meta['year'] = self.data_info[4][key_data]
        if self.meta['scope'] is None:
            self.meta['scope'] = self.data_info[5][key_data]

        #--------[ Generate Filename ]--------#
        filename = keys[key_data].replace('.', '-')
        filepath = 'data/{0}'.format(self.meta['type'])
        if self.meta['type'] == 'historical':
            filepath += '/' + self.meta['scope'].lower().replace(' ', '-')
        filepath += '/{0}'.format(self.meta['category'])
        if self.meta['subcategory'] is not None:
            filepath += '/' + self.meta['subcategory']
        if len(self.data[0]) > 4:
            filepath += '/' + self.meta['name'].lower().replace(' ', '-')
        fullpath = filepath + '/' + filename + '.json'

        #--------[ Check File Directory ]--------#
        os.makedirs(filepath, exist_ok=True)

        #--------[ Render Metadata ]--------#
        meta_lines = []
        for field, value in self.meta.items():
            if isinstance(value, list):
                if not value:
                    text = '[]'
                elif field == 'tags':
                    text = '[' + ','.join(as_json(v) for v in value) + ']'
                else:
                    text = '[\n' + ',\n'.join(' ' + as_json(v) for v in value) + '\n ]'
            else:
                text = as_json(value)
            meta_lines.append(' "{0}" : {1}'.format(field, text))

        #--------[ Render Actual Data ]--------#
        column_label = keys[key_data]
        if self.meta['type'] == 'historical' and self.meta['id'] is not None:
            column_label = self.meta['id'] + '.' + column_label
        data_lines = [
            ' [{0},{1}]'.format(as_cell(row[key_main]), as_cell(row[key_data]))
            for row in self.data[1:]
        ]

        #--------[ Write File ]--------#
        # Separators are placed by position, so duplicate rows or tags no
        # longer produce a missing comma.
        with open(fullpath, 'w', encoding='utf-8') as f:
            f.write('{\n')
            f.write(' "metadata" : {\n')
            f.write(',\n'.join(meta_lines) + '\n')
            f.write(' },\n')
            f.write(' "data" : [\n')
            f.write(' [{0},{1}],\n'.format(as_json(keys[key_main]), as_json(column_label)))
            f.write(',\n'.join(data_lines) + '\n')
            f.write(' ]\n')
            f.write('}\n')

        print(' [{0} data points] -> {1}'.format(len(self.data)-1, fullpath))
if len(self.data) <= 1: return(-1) # Break if no data
self.meta_name() # Clean set name
self.meta_search() # Create search-space
self.meta_keys() # Extract variable key-name
self.meta_year() # Extract variable year
self.meta_units() # Extract variable unit
self.meta_multiplyer() # Extract variable multiplyer
self.meta_scope() # Extract variable scope
self.meta_category() # Extract set category
self.meta_type() # Extract set type
self.meta_tags() # Extract set tag

View file

@ -0,0 +1,56 @@
#!/usr/bin/env python3
import re
# Check If String Is Number
def isfloat(num):
    """Return True when ``float(num)`` succeeds, False otherwise.

    Only the exceptions ``float()`` raises for unusable input are treated as
    "not a number"; anything else (e.g. KeyboardInterrupt) propagates instead
    of being swallowed by a bare ``except``.
    """
    try:
        float(num)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
#--------[ Clean Scrape Data ]--------#
def clean(self):
    """Normalise every data cell of ``self.data`` in place (header row excluded).

    String cells lose bracketed/parenthesised notes and thousands separators.
    A cell that contains a digit is reduced to digits, ``.`` and ``-``; if that
    parses as a number it is multiplied by the column multiplier in
    ``self.info['multiplyer']`` and stored as ``int`` when whole, else
    ``float``. Remaining strings that mark a missing value become ``None``.
    Non-string cells are left untouched.

    Returns -1 when there are no data rows, otherwise ``None``.
    """
    if len(self.data) <= 1:
        return -1  # Break if no data

    # 'unkown' is kept alongside the correct spelling for backward compatibility
    null_markers = ('not determined', 'negligible', 'unknown', 'unkown')

    for row in self.data[1:]:
        for col, cell in enumerate(row):
            if not isinstance(cell, str):
                continue

            # Remove any inline notes from data
            cell = re.sub(r'\[.*\]', '', cell)
            cell = re.sub(r'\(.*\)', '', cell)
            cell = cell.replace(',', '').strip()

            # Convert numerical strings to numbers
            if any(ch.isdigit() for ch in cell):
                cell = ''.join(ch for ch in cell if ch.isdigit() or ch in '.-')
                try:
                    number = float(cell)
                except ValueError:
                    pass  # e.g. '1-2': keep the filtered string
                else:
                    # Apply Variable Multiplyer
                    number = number * self.info['multiplyer'][col]
                    # Convert Whole Floats To Integers
                    row[col] = int(number) if number.is_integer() else number
                    continue

            # Convert non-entries to null
            lowered = cell.lower()
            if cell in ('', '-') or any(marker in lowered for marker in null_markers):
                cell = None
            row[col] = cell

View file

@ -0,0 +1,10 @@
#!/usr/bin/env python3
#--------[ Load URL Scrape List ]--------#
def get_list(filename):
    """Return the lines of ``filename`` with surrounding whitespace stripped.

    Blank lines are kept (as empty strings). The file is opened in a ``with``
    block so the handle is closed even if reading fails.
    """
    with open(filename, 'r') as handle:
        return [line.strip() for line in handle]

View file

@ -0,0 +1,169 @@
#!/usr/bin/env python3
#--------[ Extract Category Information ]--------#
# Ordered classification rules; the first rule that matches wins.
# Each rule is (category, subcategory, any_of, all_of, none_of): it matches
# when at least one `any_of` term (if given), every `all_of` term and no
# `none_of` term occurs in the search text. A subcategory of None means the
# existing subcategory is left untouched.
# NOTE(review): 'demogrpahic' is a misspelling of 'demographic'; it is kept
# byte-for-byte because the value is stored in the set metadata -- confirm
# with consumers of that data before renaming.
_CATEGORY_RULES = (
    #--------[ Geographic ]--------#
    ('geographic', 'area', ('area', 'km2'), (), ()),
    ('geographic', 'land', ('arable', 'farm', 'forested', 'forrested'), ('land',), ()),
    #--------[ Demographic ]--------#
    ('demogrpahic', 'population', (), ('population',),
     ('access', 'murder', 'crime', 'hunger', 'migrat', 'migrant')),
    ('demogrpahic', 'fertility', ('birth', 'fertility'), (), ('mortality',)),
    ('demogrpahic', 'migration',
     ('immigrat', 'migrat', 'migrant', 'refugee', 'asylum'), (), ()),
    #--------[ Health ]--------#
    ('health', 'mortality',
     ('life expectancy', 'death', 'suicide', 'mortality'), (), ()),
    ('health', 'psychology', ('depression', 'anxiety'), (), ()),
    ('health', 'drugs', ('smoking', 'alcohol'), (), ()),
    #--------[ Economic ]--------#
    ('economic', 'gdp', (), ('gdp',),
     ('trade', 'import', 'export', 'invest', 'spending', 'manufactur',
      'military', 'education', 'health')),
    ('economic', 'gni', ('gni', 'gnp'), (), ()),
    ('economic', 'debt', (), ('debt',), ()),
    ('economic', 'inflation', (), ('inflation',), ()),
    ('economic', 'welfare', (), ('health', 'spend'), ()),
    ('economic', 'business', ('manufactur', 'business', 'tourism'), (), ()),
    ('economic', 'trade',
     ('import', 'export', 'invest', 'tariff', 'trade'), (), ()),
    ('economic', 'labor-force', ('unemployment', 'labor'), (), ()),
    #--------[ Development ]--------#
    ('development', 'education', ('education', 'literacy'), (), ()),
    ('development', 'infrastructure',
     ('electricity access', 'water access'), (), ()),
    ('development', 'technology', ('development', 'competitive'), (), ()),
    ('development', 'quality-of-life', ('hunger', 'poverty'), (), ()),
    ('development', 'emissions', ('co2', 'ghg', 'emissions'), (), ()),
    ('development', 'energy', ('fuel', 'coal', 'energy', 'renewable'), (), ()),
    #--------[ Crime ]--------#
    # 'homicide' added: the original only tested the misspelling 'homocide'
    ('crime', None, ('crime', 'homicide', 'homocide', 'murder'), (), ()),
    #--------[ Military ]--------#
    ('military', None, (), ('military',), ()),
)


#--------[ Extract Category Information ]--------#
def meta_category(self):
    """Set ``self.meta['category']`` (and usually ``'subcategory'``).

    The search text is every entry of ``self.info['search']`` joined with the
    set name as separator, lower-cased. ``_CATEGORY_RULES`` is scanned in
    order and the first matching rule is applied; when nothing matches the
    category becomes ``'uncategorised'``.
    """
    search = self.meta['name'].join(self.info['search']).lower().strip()

    for category, subcategory, any_of, all_of, none_of in _CATEGORY_RULES:
        if any_of and not any(term in search for term in any_of):
            continue
        if not all(term in search for term in all_of):
            continue
        if any(term in search for term in none_of):
            continue
        self.meta['category'] = category
        if subcategory is not None:
            self.meta['subcategory'] = subcategory
        return

    #--------[ Uncategorised ]--------#
    self.meta['category'] = 'uncategorised'

View file

@ -0,0 +1,58 @@
#!/usr/bin/env python3
import re
#--------[ Process Variable Key Names ]--------
def meta_keys(self):
    """Derive a dotted key name for every column header in ``self.data[0]``.

    Headers mentioning country/countries/dependency, year or date map to the
    fixed keys ``'country.name'``, ``'year'`` and ``'date'``. Any other header
    is lower-cased, stripped of notes, unit markers and filler phrases, its
    words are joined with ``.`` and it is prefixed with the dotted set name
    unless it already contains it. The result is stored in
    ``self.info['keys']``.
    """
    # Literal fragments removed from a header, in this order
    noise_before_currency = ('km2', 'km', 'mi2', 'hectares', ' in ')
    noise_after_currency = (
        'and dependencies ',
        'list of countries by ',
        'thousands of',
        'millions of',
        'billions of',
        'per 100k live births',
        'per 100k population',
    )

    key_name = []
    for heading in self.data[0]:
        lowered = heading.lower()
        if 'country' in lowered or 'countries' in lowered or 'dependency' in lowered:
            key_name.append('country.name')
            continue
        if 'year' in lowered:
            key_name.append('year')
            continue
        if 'date' in lowered:
            key_name.append('date')
            continue

        tmp_key = lowered.replace(',', '')
        tmp_key = re.sub(r'\[.*\]', '', tmp_key)
        tmp_key = re.sub(r'\(.*\)', '', tmp_key)
        for fragment in noise_before_currency:
            tmp_key = tmp_key.replace(fragment, '')
        # The header is already lower-cased, so match 'us $' (the original
        # upper-case 'US' pattern could never match).
        tmp_key = tmp_key.replace('us $', '')
        tmp_key = tmp_key.replace('$', 'dollars')
        tmp_key = tmp_key.replace('%', 'percent')
        for fragment in noise_after_currency:
            tmp_key = tmp_key.replace(fragment, '')

        # split()/join collapses the gaps left by removed fragments, so the
        # key never contains empty '..' segments.
        tmp_key = '.'.join(tmp_key.split())

        set_key = self.meta['name'].lower().replace(' ', '.')
        if set_key not in tmp_key:
            tmp_key = set_key + '.' + tmp_key if tmp_key != '' else set_key

        #--------[ Add Name To Info Array ]--------#
        key_name.append(tmp_key)

    self.info['keys'] = key_name

View file

@ -0,0 +1,36 @@
#!/usr/bin/env python3
import re
#--------[ Process Variable Multiplyer
def meta_multiplyer(self):
    """Pick the numeric multiplier for every column.

    Each entry of ``self.info['search']`` is tested against the rules below
    (first match wins) and the chosen factors are stored, in column order, in
    ``self.info['multiplyer']``. Columns matching no rule get ``1.0``.
    """
    key_multiplyer = []
    for key in self.info['search']:
        if '%' in key or 'percent' in key:
            factor = 0.01
        elif re.search(r'\$.*k', key):
            factor = 1000
        elif re.search(r'\$.*m', key):
            factor = 1000000
        elif re.search(r'\$.*b', key):
            factor = 1000000000
        elif 'thousands of' in key:
            factor = 1000
        elif 'millions of' in key:
            factor = 1000000
        elif 'billions of' in key:
            # was misspelled 'bilions of', so billions were never scaled
            factor = 1000000000
        elif 'mi2' in key or 'mi²' in key:
            factor = 2.59
        elif 'hectare' in key:
            factor = 0.01
        else:
            factor = 1.0
        key_multiplyer.append(factor)
    self.info['multiplyer'] = key_multiplyer

View file

@ -0,0 +1,22 @@
#!/usr/bin/env python3
import re
#--------[ Process Vaiable Set Names ]-------#-
def meta_name(self):
    """Normalise ``self.meta['name']`` into a title-cased set name.

    Filler phrases and commas are removed from the lower-cased name, which is
    then title-cased; the acronyms GDP, GNI and GNP are restored to upper case
    afterwards because ``str.title()`` turns them into 'Gdp', 'Gni', 'Gnp'.
    """
    name = self.meta['name'].lower()
    # The removed fragments are plain literals, so str.replace is enough and
    # avoids the invalid '\ ' escapes of the former regex patterns.
    for fragment in ('and dependencies ', 'list of ', ','):
        name = name.replace(fragment, '')
    name = name.strip().title()
    for titled, acronym in (('Gdp', 'GDP'), ('Gni', 'GNI'), ('Gnp', 'GNP')):
        name = name.replace(titled, acronym)
    self.meta['name'] = name

View file

@ -0,0 +1,17 @@
#!/usr/bin/env python3
#--------[ Get Variable Scope ]--------#
def meta_scope(self):
    """Store one scope label per column in ``self.info['scope']``.

    Each search string is checked for the labels below in order; the first
    one found becomes the column scope. 'female' is listed before 'male'
    because the former contains the latter. Columns without any label fall
    back to ``self.meta['type']``.
    """
    labels = ('female', 'male', 'black', 'white',
              'asian', 'native', 'urban', 'rural')
    scopes = []
    for text in self.info['search']:
        for label in labels:
            if label in text:
                scopes.append(label)
                break
        else:
            scopes.append(self.meta['type'])
    self.info['scope'] = scopes

View file

@ -0,0 +1,15 @@
#!/usr/bin/env python3
#--------[ Create Variable Search Space ]--------#
def meta_search(self):
    """Build the per-column search strings in ``self.info['search']``.

    Every entry is the lower-cased set name, column header and first data row
    value of that column, separated by single spaces.
    """
    self.info['search'] = [
        ' '.join((
            self.meta['name'].lower(),
            heading.lower(),
            self.data[1][col].lower(),
        ))
        for col, heading in enumerate(self.data[0])
    ]

View file

@ -0,0 +1,24 @@
#!/usr/bin/env python3
#--------[ Extract Tag Information ]--------#
def meta_tags(self):
    """Extend ``self.meta['tags']`` in place with the set's search tags.

    Adds, each at most once and in this order: the set type, category,
    subcategory, the lower-cased set scope, and every column scope from
    ``self.info['scope']``. A gender scope also adds ``'gender'`` and a race
    scope also adds ``'race'``. Missing values (``None`` or the string
    ``'None'``) are never kept as tags.
    """
    tags = self.meta['tags']

    def add(tag):
        # Skip missing values and anything already present
        if tag is not None and tag != 'None' and tag not in tags:
            tags.append(tag)

    add(self.meta['type'])
    add(self.meta['category'])
    add(self.meta['subcategory'])
    if self.meta['scope'] is not None:
        add(self.meta['scope'].lower())

    for scope in self.info['scope']:
        add(scope)
        if scope in ('female', 'male'):
            add('gender')
        elif scope in ('black', 'white', 'asian', 'native'):
            add('race')

    # Drop missing values that were already in the list before this call;
    # slice assignment keeps the same list object.
    tags[:] = [tag for tag in tags if tag is not None and tag != 'None']

View file

@ -0,0 +1,12 @@
#!/usr/bin/env python3
#--------[ Get Variable Type ]--------#
def meta_type(self):
    """Derive the dataset type from the column keys.

    Walks ``self.info['keys']`` in order and stores the type that belongs
    to each recognised key in ``self.meta['type']``; when several keys are
    recognised the last one wins.  If the type is still unset afterwards it
    becomes ``'unkown'`` (spelling kept, the value is used verbatim by
    other parts of the program).
    """
    rules = (
        (('country.name',), 'global'),
        (('year', 'date'), 'historical'),
        (('us.county.fips', 'uk.constituency.name'), 'regional'),
    )
    for key in self.info['keys']:
        for names, kind in rules:
            if key in names:
                self.meta['type'] = kind
                break
    if self.meta['type'] is None:
        self.meta['type'] = 'unkown'

View file

@ -0,0 +1,117 @@
#!/usr/bin/env python3
#--------[ Process Variable Unit Type ]--------#
def meta_units(self):
    """Guess a unit label for every column.

    Each string in ``self.info['search']`` is classified by keyword and the
    resulting labels are stored, in the same order, in
    ``self.info['units']``.  Strings that match no rule get ``'unkown'``.
    """
    self.info['units'] = [_guess_unit(key) for key in self.info['search']]


def _guess_unit(key):
    """Return the unit label for one search string.

    Rules are tried top to bottom and the first match wins, so the order
    below is significant (e.g. the area rule is tested before the
    population rules).
    """
    import re

    def any_of(*words):
        return any(word in key for word in words)

    def all_of(*words):
        return all(word in key for word in words)

    if any_of('percent', 'perc', '%'):
        return '%'
    if any_of('dollar', '$'):
        return '$'
    # The symbol has to be a non-empty literal: searching for '' succeeds
    # on every string and would route all remaining keys into this branch.
    if any_of('euro', '€'):
        return '€'
    # 'ha' (hectares) only counts as a standalone token; as a plain
    # substring it also hits words such as "china", "share" or "change".
    if (any_of('area', 'land', 'km2', 'km²', 'mi2', 'mi²', 'hectares')
            or re.search(r'(?<![a-z])ha(?![a-z])', key)):
        return 'km²'
    if any_of('country', 'countries', 'dependencies'):
        return 'countries'
    if any_of('index', 'score', 'report'):
        return 'index'
    if 'population' in key and not any_of('density', 'access', 'crime', 'murder'):
        return 'people'
    if all_of('population', 'density'):
        return 'people/km²'
    if (any_of('death', 'mortality') and 'rate' in key
            and not any_of('infant', 'maternal')):
        return 'deaths/1k population'

    # Remaining rules that require ALL listed words, in priority order.
    conjunctions = (
        (('mortality', 'rate', 'infant'), 'deaths/1k live births'),
        (('mortality', 'rate', 'maternal'), 'deaths/100k live births'),
        (('suicide', 'rate'), 'deaths/100k population'),
        (('life', 'expectancy'), 'years'),
        (('birth', 'rate'), 'births/1k population'),
        (('fertility', 'rate'), 'children/women'),
        (('marriage', 'rate'), 'marriages/1k population'),
        (('divorce', 'rate'), 'divorces/1k population'),
        (('crime', 'rate'), 'crimes/100k population'),
        (('murder', 'rate'), 'murders/100k population'),
        (('military', 'size'), 'personel'),
    )
    for words, unit in conjunctions:
        if all_of(*words):
            return unit

    if any_of('immigration', 'migration', 'refugee'):
        return 'people'
    if 'emissions' in key:
        return 'tonnes'
    return 'unkown'

View file

@ -0,0 +1,31 @@
#!/usr/bin/env python3
from datetime import date
import re
#--------[ Get Variable Year ]--------#
def meta_year(self):
    """Work out which year (or year range) each column describes.

    For every header in ``self.data[0]``:

    * a header that starts with four digits is used as-is (this keeps
      headers that are themselves ranges, e.g. ``'2015-2020'``);
    * otherwise, if the table has a ``year`` key, the range spanned by the
      first and last data rows of that column is used (``'low-high'``);
    * otherwise, if it has a ``date`` key, the same is done with the part
      of the date before the first ``'-'``;
    * otherwise the current year is used.

    The resulting list is stored in ``self.info['year']``.
    """
    fallback = None
    key_year = []
    for key in self.data[0]:
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        if re.match(r'\d\d\d\d', key):
            key_year.append(key)
            continue
        # The fallback does not depend on the column, so compute it once,
        # and only if some header actually needs it.
        if fallback is None:
            fallback = _year_fallback(self)
        key_year.append(fallback)
    self.info['year'] = key_year


def _year_fallback(self):
    """Return the year label for columns whose header carries no year."""
    keys = self.info['keys']
    if 'year' in keys:
        column = keys.index('year')
        first = self.data[1][column]
        last = self.data[-1][column]
    elif 'date' in keys:
        column = keys.index('date')
        first = self.data[1][column].split('-')[0]
        last = self.data[-1][column].split('-')[0]
    else:
        return date.today().strftime('%Y')
    # Rows may be sorted ascending or descending; always emit low-high.
    if first > last:
        first, last = last, first
    return '{0}-{1}'.format(first, last)

117
masterscraper/core/save.py Normal file
View file

@ -0,0 +1,117 @@
#!/usr/bin/env python3
import os
#--------[ Save Scrape Data ]--------#
def save(self):
    """Write every data column to its own JSON file.

    One column is chosen as the main (index) column; each other column is
    paired with it and written to
    ``data/<type>[/<scope>]/<category>[/<subcategory>][/<name>]/<key>.json``
    together with the metadata.  ``self.meta['units']``, ``['year']`` and
    (only while still unset) ``['scope']`` are updated per column before
    the metadata is written.

    Returns ``-1`` without writing anything when the table has no data
    rows, otherwise ``None``.

    The file content is assembled in memory and written in one go, so a
    failure while formatting does not leave a truncated file behind.
    Strings are escaped as JSON and list/row separators are produced by
    joining, so repeated values cannot cause missing commas.
    """
    if len(self.data) <= 1:
        return -1  # Nothing to save without data rows

    #--------[ Pick Main Column ]--------#
    # NOTE(review): this looks at the header row in self.data[0];
    # presumably an earlier step has normalised it to the key names --
    # confirm.  The last matching column wins.
    key_main = 0
    for column in range(len(self.info['keys'])):
        if self.data[0][column] in ('country.name', 'year'):
            key_main = column

    for key_data in range(len(self.data[0])):
        if key_data == key_main:
            continue

        #--------[ Generate Filename ]--------#
        filename = self.info['keys'][key_data].replace('.', '-')
        filepath = 'data/{0}'.format(self.meta['type'])
        if self.meta['type'] == 'historical':
            filepath += '/' + self.meta['scope'].lower().replace(' ', '-')
        filepath += '/{0}'.format(self.meta['category'])
        if self.meta['subcategory'] is not None:
            filepath += '/' + self.meta['subcategory']
        if len(self.data[0]) > 4:
            filepath += '/' + self.meta['name'].lower().replace(' ', '-')
        fullpath = filepath + '/' + filename + '.json'

        #--------[ Update Metadata ]--------#
        self.meta['units'] = self.info['units'][key_data]
        self.meta['year'] = self.info['year'][key_data]
        if self.meta['scope'] is None:
            self.meta['scope'] = self.info['scope'][key_data]

        #--------[ Build File Content ]--------#
        value_key = self.info['keys'][key_data]
        if self.meta['type'] == 'historical':
            value_key = self.meta['id'] + '.' + value_key

        parts = ['{\n', ' "metadata" : {\n']
        entries = [_meta_entry(name, value) for name, value in self.meta.items()]
        if entries:
            parts.append(',\n'.join(entries) + '\n')
        parts.append(' },\n')
        parts.append(' "data" : [\n')
        parts.append(' [{0},{1}],\n'.format(
            _json_text(self.info['keys'][key_main]),
            _json_text(value_key),
        ))
        rows = [
            ' [{0},{1}]'.format(_json_cell(row[key_main]), _json_cell(row[key_data]))
            for row in self.data[1:]
        ]
        parts.append(',\n'.join(rows) + '\n')
        parts.append(' ]\n')
        parts.append('}\n')

        #--------[ Write File ]--------#
        os.makedirs(filepath, exist_ok=True)
        # JSON is UTF-8 by specification; units such as 'km²' are non-ASCII.
        with open(fullpath, 'w', encoding='utf-8') as f:
            f.write(''.join(parts))

        #--------[ Final Result ]--------#
        print(' [{0} data points] -> {1}'.format(len(self.data) - 1, fullpath))


def _json_text(value):
    """Return ``value`` as a double-quoted, properly escaped JSON string."""
    import json
    return json.dumps(str(value), ensure_ascii=False)


def _json_cell(value):
    """Return one table cell as a JSON token (string, null or literal)."""
    import json
    if value is None:
        return 'null'
    if isinstance(value, (str, bool)):
        return json.dumps(value, ensure_ascii=False)
    return '{0}'.format(value)


def _meta_entry(name, value):
    """Return the ``"name" : value`` line(s) for one metadata field."""
    import json
    label = ' {0} : '.format(_json_text(name))
    if isinstance(value, str):
        return label + _json_text(value)
    if value is None:
        return label + 'null'
    if isinstance(value, list):
        if len(value) <= 0:
            return label + '[]'
        if name == 'tags':
            # Tags are kept on a single line.
            return label + '[' + ','.join(_json_text(item) for item in value) + ']'
        return (label + '[\n'
                + ',\n'.join(' ' + _json_text(item) for item in value)
                + '\n ]')
    # Any other type (numbers, dicts, ...) still has to yield a value,
    # otherwise the separator alone would corrupt the document.
    return label + json.dumps(value, ensure_ascii=False, default=str)

View file

@ -0,0 +1,9 @@
#!/usr/bin/env python3
#--------[ Show Scrape Data ]--------#
def show(self):
    """Print the metadata dict followed by every data row, one per line."""
    print(self.meta)
    rows = list(self.data)
    if rows:
        print(*rows, sep='\n')

View file

@ -42,33 +42,32 @@ def scrapelist():
print(url['href'])
#break
print('\nScraping {0} datasets from MacroTrends\n'.format( len(full_list) ))
# Progress message only: the word is "datasets", not an attribute access.
print('\nScraping {0} datasets from MacroTrends\n'.format( len(full_list) ))
return full_list
def scrape(url, meta, data):
#--------[ Get Page From URL ]--------#
def scrape(self, url):
#--------[ Get Page From URL ]--------#
soup = getpage(url)
#--------[ Get Metadata ]--------#
url_parts = url.split('/')
meta['name'] = url_parts[-1].replace('-',' ').title()
self.meta['name'] = url_parts[-1].replace('-',' ').title()
soup_desc = getpage( 'https://www.macrotrends.net/countries/ranking/' + url.split('/')[-1] )
meta['description'] = soup_desc.find('div',class_='navigation_tabs').find('span').text
self.meta['description'] = soup_desc.find('div',class_='navigation_tabs').find('span').text
meta['authors'].append( soup.find('span', string='Data Source: ').next_sibling.text )
self.meta['authors'].append( soup.find('span', string='Data Source: ').next_sibling.text )
meta['sources'].append( url )
self.meta['sources'].append( url )
meta['scope'] = url_parts[-2].replace('-',' ').title()
self.meta['scope'] = url_parts[-2].replace('-',' ').title()
meta['id'] = url_parts[-3].lower()
self.meta['id'] = url_parts[-3].lower()
@ -80,40 +79,40 @@ def scrape(url, meta, data):
for tr in table.find_all('tr'):
row = [ th.text.strip() for th in tr.find_all('th')]
if len(row) > 1:
data.append( row )
self.data.append( row )
# Get Table Data
for tr in table.find_all('tr'):
row = [ td.text.strip() for td in tr.find_all('td')]
if len(row) > 1:
data.append( row )
self.data.append( row )
#--------[ Process Table ]--------
# Delete rows with incorrect number of variables
key = 0
key_len = len(data)
key_len = len(self.data)
while key < key_len:
if len(data[key]) != len(data[0]):
data.pop(key)
if len(self.data[key]) != len(self.data[0]):
self.data.pop(key)
key = key-1
key = key+1
key_len = len(data)
key_len = len(self.data)
# Delete unwanted table columns
key = 0
key_len = len(data[0])
key_len = len(self.data[0])
while key < key_len:
flag = False
if data[0][key].lower().find('rank') >=0: flag = True
if data[0][key].lower().find('change') >=0: flag = True
if data[0][key].lower().find('notes') >=0: flag = True
if data[0][key].lower().find('gap') >=0: flag = True
if data[0][key].lower().find('Δ') >=0: flag = True
if data[0][key].lower().find('growth') >=0: flag = True
if self.data[0][key].lower().find('rank') >=0: flag = True
if self.data[0][key].lower().find('change') >=0: flag = True
if self.data[0][key].lower().find('notes') >=0: flag = True
if self.data[0][key].lower().find('gap') >=0: flag = True
if self.data[0][key].lower().find('Δ') >=0: flag = True
if self.data[0][key].lower().find('growth') >=0: flag = True
if flag:
for i in range(0, len(data)):
data[i].pop(key)
for i in range(0, len(self.data)):
self.data[i].pop(key)
key = key-1
key = key+1
key_len = len(data[0])
key_len = len(self.data[0])

View file

@ -22,15 +22,15 @@ def getpage(url):
return soup
def scrape(url, meta, data):
def scrape(self, url):
#--------[ Get Page From URL ]--------#
soup = getpage(url)
#--------[ Get Metadata ]--------#
meta['name'] = soup.find('span', class_='mw-page-title-main').text
meta['description'] = re.sub('\[.*?\]', '', soup.select('p')[0].getText().strip()).replace('\n',' ')
meta['sources'].append(url)
self.meta['name'] = soup.find('span', class_='mw-page-title-main').text
self.meta['description'] = re.sub('\[.*?\]', '', soup.select('p')[0].getText().strip()).replace('\n',' ')
self.meta['sources'].append(url)
@ -41,39 +41,39 @@ def scrape(url, meta, data):
for tr in table.find_all('tr'):
row = [ th.text.strip() for th in tr.find_all('th')]
if len(row) > 1:
data.append( row )
self.data.append( row )
# Get Table Data
for tr in table.find_all('tr'):
row = [ td.text.strip() for td in tr.find_all('td')]
if len(row) > 1:
data.append( row )
self.data.append( row )
#--------[ Process Table ]--------
# Delete rows with incorrect number of variables
key = 0
key_len = len(data)
key_len = len(self.data)
while key < key_len:
if len(data[key]) != len(data[0]):
data.pop(key)
if len(self.data[key]) != len(self.data[0]):
self.data.pop(key)
key = key-1
key = key+1
key_len = len(data)
key_len = len(self.data)
# Delete unwanted table columns
key = 0
key_len = len(data[0])
key_len = len(self.data[0])
while key < key_len:
flag = False
if data[0][key].lower().find('rank') >=0: flag = True
if data[0][key].lower().find('change') >=0: flag = True
if data[0][key].lower().find('notes') >=0: flag = True
if data[0][key].lower().find('gap') >=0: flag = True
if data[0][key].lower().find('Δ') >=0: flag = True
if self.data[0][key].lower().find('rank') >=0: flag = True
if self.data[0][key].lower().find('change') >=0: flag = True
if self.data[0][key].lower().find('notes') >=0: flag = True
if self.data[0][key].lower().find('gap') >=0: flag = True
if self.data[0][key].lower().find('Δ') >=0: flag = True
if flag:
for i in range(0, len(data)):
data[i].pop(key)
for i in range(0, len(self.data)):
self.data[i].pop(key)
key = key-1
key = key+1
key_len = len(data[0])
key_len = len(self.data[0])

View file

@ -6,6 +6,7 @@ import masterscraper as ms
# Example run: build a scrape object for a single MacroTrends page.
scrape = ms.scrape('https://www.macrotrends.net/countries/CHN/china/net-migration')
# Alternative page kept for manual testing.
#scrape = ms.scrape('https://www.macrotrends.net/countries/CHN/china/electricity-access-statistics')
# Presumably fills the per-column info (keys, units, year, ...) -- defined outside this view; confirm.
scrape.get_meta()
# Table clean-up step; its exact behaviour is defined in core/clean.py, not shown here.
scrape.clean()
# Write one JSON file per data column under data/.
scrape.save()