Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 41 additions & 20 deletions datacommons/datacommons.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,27 +193,30 @@ def expand(self,
pd_table[new_col_name] = ""
pd_table[new_col_name][0] = new_col_type
return pd_table

seed_col_var = seed_col_name.replace(' ', '_')
new_col_var = new_col_name.replace(' ', '_')
if outgoing:
query = ('SELECT ?{seed_col_name} ?{new_col_name},'
query = ('SELECT ?{seed_col_var} ?{new_col_var},'
'typeOf ?node {seed_col_type},'
'dcid ?node {dcids},'
'dcid ?node ?{seed_col_name},'
'{arc_name} ?node ?{new_col_name}').format(
'dcid ?node ?{seed_col_var},'
'{arc_name} ?node ?{new_col_var}').format(
arc_name=arc_name,
seed_col_name=seed_col_name,
seed_col_var=seed_col_var,
seed_col_type=seed_col_type,
new_col_name=new_col_name,
new_col_var=new_col_var,
dcids=dcids)
else:
query = ('SELECT ?{seed_col_name} ?{new_col_name},'
query = ('SELECT ?{seed_col_var} ?{new_col_var},'
'typeOf ?node {seed_col_type},'
'dcid ?node {dcids},'
'dcid ?node ?{seed_col_name},'
'{arc_name} ?{new_col_name} ?node').format(
'dcid ?node ?{seed_col_var},'
'{arc_name} ?{new_col_var} ?node').format(
arc_name=arc_name,
seed_col_name=seed_col_name,
seed_col_var=seed_col_var,
seed_col_type=seed_col_type,
new_col_name=new_col_name,
new_col_var=new_col_var,
dcids=dcids)

# Run the query and merge the results.
Expand All @@ -222,6 +225,8 @@ def expand(self,
query,
seed_col_name,
new_col_name,
seed_col_var,
new_col_var,
new_col_type,
max_rows=max_rows)

Expand Down Expand Up @@ -297,16 +302,19 @@ def get_populations(self,
pd_table[new_col_name] = ""
pd_table[new_col_name][0] = 'Population'
return pd_table
query = ('SELECT ?{seed_col_name} ?{new_col_name},'

seed_col_var = seed_col_name.replace(' ', '_')
new_col_var = new_col_name.replace(' ', '_')
query = ('SELECT ?{seed_col_var} ?{new_col_var},'
'typeOf ?node {seed_col_type},'
'typeOf ?pop Population,'
'dcid ?node {dcids},'
'dcid ?node ?{seed_col_name},'
'dcid ?node ?{seed_col_var},'
'location ?pop ?node,'
'dcid ?pop ?{new_col_name},'
'dcid ?pop ?{new_col_var},'
'populationType ?pop {population_type},').format(
new_col_name=new_col_name,
seed_col_name=seed_col_name,
new_col_var=new_col_var,
seed_col_var=seed_col_var,
seed_col_type=seed_col_type,
dcids=dcids,
population_type=population_type)
Expand All @@ -323,6 +331,8 @@ def get_populations(self,
query,
seed_col_name,
new_col_name,
seed_col_var,
new_col_var,
'Population',
max_rows=max_rows)

Expand Down Expand Up @@ -375,19 +385,22 @@ def get_observations(self,
pd_table[new_col_name] = ""
pd_table[new_col_name][0] = 'Observation'
return pd_table
query = ('SELECT ?{seed_col_name} ?{new_col_name},'

seed_col_var = seed_col_name.replace(' ', '_')
new_col_var = new_col_name.replace(' ', '_')
query = ('SELECT ?{seed_col_var} ?{new_col_var},'
'typeOf ?pop {seed_col_type},'
'typeOf ?o Observation,'
'dcid ?pop {dcids},'
'dcid ?pop ?{seed_col_name},'
'dcid ?pop ?{seed_col_var},'
'observedNode ?o ?pop,'
'startTime ?o {start_time},'
'endTime ?o {end_time},'
'measuredProperty ?o {measured_property},'
'{stats_type}Value ?o ?{new_col_name},').format(
'{stats_type}Value ?o ?{new_col_var},').format(
seed_col_type=seed_col_type,
new_col_name=new_col_name,
seed_col_name=seed_col_name,
new_col_var=new_col_var,
seed_col_var=seed_col_var,
dcids=dcids,
measured_property=measured_property,
stats_type=stats_type,
Expand All @@ -399,6 +412,8 @@ def get_observations(self,
query,
seed_col_name,
new_col_name,
seed_col_var,
new_col_var,
'Observation',
max_rows=max_rows)

Expand Down Expand Up @@ -564,6 +579,8 @@ def _query_and_merge(self,
query,
seed_col_name,
new_col_name,
seed_col_var,
new_col_var,
new_col_type,
max_rows=100):
"""A utility function that executes the given query and adds a new column.
Expand Down Expand Up @@ -591,6 +608,10 @@ def _query_and_merge(self,
except RuntimeError as e:
raise RuntimeError('Execute query \n%s\ngot an error:\n%s' % (query, e))

query_result = query_result.rename(
index=str,
columns={seed_col_var: seed_col_name, new_col_var: new_col_name})

new_data = pd.merge(
pd_table[1:], query_result, how='left', on=seed_col_name)
new_data[new_col_name] = new_data[new_col_name].fillna('')
Expand Down
12 changes: 6 additions & 6 deletions datacommons/examples/population_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def main():

# Add the state name and the 5 counties contained in that state
state_table = dc.expand(
state_table, 'name', 'state', 'state_name', outgoing=True)
state_table, 'name', 'state', 'state name', outgoing=True)
state_table = dc.expand(
state_table,
'containedInPlace',
Expand All @@ -36,12 +36,12 @@ def main():
outgoing=False,
max_rows=3)
state_table = dc.expand(
state_table, 'name', 'county', 'county_name', outgoing=True)
state_table, 'name', 'county', 'county name', outgoing=True)

state_table = dc.get_populations(
state_table,
seed_col_name='county',
new_col_name='county_population',
new_col_name='county population',
population_type='Person',
max_rows=100)
with pd.option_context('display.width', 400, 'display.max_rows', 100):
Expand All @@ -60,7 +60,7 @@ def main():
state_table = dc.get_populations(
state_table,
seed_col_name='county',
new_col_name='county_male_population',
new_col_name='county male population',
population_type='Person',
max_rows=100,
gender='Male')
Expand All @@ -69,8 +69,8 @@ def main():

state_table = dc.get_observations(
state_table,
seed_col_name='county_population',
new_col_name='county_person_count',
seed_col_name='county population',
new_col_name='county person count',
start_date='2012-01-01',
end_date='2016-01-01',
measured_property='count',
Expand Down