Merged
Changes from all commits
Commits
98 commits
cceb686
Corrected namespace for Intangible, hyperlinked a definition.
danbri Oct 5, 2018
2742ebf
Merge pull request #9 from google/dev
pradh Oct 10, 2018
a23917a
Merge pull request #11 from google/dev
pradh Oct 11, 2018
520928a
Merge pull request #12 from google/dev
pradh Oct 11, 2018
130d63f
Merge pull request #13 from google/dev
pradh Oct 15, 2018
2b25ff4
Merge pull request #14 from google/dev
shifucun Oct 16, 2018
95e3d62
Merge pull request #3 from google/master
vickitardif Oct 17, 2018
908de10
Added schema.datacommons.org/County.
Oct 17, 2018
eba5a9e
Merge pull request #15 from vholland/master
vickitardif Oct 17, 2018
23ea582
Added documentation for dcid and provenance.
vickitardif Oct 18, 2018
aeac3fe
Merge pull request #16 from vholland/master
shifucun Oct 18, 2018
084d95d
Added documentation for area, timezone, freebaseId, geonamesId, and w…
vickitardif Oct 23, 2018
8bdf77d
Fixed name of geonamesId.
vickitardif Oct 23, 2018
46518c1
Merge pull request #4 from google/master
vickitardif Oct 23, 2018
597c2f1
Merge pull request #18 from vholland/master
danbri Oct 24, 2018
287ac7e
Merge branch 'master' of https://github.com/google/datacommons
danbri Oct 24, 2018
3ecf3ab
Removed 'current' from the definition of timezone.
vickitardif Oct 25, 2018
f86e864
Merge pull request #20 from google/dev
shifucun Nov 1, 2018
650c362
Merge pull request #21 from vholland/master
shifucun Nov 1, 2018
0733216
Fixed the empty column bug (#24)
antaresc Nov 7, 2018
69eb578
Add GNIS property.
panesargoog Nov 14, 2018
2c2fd71
Add fipsId property.
panesargoog Nov 14, 2018
ef6a13d
Use updated caching api
Nov 14, 2018
aebb47e
Merge pull request #26 from shifucun/dev
shifucun Nov 15, 2018
c9edffb
Allow space in column name
Nov 27, 2018
82e91d2
Remove print
Nov 27, 2018
c02405d
Merge pull request #29 from shifucun/col_name
panesargoog Nov 27, 2018
8b33719
Merge branch 'master' of https://github.com/google/datacommons
danbri Dec 13, 2018
eea6820
Fixed <br> to <br/> for xmllint (our rdf parsing is xhtml-based).
danbri Dec 13, 2018
5da23b1
Preparation for use with sdoapp
Dataliberate Jan 3, 2019
311cdf8
tweaks
Dataliberate Jan 3, 2019
b1819c9
Test of changes visibility
RichardWallis Jan 3, 2019
fa8f819
Test changes
RichardWallis Jan 3, 2019
87ac31e
Added config file to repo
Dataliberate Jan 3, 2019
3f7c9cd
Merge branch 'appupdate' of https://github.com/RichardWallis/datacomm…
Dataliberate Jan 3, 2019
63f5e2c
Mods to roduce correct (vocabUri based) RDFa
Dataliberate Jan 18, 2019
236b1d6
Merge branch 'appupdate' of https://github.com/RichardWallis/datacomm…
Dataliberate Jan 18, 2019
866667b
Update datacommons.py
Spaceenter Jan 22, 2019
d422fc4
Merge pull request #31 from Spaceenter/patch-1
Spaceenter Jan 22, 2019
f9c2dfd
Add missing "()" to a query in get_places_in().
Spaceenter Jan 28, 2019
6adeac3
Merge pull request #32 from google/Spaceenter-patch-1
Spaceenter Jan 28, 2019
502b350
Update datacommons client to be compatible with new schema
Jan 29, 2019
75b1015
Merge pull request #33 from shifucun/new_api
shifucun Jan 29, 2019
750401e
Added ability to specify childhoodLocation (#34)
antaresc Jan 30, 2019
d4520a2
Add CensusTract to get_places_in API
Feb 6, 2019
17c7053
Handle error case
Feb 6, 2019
ee459c5
Add comment
Feb 6, 2019
fb315a8
Merge pull request #35 from shifucun/new_api
shifucun Feb 6, 2019
af0903a
added empty examples file for consistancy
Dataliberate Feb 8, 2019
5909632
Adjust config for release
Dataliberate Feb 8, 2019
f1364b0
Added temporary test config
Dataliberate Feb 8, 2019
8708038
Add deploymeny yaml files
Dataliberate Feb 11, 2019
0c6da52
Added draft deployment instructions file
Dataliberate Feb 11, 2019
087e065
Change deployment file from html to md
Dataliberate Feb 11, 2019
8bad00c
Merge pull request #36 from RichardWallis/appupdate
danbri Feb 13, 2019
4a5acab
For observations, use observation_date instead of start_time/end_time
Mar 1, 2019
6bdc413
Use prod client
Mar 1, 2019
7dc9b76
Merge pull request #37 from shifucun/new_api
shifucun Mar 1, 2019
558c720
Use orient split in read and save dataframe so index is not saved in …
Mar 1, 2019
c93104a
Merge pull request #38 from shifucun/new_api
shifucun Mar 1, 2019
1ea55d9
Add measurementMethod in get_observation
Mar 2, 2019
0e62bb8
Merge pull request #39 from shifucun/new_api
shifucun Mar 2, 2019
4c9f59a
Update client library to fit for new mixer string_value
Apr 16, 2019
5b4ef99
Merge pull request #43 from shifucun/new_api
shifucun Apr 16, 2019
4371901
Fix header prefix and date format
Apr 16, 2019
a3f01b4
Merge pull request #44 from shifucun/new_api
shifucun Apr 16, 2019
618d0e4
docstring typos and consistency
tjann Apr 30, 2019
6e4808f
Merge pull request #45 from tjann/patch-1
tjann May 1, 2019
c033482
Reimplemented base API
antaresc May 3, 2019
b806269
Implemented places extension
antaresc May 3, 2019
ff80926
Fixed bug
antaresc May 3, 2019
1432372
Added bio stub
antaresc May 3, 2019
3156473
Finished re-implementing bio extension... n o w t o t e s t.
antaresc May 4, 2019
806a927
Added examples and fixed DCFrame bugs
antaresc May 6, 2019
d0e4233
added populations stub for pop extension
tjann May 6, 2019
c3065d8
implemented get_pop and get_obs in populations API n o w t o t e s t
tjann May 7, 2019
4b49b3b
Merge branch 'feature/api-version-2' of github.com:ACscooter/datacomm…
tjann May 7, 2019
6b2e200
Fixed dangling line in places
antaresc May 7, 2019
6f24e91
BioExtension demo works
antaresc May 7, 2019
45bf9c0
Some tweaks
antaresc May 7, 2019
b388983
Fixed header comment
antaresc May 7, 2019
de19a17
missing comma
tjann May 13, 2019
968c04c
fixed typo self._col_type -> self._col_types
tjann May 13, 2019
e624df1
seed and new col val already have ? append to beg
tjann May 13, 2019
0f48885
fixing populations library and updating infra
tjann May 13, 2019
1273e93
Merge branch 'feature/api-version-2' of github.com:ACscooter/datacomm…
tjann May 13, 2019
d29d07f
similar fix for get_obs for extra ? for colvar
tjann May 13, 2019
cc8b785
get useful prints in test/examples
tjann May 13, 2019
9fce7c5
Merge branch 'feature/api-version-2' of github.com:ACscooter/datacomm…
tjann May 13, 2019
f814b5d
fat fingers on copy and paste
tjann May 13, 2019
698a1b7
places.py self._col_type -> self._col_types
tjann May 13, 2019
61bc795
Implemented draft of weather API extension
antaresc May 29, 2019
232c400
Added weather example
antaresc May 29, 2019
3d0e2b0
Weather API works
antaresc Jun 5, 2019
42b8ff1
Added bio mixer specs and tweaked bio API
antaresc Jun 8, 2019
a7191f8
Fixed places bug
antaresc Jun 13, 2019
8a9f1d0
Merged dev branch
antaresc Jul 8, 2019
6026c75
Added data cleaning helpers
antaresc Jul 8, 2019
35 changes: 3 additions & 32 deletions datacommons/bio.py
@@ -116,7 +116,7 @@ def get_experiments(self, new_col_name, **kwargs):
    # Specify select and process functions to filter for biosample class and
    # terms. This enforces the paired-ness of term and class
    select = select_biosample_summary('?bioClass', '?bioTerm', classes, terms)
-    process = delete_column('?bioClass', '?bioTerm')
+    process = utils.delete_column('?bioClass', '?bioTerm')
    if 'lab_name' in kwargs:
      lab_names = ['"{}"'.format(name) for name in kwargs['lab_name']]
      query.add_constraint(new_col_var, 'lab', '?labNode')
@@ -291,9 +291,9 @@ def get_bed_lines(self, seed_col_name, prop_info=DEFAULT_BEDLINE_PROPS, **kwargs
    # If filters were specified, compose the filters and add a post processor if
    # necessary.
    if select_funcs:
-      select = compose_select(*select_funcs)
+      select = utils.compose_select(*select_funcs)
    if drop_cols:
-      process = delete_column(*drop_cols)
+      process = utils.delete_column(*drop_cols)

    # Perform the query and merge
    new_frame = DCFrame(datalog_query=query,
@@ -366,32 +366,3 @@ def select(row):
      return True
    return False
  return select
-
-def compose_select(*select_funcs):
-  """ Returns a filter function composed of the given selectors.
-
-  Args:
-    select_funcs: Functions to compose.
-
-  Returns:
-    A filter function which returns True iff all select_funcs return True.
-  """
-  def select(row):
-    return all(select_func(row) for select_func in select_funcs)
-  return select
-
-def delete_column(*cols):
-  """ Returns a function that deletes the given column from a frame.
-
-  Args:
-    cols: Columns to delete from the data frame.
-
-  Returns:
-    A function that deletes columns in the given Pandas DataFrame.
-  """
-  def process(pd_frame):
-    for col in cols:
-      if col in pd_frame:
-        pd_frame = pd_frame.drop(col, axis=1)
-    return pd_frame
-  return process
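The two helpers deleted here are not gone; the PR moves them into `datacommons/utils.py` (see that file's diff below) so `bio.py` calls them as `utils.compose_select` and `utils.delete_column`. As a minimal sketch of what the composed selector does — the selector functions, row dicts, and field values below are made up for illustration, only the `compose_select` body comes from this PR:

```python
def compose_select(*select_funcs):
    # Same body as the helper moved into datacommons/utils.py: the
    # composed filter keeps a row iff every selector returns True.
    def select(row):
        return all(select_func(row) for select_func in select_funcs)
    return select

# Hypothetical selectors over query-result rows keyed by query variable.
def is_biosample(row):
    return row.get('?bioClass') == 'Biosample'

def has_term(row):
    return row.get('?bioTerm') is not None

keep = compose_select(is_biosample, has_term)

rows = [
    {'?bioClass': 'Biosample', '?bioTerm': 'liver'},
    {'?bioClass': 'Biosample', '?bioTerm': None},
    {'?bioClass': 'Lab', '?bioTerm': 'liver'},
]
print([r for r in rows if keep(r)])  # only the first row passes both selectors
```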
30 changes: 17 additions & 13 deletions datacommons/datacommons.py
@@ -113,7 +113,7 @@ def query(self, datalog_query, rows=100):
      RuntimeError: some problem with executing query (hint in the string)
    """
    assert self._inited, 'Initialization was unsuccessful, cannot execute Query'

    # Append the options
    options = {}
    if self._db_path:
@@ -123,13 +123,13 @@ def query(self, datalog_query, rows=100):

    # Send the query to the DataCommons query service
    try:
-      response = self._service.query_table(body={
-          'query': datalog_query,
-          'options': options
-      }).execute()
+      response = self._service.query_table(body={
+        'query': datalog_query,
+        'options': options
+      }).execute()
    except Exception as e:
-      msg = 'Failed to execute query:\n Query: {}\n Error: {}'.format(datalog_query, e)
-      raise RuntimeError(msg)
+      msg = 'Failed to execute query:\n Query: {}\n Error: {}'.format(datalog_query, e)
+      raise RuntimeError(msg)

    # Format and return the result as a DCFrame
    header = response.get('header', [])
@@ -307,17 +307,21 @@ def types(self):
    """
    return self._col_types

-  def pandas(self, col_names=None):
+  def pandas(self, col_names=None, ignore_populations=False):
    """ Returns a copy of the data in this view as a Pandas DataFrame.

    Args:
      col_names: An optional list specifying which columns to extract.
+      ignore_populations: Ignores all columns that have type
+        StatisticalPopulation. col_names takes precedence over this argument

    Returns: A deep copy of the underlying Pandas DataFrame.
    """
-    if col_names:
-      return self._dataframe[col_names].copy()
-    return self._dataframe.copy()
+    if not col_names:
+      col_names = list(self._dataframe)
+    if ignore_populations:
+      col_names = list(filter(lambda name: self._col_types[name] != 'StatisticalPopulation', col_names))
+    return self._dataframe[col_names].copy()

  def csv(self, col_names=None):
    """ Returns the data in this view as a CSV string.
@@ -329,7 +333,7 @@ def csv(self, col_names=None):
      The DataFrame exported as a CSV string.
    """
    if col_names:
-        return self._dataframe[col_names].to_csv(index=False)
+      return self._dataframe[col_names].to_csv(index=False)
    return self._dataframe.to_csv(index=False)

  def tsv(self, col_names=None):
@@ -342,7 +346,7 @@ def tsv(self, col_names=None):
      The DataFrame exported as a TSV string.
    """
    if col_names:
-        return self._dataframe[col_names].to_csv(index=False, sep='\t')
+      return self._dataframe[col_names].to_csv(index=False, sep='\t')
    return self._dataframe.to_csv(index=False, sep='\t')

def rename(self, labels):
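The new `ignore_populations` flag in `DCFrame.pandas()` can be exercised outside the client with a stand-in frame. In this sketch the column names, dcids, and the `col_types` map are hypothetical stand-ins for a DCFrame's internal `_dataframe` and `_col_types` state:

```python
import pandas as pd

# Stand-ins for DCFrame._dataframe and DCFrame._col_types (hypothetical data).
dataframe = pd.DataFrame({
    'state_dcid': ['geoId/06', 'geoId/21'],
    'state_population': ['dc/p/abc', 'dc/p/def'],
    'state_name': ['California', 'Kentucky'],
})
col_types = {
    'state_dcid': 'State',
    'state_population': 'StatisticalPopulation',
    'state_name': 'Text',
}

def pandas_view(col_names=None, ignore_populations=False):
    # Mirrors the patched pandas(): default to all columns, then drop any
    # column whose recorded type is StatisticalPopulation when asked.
    if not col_names:
        col_names = list(dataframe)
    if ignore_populations:
        col_names = [n for n in col_names
                     if col_types[n] != 'StatisticalPopulation']
    return dataframe[col_names].copy()

print(list(pandas_view(ignore_populations=True)))
# The StatisticalPopulation column is filtered out of the view.
```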
61 changes: 14 additions & 47 deletions datacommons/examples/analysis_populations.py
@@ -41,91 +41,58 @@ def print_pandas(example_num, df):
  print('\n')

def main():

  frame_1 = datacommons.DCFrame()          # establish generic df
  frame_1 = PopulationsExtension(frame_1)  # add population features to df

  # Start by initializing a column of three US states: California, Kentucky, and
  # Maryland.
  frame_1.add_column('state_dcid', 'State', ['geoId/06', 'geoId/21', 'geoId/24'])
-  print(frame_1.pandas())
+  print_pandas(1, frame_1.pandas())

  # Name is an outgoing property of the State. We can call expand to populate a
  # column 'state_name' with names of states corresponding to dcids in the
  # 'state_dcid' column.
  frame_1.expand('name', 'state_dcid', 'state_name')
-  print(frame_1.pandas())

  # Get populations for state
  frame_1.get_populations(
      seed_col_name='state_dcid',
      new_col_name='state_population',
      population_type='Person',
      rows=100)
-  print(frame_1.pandas())
+  print_pandas(2, frame_1.pandas())

  frame_1.get_populations(
      seed_col_name='state_dcid',
-      new_col_name='state_18_24_years_population',
+      new_col_name='state_male_population',
      population_type='Person',
      rows=100,
-      age='USC/18To24Years')
-  print(frame_1.pandas())
+      gender='Male')
+  print_pandas(3, frame_1.pandas())

  frame_1.get_populations(
      seed_col_name='state_dcid',
-      new_col_name='state_male_population',
+      new_col_name='state_female_population',
      population_type='Person',
      rows=100,
-      gender='Male')
-  print(frame_1.pandas())
+      gender='Female')
+  print_pandas(3, frame_1.pandas())

  # Get observations on state populations
  frame_1.get_observations(
      seed_col_name='state_population',
      new_col_name='state_person_2016_count',
      observation_date='2016',
      measured_property='count')
+  print_pandas(4, frame_1.pandas())

-  # Add 3 counties contained in each state
-  frame_1.expand(
-      'containedInPlace',
-      'state_dcid',
-      'county_dcid',
-      new_col_type='County',
-      outgoing=False,
-      rows=3)
-  print(frame_1.pandas())
-
-  # Get populations for counties
-  frame_1.get_populations(
-      seed_col_name='county_dcid',
-      new_col_name='county_population',
-      population_type='Person',
-      rows=100)
-  print(frame_1.pandas())
+  # To ignore the population columns...
+  print_pandas(5, frame_1.pandas(ignore_populations=True))

-  frame_1.get_populations(
-      seed_col_name='county_dcid',
-      new_col_name='county_18_24_years_population',
-      population_type='Person',
-      rows=100,
-      age='USC/18To24Years')
-  print(frame_1.pandas())
+  # Print the max population count
+  print('Max population count...')
+  print(frame_1.pandas()['state_person_2016_count'].max())

-  frame_1.get_populations(
-      seed_col_name='county_dcid',
-      new_col_name='county_male_population',
-      population_type='Person',
-      rows=100,
-      gender='Male')
-  print(frame_1.pandas())
-
-  # Get observations on county populations
-  frame_1.get_observations(
-      seed_col_name='county_population',
-      new_col_name='county_person_2016_count',
-      observation_date='2016',
-      measured_property='count')

if __name__ == '__main__':
  main()
21 changes: 15 additions & 6 deletions datacommons/populations.py
@@ -109,6 +109,7 @@ def get_observations(self,
      observation_date,
      measured_property,
      stats_type=None,
+      clean_data=True,
      rows=100):
    """Create a new column with values for an observation of the given property.
    The current pandas dataframe should include a column containing population
@@ -122,6 +123,7 @@ def get_observations(self,
      observations_date: The date of the observation (in 'YYY-mm-dd' form).
      measured_property: observation measured property.
      stats_type: Statistical type like "Median"
+      clean_data: A flag to convert to numerical types and filter out any NaNs.
      rows: The maximum number of rows returned by the query results.

    Raises:
@@ -169,14 +171,21 @@ def get_observations(self,
    query.add_constraint('?o', 'observationDate', '\"{}\"'.format(observation_date))
    query.add_constraint('?o', 'measuredProperty', measured_property)
    query.add_constraint('?o', '{}Value'.format(stats_type), new_col_var)
-    measurementMethod = None
+    measurement_method = None
    if measured_property == 'prevalence':
-      measurementMethod = 'CDC_CrudePrevalence'
+      measurement_method = 'CDC_CrudePrevalence'
    elif measured_property == 'unemploymentRate':
-      measurementMethod = 'BLSSeasonallyUnadjusted'
-    if measurementMethod:
-      query.add_constraint('?o', 'measurementMethod', measurementMethod)
+      measurement_method = 'BLSSeasonallyUnadjusted'
+    if measurement_method:
+      query.add_constraint('?o', 'measurementMethod', measurement_method)
+
+    # Check if data should be cleaned
+    clean_func = None
+    if clean_data:
+      type_func = utils.convert_type(new_col_var, 'float')
+      nan_func = utils.drop_nan(new_col_var)
+      clean_func = utils.compose_process(type_func, nan_func)

    # Perform the query and merge the results
-    new_frame = DCFrame(datalog_query=query, labels=labels, type_hint=type_hint, rows=rows)
+    new_frame = DCFrame(datalog_query=query, labels=labels, process=clean_func, type_hint=type_hint, rows=rows)
    self.merge(new_frame)
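With `clean_data=True`, the observation column is piped through `utils.convert_type` and `utils.drop_nan`, chained by `utils.compose_process`. A self-contained sketch of that pipeline using the helper bodies from this PR's `utils.py` diff; the observation column name and values are hypothetical:

```python
import pandas as pd

def convert_type(col_names, dtype):
    # Body as added in utils.py; note it coerces with pd.to_numeric and
    # does not consult the dtype argument in this version.
    if isinstance(col_names, str):
        col_names = [col_names]
    def process(pd_frame):
        for name in col_names:
            pd_frame[name] = pd.to_numeric(pd_frame[name])
        return pd_frame
    return process

def drop_nan(col_names):
    # Drops rows with NaN in any of the named columns.
    if isinstance(col_names, str):
        col_names = [col_names]
    def process(pd_frame):
        return pd_frame.dropna(subset=col_names)
    return process

def compose_process(*process_funcs):
    # Applies each process function in order.
    def process(pd_frame):
        for process_func in process_funcs:
            pd_frame = process_func(pd_frame)
        return pd_frame
    return process

# Hypothetical observation column: one value missing from the query result.
frame = pd.DataFrame({'?observationValue': ['4436974', None, '39250017']})
clean_func = compose_process(
    convert_type('?observationValue', 'float'),
    drop_nan('?observationValue'))
cleaned = clean_func(frame)
print(cleaned['?observationValue'].tolist())
```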
79 changes: 79 additions & 0 deletions datacommons/utils.py
@@ -18,6 +18,8 @@

from collections import OrderedDict

+import pandas as pd
+

class MeasuredValue:
""" An enumeration of valid measured values in the DataCommons graph.
@@ -89,3 +91,80 @@ def add_constraint(self, sub, pred, obj):
    if pred not in self._constraints[sub]:
      self._constraints[sub][pred] = []
    self._constraints[sub][pred].append(obj)
+
+
+# ------------------------ SELECT AND PROCESS HELPERS -------------------------
+
+
+def convert_type(col_names, dtype):
+  """ Converts values in a given column to the given type.
+
+  Args:
+    col_names: The column or columns to convert
+    dtype: Data type or a dictionary from column name to data type.
+
+  Returns: A process function that converts the column to a given type.
+  """
+  if isinstance(col_names, str):
+    col_names = [col_names]
+  def process(pd_frame):
+    for name in col_names:
+      pd_frame[name] = pd.to_numeric(pd_frame[name])
+    return pd_frame
+  return process
+
+def drop_nan(col_names):
+  """ Drops rows containing NAN as a value in columns in col_names.
+
+  Args:
+    col_names: single column name or a list of column names.
+  """
+  if isinstance(col_names, str):
+    col_names = [col_names]
+  def process(pd_frame):
+    return pd_frame.dropna(subset=col_names)
+  return process
+
+def delete_column(*cols):
+  """ Returns a function that deletes the given column from a frame.
+
+  Args:
+    cols: Columns to delete from the data frame.
+
+  Returns:
+    A function that deletes columns in the given Pandas DataFrame.
+  """
+  def process(pd_frame):
+    for col in cols:
+      if col in pd_frame:
+        pd_frame = pd_frame.drop(col, axis=1)
+    return pd_frame
+  return process
+
+def compose_select(*select_funcs):
+  """ Returns a filter function composed of the given selectors.
+
+  Args:
+    select_funcs: Functions to compose.
+
+  Returns:
+    A filter function which returns True iff all select_funcs return True.
+  """
+  def select(row):
+    return all(select_func(row) for select_func in select_funcs)
+  return select
+
+def compose_process(*process_funcs):
+  """ Returns a process function composed of the given functions.
+
+  Args:
+    process_funcs: Functions to compose.
+
+  Returns:
+    A process function which performs each function in the order given.
+  """
+  def process(pd_frame):
+    for process_func in process_funcs:
+      pd_frame = process_func(pd_frame)
+    return pd_frame
+  return process
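A quick check of `delete_column` as added above; the frame columns here are hypothetical. Dropping an absent column is a quiet no-op, since the helper tests membership before calling `drop`, and because `drop` returns a new frame the caller's original is left untouched:

```python
import pandas as pd

def delete_column(*cols):
    # Same body as the helper above: drop each named column if present.
    def process(pd_frame):
        for col in cols:
            if col in pd_frame:
                pd_frame = pd_frame.drop(col, axis=1)
        return pd_frame
    return process

# Hypothetical query-result frame with two intermediate columns.
frame = pd.DataFrame({
    '?bioClass': ['Biosample'],
    '?bioTerm': ['liver'],
    'name': ['sample-1'],
})
process = delete_column('?bioClass', '?bioTerm', '?notAColumn')
print(list(process(frame)))  # '?notAColumn' is absent and skipped quietly
```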