dataframes

dataframes#

# import the pandas library for working with tabular (spreadsheet) data

import pandas as pd

We will first work in our data in the form of a dataframe. A dataframe is a two-dimensional data structure that holds data in a table with rows and columns. It is the same as a dictionary or dict type of data.

# pulling up a CSV of search results from congress.gov 
# if this code doesn't work, download the csv from link 
# below, and upload it to your google colab space
# https://bit.ly/congress_csv

df = pd.read_csv('https://bit.ly/transgender_raw_data')

---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
Cell In[2], line 6
      1 # pulling up a CSV of search results from congress.gov 
      2 # if this code doesn't work, download the csv from link 
      3 # below, and upload it to your google colab space
      4 # https://bit.ly/congress_csv
----> 6 df = pd.read_csv('https://bit.ly/transgender_raw_data')

File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:948, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    935 kwds_defaults = _refine_defaults_read(
    936     dialect,
    937     delimiter,
   (...)
    944     dtype_backend=dtype_backend,
    945 )
    946 kwds.update(kwds_defaults)
--> 948 return _read(filepath_or_buffer, kwds)

File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:611, in _read(filepath_or_buffer, kwds)
    608 _validate_names(kwds.get("names", None))
    610 # Create the parser.
--> 611 parser = TextFileReader(filepath_or_buffer, **kwds)
    613 if chunksize or iterator:
    614     return parser

File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1448, in TextFileReader.__init__(self, f, engine, **kwds)
   1445     self.options["has_index_names"] = kwds["has_index_names"]
   1447 self.handles: IOHandles | None = None
-> 1448 self._engine = self._make_engine(f, self.engine)

File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1705, in TextFileReader._make_engine(self, f, engine)
   1703     if "b" not in mode:
   1704         mode += "b"
-> 1705 self.handles = get_handle(
   1706     f,
   1707     mode,
   1708     encoding=self.options.get("encoding", None),
   1709     compression=self.options.get("compression", None),
   1710     memory_map=self.options.get("memory_map", False),
   1711     is_text=is_text,
   1712     errors=self.options.get("encoding_errors", "strict"),
   1713     storage_options=self.options.get("storage_options", None),
   1714 )
   1715 assert self.handles is not None
   1716 f = self.handles.handle

File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:718, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    715     codecs.lookup_error(errors)
    717 # open URLs
--> 718 ioargs = _get_filepath_or_buffer(
    719     path_or_buf,
    720     encoding=encoding,
    721     compression=compression,
    722     mode=mode,
    723     storage_options=storage_options,
    724 )
    726 handle = ioargs.filepath_or_buffer
    727 handles: list[BaseBuffer]

File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:372, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
    370 # assuming storage_options is to be interpreted as headers
    371 req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
--> 372 with urlopen(req_info) as req:
    373     content_encoding = req.headers.get("Content-Encoding", None)
    374     if content_encoding == "gzip":
    375         # Override compression based on Content-Encoding header

File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:274, in urlopen(*args, **kwargs)
    268 """
    269 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
    270 the stdlib.
    271 """
    272 import urllib.request
--> 274 return urllib.request.urlopen(*args, **kwargs)

File /opt/anaconda3/lib/python3.11/urllib/request.py:216, in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    214 else:
    215     opener = _opener
--> 216 return opener.open(url, data, timeout)

File /opt/anaconda3/lib/python3.11/urllib/request.py:525, in OpenerDirector.open(self, fullurl, data, timeout)
    523 for processor in self.process_response.get(protocol, []):
    524     meth = getattr(processor, meth_name)
--> 525     response = meth(req, response)
    527 return response

File /opt/anaconda3/lib/python3.11/urllib/request.py:634, in HTTPErrorProcessor.http_response(self, request, response)
    631 # According to RFC 2616, "2xx" code indicates that the client's
    632 # request was successfully received, understood, and accepted.
    633 if not (200 <= code < 300):
--> 634     response = self.parent.error(
    635         'http', request, response, code, msg, hdrs)
    637 return response

File /opt/anaconda3/lib/python3.11/urllib/request.py:557, in OpenerDirector.error(self, proto, *args)
    555     http_err = 0
    556 args = (dict, proto, meth_name) + args
--> 557 result = self._call_chain(*args)
    558 if result:
    559     return result

File /opt/anaconda3/lib/python3.11/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
    494 for handler in handlers:
    495     func = getattr(handler, meth_name)
--> 496     result = func(*args)
    497     if result is not None:
    498         return result

File /opt/anaconda3/lib/python3.11/urllib/request.py:749, in HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
    746 fp.read()
    747 fp.close()
--> 749 return self.parent.open(new, timeout=req.timeout)

File /opt/anaconda3/lib/python3.11/urllib/request.py:525, in OpenerDirector.open(self, fullurl, data, timeout)
    523 for processor in self.process_response.get(protocol, []):
    524     meth = getattr(processor, meth_name)
--> 525     response = meth(req, response)
    527 return response

File /opt/anaconda3/lib/python3.11/urllib/request.py:634, in HTTPErrorProcessor.http_response(self, request, response)
    631 # According to RFC 2616, "2xx" code indicates that the client's
    632 # request was successfully received, understood, and accepted.
    633 if not (200 <= code < 300):
--> 634     response = self.parent.error(
    635         'http', request, response, code, msg, hdrs)
    637 return response

File /opt/anaconda3/lib/python3.11/urllib/request.py:563, in OpenerDirector.error(self, proto, *args)
    561 if http_err:
    562     args = (dict, 'default', 'http_error_default') + orig_args
--> 563     return self._call_chain(*args)

File /opt/anaconda3/lib/python3.11/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
    494 for handler in handlers:
    495     func = getattr(handler, meth_name)
--> 496     result = func(*args)
    497     if result is not None:
    498         return result

File /opt/anaconda3/lib/python3.11/urllib/request.py:643, in HTTPDefaultErrorHandler.http_error_default(self, req, fp, code, msg, hdrs)
    642 def http_error_default(self, req, fp, code, msg, hdrs):
--> 643     raise HTTPError(req.full_url, code, msg, hdrs, fp)

HTTPError: HTTP Error 404: Not Found

# checking out the df object

df

	Legislation Number	URL	Congress	Title	Sponsor	Party of Sponsor	Date of Introduction	Committees	Latest Action	Latest Action Date	...	Related Bill.211	Related Bill.212	Related Bill.213	Latest Summary	Amends Bill	Date Offered	Date Submitted	Date Proposed	Amendment Text (Latest)	Amends Amendment
0	H.R. 1112	https://www.congress.gov/bill/118th-congress/h...	118th Congress (2023-2024)	Ensuring Military Readiness Act of 2023	Banks, Jim [Rep.-R-IN-3]	Republican	2/21/23	House - Armed Services	Referred to the House Committee on Armed Servi...	2/21/23	...	NaN	NaN	NaN	<p><b>Ensuring Military Readiness Act of 2023...	NaN	NaN	NaN	NaN	NaN	NaN
1	S. 435	https://www.congress.gov/bill/118th-congress/s...	118th Congress (2023-2024)	Ensuring Military Readiness Act of 2023	Rubio, Marco [Sen.-R-FL]	Republican	2/15/23	Senate - Armed Services	Read twice and referred to the Committee on Ar...	2/15/23	...	NaN	NaN	NaN	<p><b>Ensuring Military Readiness Act of 2023...	NaN	NaN	NaN	NaN	NaN	NaN
2	H.Res. 886	https://www.congress.gov/bill/118th-congress/h...	118th Congress (2023-2024)	Supporting the goals and principles of Transge...	Jayapal, Pramila [Rep.-D-WA-7]	Democratic	11/21/23	House - Judiciary	Referred to the House Committee on the Judiciary.	11/21/23	...	NaN	NaN	NaN	<p>This resolution expresses support for the ...	NaN	NaN	NaN	NaN	NaN	NaN
3	S.Res. 464	https://www.congress.gov/bill/118th-congress/s...	118th Congress (2023-2024)	A resolution supporting the goals and principl...	Hirono, Mazie K. [Sen.-D-HI]	Democratic	11/15/23	Senate - Judiciary	Star Print ordered on resolution.	12/4/23	...	NaN	NaN	NaN	<p>This resolution expresses support for the ...	NaN	NaN	NaN	NaN	NaN	NaN
4	H.Res. 269	https://www.congress.gov/bill/118th-congress/h...	118th Congress (2023-2024)	Recognizing that it is the duty of the Federal...	Jayapal, Pramila [Rep.-D-WA-7]	Democratic	3/30/23	House - Judiciary, Education and the Workforce...	Sponsor introductory remarks on measure. (CR H...	4/19/23	...	NaN	NaN	NaN	<p>This resolution expresses support for impl...	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
281	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	I welcome that discussion.\n Mr. Chair, I yie...	NaN
282	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	biotechnology equipment or service produced or...	NaN
283	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	United States assistance has \n been provi...	NaN
284	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Director of National Intelligence, shall submi...	NaN
285	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	demanded.\n A recorded vote was ordered.\n T...	NaN

286 rows × 650 columns

What is a `dataframe` object?#

tabular format
dictionary structure, with a particular syntax for accessing elements
- key-value pairs
- df[‘column’]

filipa = {
    'name': ['filipa', 'da gama', 'calado'],
    'age': 34,
    'degree': 'literature',
    'job': 'digital scholarship specialist',
}

type(filipa)

dict

filipa['name']

['filipa', 'da gama', 'calado']

filipa['age']

filipa['job']

'digital scholarship specialist'

# within a DF is a Series, a list type of object. Use list indexing to pull out items

filipa['name'][0]

'filipa'

filipa['name'][2]

'calado'

viewing data: `info()` `head()` `tail()`#

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Columns: 650 entries, Legislation Number to Amends Amendment
dtypes: float64(4), object(646)
memory usage: 1.4+ MB

df.head()

	Legislation Number	URL	Congress	Title	Sponsor	Party of Sponsor	Date of Introduction	Committees	Latest Action	Latest Action Date	...	Related Bill.211	Related Bill.212	Related Bill.213	Latest Summary	Amends Bill	Date Offered	Date Submitted	Date Proposed	Amendment Text (Latest)	Amends Amendment
0	H.R. 1112	https://www.congress.gov/bill/118th-congress/h...	118th Congress (2023-2024)	Ensuring Military Readiness Act of 2023	Banks, Jim [Rep.-R-IN-3]	Republican	2/21/23	House - Armed Services	Referred to the House Committee on Armed Servi...	2/21/23	...	NaN	NaN	NaN	<p><b>Ensuring Military Readiness Act of 2023...	NaN	NaN	NaN	NaN	NaN	NaN
1	S. 435	https://www.congress.gov/bill/118th-congress/s...	118th Congress (2023-2024)	Ensuring Military Readiness Act of 2023	Rubio, Marco [Sen.-R-FL]	Republican	2/15/23	Senate - Armed Services	Read twice and referred to the Committee on Ar...	2/15/23	...	NaN	NaN	NaN	<p><b>Ensuring Military Readiness Act of 2023...	NaN	NaN	NaN	NaN	NaN	NaN
2	H.Res. 886	https://www.congress.gov/bill/118th-congress/h...	118th Congress (2023-2024)	Supporting the goals and principles of Transge...	Jayapal, Pramila [Rep.-D-WA-7]	Democratic	11/21/23	House - Judiciary	Referred to the House Committee on the Judiciary.	11/21/23	...	NaN	NaN	NaN	<p>This resolution expresses support for the ...	NaN	NaN	NaN	NaN	NaN	NaN
3	S.Res. 464	https://www.congress.gov/bill/118th-congress/s...	118th Congress (2023-2024)	A resolution supporting the goals and principl...	Hirono, Mazie K. [Sen.-D-HI]	Democratic	11/15/23	Senate - Judiciary	Star Print ordered on resolution.	12/4/23	...	NaN	NaN	NaN	<p>This resolution expresses support for the ...	NaN	NaN	NaN	NaN	NaN	NaN
4	H.Res. 269	https://www.congress.gov/bill/118th-congress/h...	118th Congress (2023-2024)	Recognizing that it is the duty of the Federal...	Jayapal, Pramila [Rep.-D-WA-7]	Democratic	3/30/23	House - Judiciary, Education and the Workforce...	Sponsor introductory remarks on measure. (CR H...	4/19/23	...	NaN	NaN	NaN	<p>This resolution expresses support for impl...	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 650 columns

df.tail()

	Legislation Number	URL	Congress	Title	Sponsor	Party of Sponsor	Date of Introduction	Committees	Latest Action	Latest Action Date	...	Related Bill.211	Related Bill.212	Related Bill.213	Latest Summary	Amends Bill	Date Offered	Date Submitted	Date Proposed	Amendment Text (Latest)	Amends Amendment
281	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	I welcome that discussion.\n Mr. Chair, I yie...	NaN
282	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	biotechnology equipment or service produced or...	NaN
283	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	United States assistance has \n been provi...	NaN
284	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Director of National Intelligence, shall submi...	NaN
285	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	demanded.\n A recorded vote was ordered.\n T...	NaN

5 rows × 650 columns

We can also explore columns using the dictionary syntax to access each column

df['Date of Introduction']

     2/21/23
     2/15/23
    11/21/23
    11/15/23
     3/30/23
         ...   
       NaN
       NaN
       NaN
       NaN
       NaN
Name: Date of Introduction, Length: 286, dtype: object

cleaning rows & columns: `dropna()` `.notna()` `drop()`#

# only drops rows containing all NaN: missing value
# this doesn't work for our dataset. Why?

df.dropna()

	Legislation Number	URL	Congress	Title	Sponsor	Party of Sponsor	Date of Introduction	Committees	Latest Action	Latest Action Date	...	Related Bill.211	Related Bill.212	Related Bill.213	Latest Summary	Amends Bill	Date Offered	Date Submitted	Date Proposed	Amendment Text (Latest)	Amends Amendment

0 rows × 650 columns

# check the tail. There are still some values there.

df.tail()

	Legislation Number	URL	Congress	Title	Sponsor	Party of Sponsor	Date of Introduction	Committees	Latest Action	Latest Action Date	...	Related Bill.211	Related Bill.212	Related Bill.213	Latest Summary	Amends Bill	Date Offered	Date Submitted	Date Proposed	Amendment Text (Latest)	Amends Amendment
281	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	I welcome that discussion.\n Mr. Chair, I yie...	NaN
282	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	biotechnology equipment or service produced or...	NaN
283	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	United States assistance has \n been provi...	NaN
284	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Director of National Intelligence, shall submi...	NaN
285	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	demanded.\n A recorded vote was ordered.\n T...	NaN

5 rows × 650 columns

# check df info. We still have the same number of entries (rows)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Columns: 650 entries, Legislation Number to Amends Amendment
dtypes: float64(4), object(646)
memory usage: 1.4+ MB

# try isolating a column, dropping a row if it has any NaN within
# uses syntax for accessing columns from DF

df = df[df['Legislation Number'].notna()]

# notice the line with "entries"
# we have 148 rows, but our index goes up to 280

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148 entries, 0 to 280
Columns: 650 entries, Legislation Number to Amends Amendment
dtypes: float64(4), object(646)
memory usage: 752.7+ KB

# what's in the columns? 
# googled how to print out all of the column names

df.columns.values

array(['Legislation Number', 'URL', 'Congress', 'Title', 'Sponsor',
       'Party of Sponsor', 'Date of Introduction', 'Committees',
       'Latest Action', 'Latest Action Date', 'Cosponsor', 'Cosponsor.1',
       'Cosponsor.2', 'Cosponsor.3', 'Cosponsor.4', 'Cosponsor.5',
       'Cosponsor.6', 'Cosponsor.7', 'Cosponsor.8', 'Cosponsor.9',
       'Cosponsor.10', 'Cosponsor.11', 'Cosponsor.12', 'Cosponsor.13',
       'Cosponsor.14', 'Cosponsor.15', 'Cosponsor.16', 'Cosponsor.17',
       'Cosponsor.18', 'Cosponsor.19', 'Cosponsor.20', 'Cosponsor.21',
       'Cosponsor.22', 'Cosponsor.23', 'Cosponsor.24', 'Cosponsor.25',
       'Cosponsor.26', 'Cosponsor.27', 'Cosponsor.28', 'Cosponsor.29',
       'Cosponsor.30', 'Cosponsor.31', 'Cosponsor.32', 'Cosponsor.33',
       'Cosponsor.34', 'Cosponsor.35', 'Cosponsor.36', 'Cosponsor.37',
       'Cosponsor.38', 'Cosponsor.39', 'Cosponsor.40', 'Cosponsor.41',
       'Cosponsor.42', 'Cosponsor.43', 'Cosponsor.44', 'Cosponsor.45',
       'Cosponsor.46', 'Cosponsor.47', 'Cosponsor.48', 'Cosponsor.49',
       'Cosponsor.50', 'Cosponsor.51', 'Cosponsor.52', 'Cosponsor.53',
       'Cosponsor.54', 'Cosponsor.55', 'Cosponsor.56', 'Cosponsor.57',
       'Cosponsor.58', 'Cosponsor.59', 'Cosponsor.60', 'Cosponsor.61',
       'Cosponsor.62', 'Cosponsor.63', 'Cosponsor.64', 'Cosponsor.65',
       'Cosponsor.66', 'Cosponsor.67', 'Cosponsor.68', 'Cosponsor.69',
       'Cosponsor.70', 'Cosponsor.71', 'Cosponsor.72', 'Cosponsor.73',
       'Cosponsor.74', 'Cosponsor.75', 'Cosponsor.76', 'Cosponsor.77',
       'Cosponsor.78', 'Cosponsor.79', 'Cosponsor.80', 'Cosponsor.81',
       'Cosponsor.82', 'Cosponsor.83', 'Cosponsor.84', 'Cosponsor.85',
       'Cosponsor.86', 'Cosponsor.87', 'Cosponsor.88', 'Cosponsor.89',
       'Cosponsor.90', 'Cosponsor.91', 'Cosponsor.92', 'Cosponsor.93',
       'Cosponsor.94', 'Cosponsor.95', 'Cosponsor.96', 'Cosponsor.97',
       'Cosponsor.98', 'Cosponsor.99', 'Cosponsor.100', 'Cosponsor.101',
       'Cosponsor.102', 'Cosponsor.103', 'Cosponsor.104', 'Cosponsor.105',
       'Cosponsor.106', 'Cosponsor.107', 'Cosponsor.108', 'Cosponsor.109',
       'Cosponsor.110', 'Cosponsor.111', 'Cosponsor.112', 'Cosponsor.113',
       'Cosponsor.114', 'Cosponsor.115', 'Cosponsor.116', 'Cosponsor.117',
       'Cosponsor.118', 'Cosponsor.119', 'Cosponsor.120', 'Cosponsor.121',
       'Cosponsor.122', 'Cosponsor.123', 'Cosponsor.124', 'Cosponsor.125',
       'Cosponsor.126', 'Cosponsor.127', 'Cosponsor.128', 'Cosponsor.129',
       'Cosponsor.130', 'Cosponsor.131', 'Cosponsor.132', 'Cosponsor.133',
       'Cosponsor.134', 'Cosponsor.135', 'Cosponsor.136', 'Cosponsor.137',
       'Cosponsor.138', 'Cosponsor.139', 'Cosponsor.140', 'Cosponsor.141',
       'Cosponsor.142', 'Cosponsor.143', 'Cosponsor.144', 'Cosponsor.145',
       'Cosponsor.146', 'Cosponsor.147', 'Cosponsor.148', 'Cosponsor.149',
       'Cosponsor.150', 'Cosponsor.151', 'Cosponsor.152', 'Cosponsor.153',
       'Cosponsor.154', 'Cosponsor.155', 'Cosponsor.156', 'Cosponsor.157',
       'Cosponsor.158', 'Cosponsor.159', 'Cosponsor.160', 'Cosponsor.161',
       'Cosponsor.162', 'Cosponsor.163', 'Cosponsor.164', 'Cosponsor.165',
       'Cosponsor.166', 'Cosponsor.167', 'Cosponsor.168', 'Cosponsor.169',
       'Cosponsor.170', 'Cosponsor.171', 'Cosponsor.172', 'Cosponsor.173',
       'Cosponsor.174', 'Cosponsor.175', 'Cosponsor.176', 'Cosponsor.177',
       'Cosponsor.178', 'Cosponsor.179', 'Cosponsor.180', 'Cosponsor.181',
       'Cosponsor.182', 'Cosponsor.183', 'Cosponsor.184', 'Cosponsor.185',
       'Cosponsor.186', 'Cosponsor.187', 'Cosponsor.188', 'Cosponsor.189',
       'Cosponsor.190', 'Cosponsor.191', 'Cosponsor.192', 'Cosponsor.193',
       'Cosponsor.194', 'Cosponsor.195', 'Cosponsor.196', 'Cosponsor.197',
       'Cosponsor.198', 'Cosponsor.199', 'Cosponsor.200', 'Cosponsor.201',
       'Cosponsor.202', 'Cosponsor.203', 'Cosponsor.204', 'Cosponsor.205',
       'Cosponsor.206', 'Cosponsor.207', 'Cosponsor.208', 'Cosponsor.209',
       'Cosponsor.210', 'Cosponsor.211', 'Cosponsor.212', 'Cosponsor.213',
       'Cosponsor.214', 'Number of Cosponsors', 'Subject', 'Subject.1',
       'Subject.2', 'Subject.3', 'Subject.4', 'Subject.5', 'Subject.6',
       'Subject.7', 'Subject.8', 'Subject.9', 'Subject.10', 'Subject.11',
       'Subject.12', 'Subject.13', 'Subject.14', 'Subject.15',
       'Subject.16', 'Subject.17', 'Subject.18', 'Subject.19',
       'Subject.20', 'Subject.21', 'Subject.22', 'Subject.23',
       'Subject.24', 'Subject.25', 'Subject.26', 'Subject.27',
       'Subject.28', 'Subject.29', 'Subject.30', 'Subject.31',
       'Subject.32', 'Subject.33', 'Subject.34', 'Subject.35',
       'Subject.36', 'Subject.37', 'Subject.38', 'Subject.39',
       'Subject.40', 'Subject.41', 'Subject.42', 'Subject.43',
       'Subject.44', 'Subject.45', 'Subject.46', 'Subject.47',
       'Subject.48', 'Subject.49', 'Subject.50', 'Subject.51',
       'Subject.52', 'Subject.53', 'Subject.54', 'Subject.55',
       'Subject.56', 'Subject.57', 'Subject.58', 'Subject.59',
       'Subject.60', 'Subject.61', 'Subject.62', 'Subject.63',
       'Subject.64', 'Subject.65', 'Subject.66', 'Subject.67',
       'Subject.68', 'Subject.69', 'Subject.70', 'Subject.71',
       'Subject.72', 'Subject.73', 'Subject.74', 'Subject.75',
       'Subject.76', 'Subject.77', 'Subject.78', 'Subject.79',
       'Subject.80', 'Subject.81', 'Subject.82', 'Subject.83',
       'Subject.84', 'Subject.85', 'Subject.86', 'Subject.87',
       'Subject.88', 'Subject.89', 'Subject.90', 'Subject.91',
       'Subject.92', 'Subject.93', 'Subject.94', 'Subject.95',
       'Subject.96', 'Subject.97', 'Subject.98', 'Subject.99',
       'Subject.100', 'Subject.101', 'Subject.102', 'Subject.103',
       'Subject.104', 'Subject.105', 'Subject.106', 'Subject.107',
       'Subject.108', 'Subject.109', 'Subject.110', 'Subject.111',
       'Subject.112', 'Subject.113', 'Subject.114', 'Subject.115',
       'Subject.116', 'Subject.117', 'Subject.118', 'Subject.119',
       'Subject.120', 'Subject.121', 'Subject.122', 'Subject.123',
       'Subject.124', 'Subject.125', 'Subject.126', 'Subject.127',
       'Subject.128', 'Subject.129', 'Subject.130', 'Subject.131',
       'Subject.132', 'Subject.133', 'Subject.134', 'Subject.135',
       'Subject.136', 'Subject.137', 'Subject.138', 'Subject.139',
       'Subject.140', 'Subject.141', 'Subject.142', 'Subject.143',
       'Subject.144', 'Subject.145', 'Subject.146', 'Subject.147',
       'Subject.148', 'Subject.149', 'Subject.150', 'Subject.151',
       'Subject.152', 'Subject.153', 'Subject.154', 'Subject.155',
       'Subject.156', 'Subject.157', 'Subject.158', 'Subject.159',
       'Subject.160', 'Subject.161', 'Subject.162', 'Subject.163',
       'Subject.164', 'Subject.165', 'Subject.166', 'Subject.167',
       'Subject.168', 'Subject.169', 'Subject.170', 'Subject.171',
       'Subject.172', 'Subject.173', 'Subject.174', 'Subject.175',
       'Subject.176', 'Subject.177', 'Subject.178', 'Subject.179',
       'Subject.180', 'Subject.181', 'Subject.182', 'Subject.183',
       'Subject.184', 'Subject.185', 'Subject.186', 'Subject.187',
       'Subject.188', 'Subject.189', 'Subject.190', 'Subject.191',
       'Subject.192', 'Subject.193', 'Subject.194', 'Subject.195',
       'Subject.196', 'Subject.197', 'Subject.198', 'Subject.199',
       'Subject.200', 'Subject.201', 'Number of Related Bills',
       'Related Bill', 'Related Bill.1', 'Related Bill.2',
       'Related Bill.3', 'Related Bill.4', 'Related Bill.5',
       'Related Bill.6', 'Related Bill.7', 'Related Bill.8',
       'Related Bill.9', 'Related Bill.10', 'Related Bill.11',
       'Related Bill.12', 'Related Bill.13', 'Related Bill.14',
       'Related Bill.15', 'Related Bill.16', 'Related Bill.17',
       'Related Bill.18', 'Related Bill.19', 'Related Bill.20',
       'Related Bill.21', 'Related Bill.22', 'Related Bill.23',
       'Related Bill.24', 'Related Bill.25', 'Related Bill.26',
       'Related Bill.27', 'Related Bill.28', 'Related Bill.29',
       'Related Bill.30', 'Related Bill.31', 'Related Bill.32',
       'Related Bill.33', 'Related Bill.34', 'Related Bill.35',
       'Related Bill.36', 'Related Bill.37', 'Related Bill.38',
       'Related Bill.39', 'Related Bill.40', 'Related Bill.41',
       'Related Bill.42', 'Related Bill.43', 'Related Bill.44',
       'Related Bill.45', 'Related Bill.46', 'Related Bill.47',
       'Related Bill.48', 'Related Bill.49', 'Related Bill.50',
       'Related Bill.51', 'Related Bill.52', 'Related Bill.53',
       'Related Bill.54', 'Related Bill.55', 'Related Bill.56',
       'Related Bill.57', 'Related Bill.58', 'Related Bill.59',
       'Related Bill.60', 'Related Bill.61', 'Related Bill.62',
       'Related Bill.63', 'Related Bill.64', 'Related Bill.65',
       'Related Bill.66', 'Related Bill.67', 'Related Bill.68',
       'Related Bill.69', 'Related Bill.70', 'Related Bill.71',
       'Related Bill.72', 'Related Bill.73', 'Related Bill.74',
       'Related Bill.75', 'Related Bill.76', 'Related Bill.77',
       'Related Bill.78', 'Related Bill.79', 'Related Bill.80',
       'Related Bill.81', 'Related Bill.82', 'Related Bill.83',
       'Related Bill.84', 'Related Bill.85', 'Related Bill.86',
       'Related Bill.87', 'Related Bill.88', 'Related Bill.89',
       'Related Bill.90', 'Related Bill.91', 'Related Bill.92',
       'Related Bill.93', 'Related Bill.94', 'Related Bill.95',
       'Related Bill.96', 'Related Bill.97', 'Related Bill.98',
       'Related Bill.99', 'Related Bill.100', 'Related Bill.101',
       'Related Bill.102', 'Related Bill.103', 'Related Bill.104',
       'Related Bill.105', 'Related Bill.106', 'Related Bill.107',
       'Related Bill.108', 'Related Bill.109', 'Related Bill.110',
       'Related Bill.111', 'Related Bill.112', 'Related Bill.113',
       'Related Bill.114', 'Related Bill.115', 'Related Bill.116',
       'Related Bill.117', 'Related Bill.118', 'Related Bill.119',
       'Related Bill.120', 'Related Bill.121', 'Related Bill.122',
       'Related Bill.123', 'Related Bill.124', 'Related Bill.125',
       'Related Bill.126', 'Related Bill.127', 'Related Bill.128',
       'Related Bill.129', 'Related Bill.130', 'Related Bill.131',
       'Related Bill.132', 'Related Bill.133', 'Related Bill.134',
       'Related Bill.135', 'Related Bill.136', 'Related Bill.137',
       'Related Bill.138', 'Related Bill.139', 'Related Bill.140',
       'Related Bill.141', 'Related Bill.142', 'Related Bill.143',
       'Related Bill.144', 'Related Bill.145', 'Related Bill.146',
       'Related Bill.147', 'Related Bill.148', 'Related Bill.149',
       'Related Bill.150', 'Related Bill.151', 'Related Bill.152',
       'Related Bill.153', 'Related Bill.154', 'Related Bill.155',
       'Related Bill.156', 'Related Bill.157', 'Related Bill.158',
       'Related Bill.159', 'Related Bill.160', 'Related Bill.161',
       'Related Bill.162', 'Related Bill.163', 'Related Bill.164',
       'Related Bill.165', 'Related Bill.166', 'Related Bill.167',
       'Related Bill.168', 'Related Bill.169', 'Related Bill.170',
       'Related Bill.171', 'Related Bill.172', 'Related Bill.173',
       'Related Bill.174', 'Related Bill.175', 'Related Bill.176',
       'Related Bill.177', 'Related Bill.178', 'Related Bill.179',
       'Related Bill.180', 'Related Bill.181', 'Related Bill.182',
       'Related Bill.183', 'Related Bill.184', 'Related Bill.185',
       'Related Bill.186', 'Related Bill.187', 'Related Bill.188',
       'Related Bill.189', 'Related Bill.190', 'Related Bill.191',
       'Related Bill.192', 'Related Bill.193', 'Related Bill.194',
       'Related Bill.195', 'Related Bill.196', 'Related Bill.197',
       'Related Bill.198', 'Related Bill.199', 'Related Bill.200',
       'Related Bill.201', 'Related Bill.202', 'Related Bill.203',
       'Related Bill.204', 'Related Bill.205', 'Related Bill.206',
       'Related Bill.207', 'Related Bill.208', 'Related Bill.209',
       'Related Bill.210', 'Related Bill.211', 'Related Bill.212',
       'Related Bill.213', 'Latest Summary', 'Amends Bill',
       'Date Offered', 'Date Submitted', 'Date Proposed',
       'Amendment Text (Latest)', 'Amends Amendment'], dtype=object)

for item in df.columns.values:
    if 'Cosponsor' in item:
        # reassigning DF in each step of the loop
        # axis = 1 means columns; axis = 0 means rows
        df = df.drop(item, axis=1)

# We are down to about 400. let's do the rest

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148 entries, 0 to 280
Columns: 434 entries, Legislation Number to Amends Amendment
dtypes: float64(3), object(431)
memory usage: 503.0+ KB

# see there is no ellipses anymore on the column headers

df.head()

	Legislation Number	URL	Congress	Title	Sponsor	Party of Sponsor	Date of Introduction	Committees	Latest Action	Latest Action Date	...	Related Bill.211	Related Bill.212	Related Bill.213	Latest Summary	Amends Bill	Date Offered	Date Submitted	Date Proposed	Amendment Text (Latest)	Amends Amendment
0	H.R. 1112	https://www.congress.gov/bill/118th-congress/h...	118th Congress (2023-2024)	Ensuring Military Readiness Act of 2023	Banks, Jim [Rep.-R-IN-3]	Republican	2/21/23	House - Armed Services	Referred to the House Committee on Armed Servi...	2/21/23	...	NaN	NaN	NaN	<p><b>Ensuring Military Readiness Act of 2023...	NaN	NaN	NaN	NaN	NaN	NaN
1	S. 435	https://www.congress.gov/bill/118th-congress/s...	118th Congress (2023-2024)	Ensuring Military Readiness Act of 2023	Rubio, Marco [Sen.-R-FL]	Republican	2/15/23	Senate - Armed Services	Read twice and referred to the Committee on Ar...	2/15/23	...	NaN	NaN	NaN	<p><b>Ensuring Military Readiness Act of 2023...	NaN	NaN	NaN	NaN	NaN	NaN
2	H.Res. 886	https://www.congress.gov/bill/118th-congress/h...	118th Congress (2023-2024)	Supporting the goals and principles of Transge...	Jayapal, Pramila [Rep.-D-WA-7]	Democratic	11/21/23	House - Judiciary	Referred to the House Committee on the Judiciary.	11/21/23	...	NaN	NaN	NaN	<p>This resolution expresses support for the ...	NaN	NaN	NaN	NaN	NaN	NaN
3	S.Res. 464	https://www.congress.gov/bill/118th-congress/s...	118th Congress (2023-2024)	A resolution supporting the goals and principl...	Hirono, Mazie K. [Sen.-D-HI]	Democratic	11/15/23	Senate - Judiciary	Star Print ordered on resolution.	12/4/23	...	NaN	NaN	NaN	<p>This resolution expresses support for the ...	NaN	NaN	NaN	NaN	NaN	NaN
4	H.Res. 269	https://www.congress.gov/bill/118th-congress/h...	118th Congress (2023-2024)	Recognizing that it is the duty of the Federal...	Jayapal, Pramila [Rep.-D-WA-7]	Democratic	3/30/23	House - Judiciary, Education and the Workforce...	Sponsor introductory remarks on measure. (CR H...	4/19/23	...	NaN	NaN	NaN	<p>This resolution expresses support for impl...	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 434 columns

How would we get rid of the rest of the columns that are mostly NaN?

for item in df.columns.values:
    if 'Subject' in item:
        df = df.drop(item, axis=1)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148 entries, 0 to 280
Columns: 232 entries, Legislation Number to Amends Amendment
dtypes: float64(3), object(229)
memory usage: 269.4+ KB

for item in df.columns.values:
    if 'Related' in item:
        df = df.drop(item, axis=1)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148 entries, 0 to 280
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 Legislation Number       148 non-null    object 
 URL                      148 non-null    object 
 Congress                 148 non-null    object 
 Title                    146 non-null    object 
 Sponsor                  148 non-null    object 
 Party of Sponsor         148 non-null    object 
 Date of Introduction     115 non-null    object 
 Committees               114 non-null    object 
 Latest Action            146 non-null    object 
 Latest Action Date       146 non-null    object 
Latest Summary           61 non-null     object 
Amends Bill              33 non-null     object 
Date Offered             31 non-null     object 
Date Submitted           2 non-null      object 
Date Proposed            0 non-null      float64
Amendment Text (Latest)  33 non-null     object 
Amends Amendment         0 non-null      float64
dtypes: float64(2), object(15)
memory usage: 20.8+ KB

And now we have a much cleaner csv file. We can save it to our workspace with the following line:

df.to_csv('congress_clean.csv')

dataframes

Contents

dataframes#

What is a dataframe object?#

viewing data: info() head() tail()#

cleaning rows & columns: dropna() .notna() drop()#

What is a `dataframe` object?#

viewing data: `info()` `head()` `tail()`#

cleaning rows & columns: `dropna()` `.notna()` `drop()`#