dataframes#
# import the pandas library for working with tabular (spreadsheet) data
import pandas as pd
We will first work in our data in the form of a dataframe. A dataframe is a two-dimensional data structure that holds data in a table with rows and columns. It is the same as a dictionary or dict
type of data.
# pulling up a CSV of search results from congress.gov
# if this code doesn't work, download the csv from link
# below, and upload it to your google colab space
# https://bit.ly/congress_csv
df = pd.read_csv('https://bit.ly/transgender_raw_data')
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
Cell In[2], line 6
1 # pulling up a CSV of search results from congress.gov
2 # if this code doesn't work, download the csv from link
3 # below, and upload it to your google colab space
4 # https://bit.ly/congress_csv
----> 6 df = pd.read_csv('https://bit.ly/transgender_raw_data')
File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:948, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
935 kwds_defaults = _refine_defaults_read(
936 dialect,
937 delimiter,
(...)
944 dtype_backend=dtype_backend,
945 )
946 kwds.update(kwds_defaults)
--> 948 return _read(filepath_or_buffer, kwds)
File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:611, in _read(filepath_or_buffer, kwds)
608 _validate_names(kwds.get("names", None))
610 # Create the parser.
--> 611 parser = TextFileReader(filepath_or_buffer, **kwds)
613 if chunksize or iterator:
614 return parser
File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1448, in TextFileReader.__init__(self, f, engine, **kwds)
1445 self.options["has_index_names"] = kwds["has_index_names"]
1447 self.handles: IOHandles | None = None
-> 1448 self._engine = self._make_engine(f, self.engine)
File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1705, in TextFileReader._make_engine(self, f, engine)
1703 if "b" not in mode:
1704 mode += "b"
-> 1705 self.handles = get_handle(
1706 f,
1707 mode,
1708 encoding=self.options.get("encoding", None),
1709 compression=self.options.get("compression", None),
1710 memory_map=self.options.get("memory_map", False),
1711 is_text=is_text,
1712 errors=self.options.get("encoding_errors", "strict"),
1713 storage_options=self.options.get("storage_options", None),
1714 )
1715 assert self.handles is not None
1716 f = self.handles.handle
File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:718, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
715 codecs.lookup_error(errors)
717 # open URLs
--> 718 ioargs = _get_filepath_or_buffer(
719 path_or_buf,
720 encoding=encoding,
721 compression=compression,
722 mode=mode,
723 storage_options=storage_options,
724 )
726 handle = ioargs.filepath_or_buffer
727 handles: list[BaseBuffer]
File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:372, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
370 # assuming storage_options is to be interpreted as headers
371 req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
--> 372 with urlopen(req_info) as req:
373 content_encoding = req.headers.get("Content-Encoding", None)
374 if content_encoding == "gzip":
375 # Override compression based on Content-Encoding header
File /opt/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:274, in urlopen(*args, **kwargs)
268 """
269 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
270 the stdlib.
271 """
272 import urllib.request
--> 274 return urllib.request.urlopen(*args, **kwargs)
File /opt/anaconda3/lib/python3.11/urllib/request.py:216, in urlopen(url, data, timeout, cafile, capath, cadefault, context)
214 else:
215 opener = _opener
--> 216 return opener.open(url, data, timeout)
File /opt/anaconda3/lib/python3.11/urllib/request.py:525, in OpenerDirector.open(self, fullurl, data, timeout)
523 for processor in self.process_response.get(protocol, []):
524 meth = getattr(processor, meth_name)
--> 525 response = meth(req, response)
527 return response
File /opt/anaconda3/lib/python3.11/urllib/request.py:634, in HTTPErrorProcessor.http_response(self, request, response)
631 # According to RFC 2616, "2xx" code indicates that the client's
632 # request was successfully received, understood, and accepted.
633 if not (200 <= code < 300):
--> 634 response = self.parent.error(
635 'http', request, response, code, msg, hdrs)
637 return response
File /opt/anaconda3/lib/python3.11/urllib/request.py:557, in OpenerDirector.error(self, proto, *args)
555 http_err = 0
556 args = (dict, proto, meth_name) + args
--> 557 result = self._call_chain(*args)
558 if result:
559 return result
File /opt/anaconda3/lib/python3.11/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
494 for handler in handlers:
495 func = getattr(handler, meth_name)
--> 496 result = func(*args)
497 if result is not None:
498 return result
File /opt/anaconda3/lib/python3.11/urllib/request.py:749, in HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
746 fp.read()
747 fp.close()
--> 749 return self.parent.open(new, timeout=req.timeout)
File /opt/anaconda3/lib/python3.11/urllib/request.py:525, in OpenerDirector.open(self, fullurl, data, timeout)
523 for processor in self.process_response.get(protocol, []):
524 meth = getattr(processor, meth_name)
--> 525 response = meth(req, response)
527 return response
File /opt/anaconda3/lib/python3.11/urllib/request.py:634, in HTTPErrorProcessor.http_response(self, request, response)
631 # According to RFC 2616, "2xx" code indicates that the client's
632 # request was successfully received, understood, and accepted.
633 if not (200 <= code < 300):
--> 634 response = self.parent.error(
635 'http', request, response, code, msg, hdrs)
637 return response
File /opt/anaconda3/lib/python3.11/urllib/request.py:563, in OpenerDirector.error(self, proto, *args)
561 if http_err:
562 args = (dict, 'default', 'http_error_default') + orig_args
--> 563 return self._call_chain(*args)
File /opt/anaconda3/lib/python3.11/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
494 for handler in handlers:
495 func = getattr(handler, meth_name)
--> 496 result = func(*args)
497 if result is not None:
498 return result
File /opt/anaconda3/lib/python3.11/urllib/request.py:643, in HTTPDefaultErrorHandler.http_error_default(self, req, fp, code, msg, hdrs)
642 def http_error_default(self, req, fp, code, msg, hdrs):
--> 643 raise HTTPError(req.full_url, code, msg, hdrs, fp)
HTTPError: HTTP Error 404: Not Found
# checking out the df object
df
Legislation Number | URL | Congress | Title | Sponsor | Party of Sponsor | Date of Introduction | Committees | Latest Action | Latest Action Date | ... | Related Bill.211 | Related Bill.212 | Related Bill.213 | Latest Summary | Amends Bill | Date Offered | Date Submitted | Date Proposed | Amendment Text (Latest) | Amends Amendment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | H.R. 1112 | https://www.congress.gov/bill/118th-congress/h... | 118th Congress (2023-2024) | Ensuring Military Readiness Act of 2023 | Banks, Jim [Rep.-R-IN-3] | Republican | 2/21/23 | House - Armed Services | Referred to the House Committee on Armed Servi... | 2/21/23 | ... | NaN | NaN | NaN | <p><b>Ensuring Military Readiness Act of 2023... | NaN | NaN | NaN | NaN | NaN | NaN |
1 | S. 435 | https://www.congress.gov/bill/118th-congress/s... | 118th Congress (2023-2024) | Ensuring Military Readiness Act of 2023 | Rubio, Marco [Sen.-R-FL] | Republican | 2/15/23 | Senate - Armed Services | Read twice and referred to the Committee on Ar... | 2/15/23 | ... | NaN | NaN | NaN | <p><b>Ensuring Military Readiness Act of 2023... | NaN | NaN | NaN | NaN | NaN | NaN |
2 | H.Res. 886 | https://www.congress.gov/bill/118th-congress/h... | 118th Congress (2023-2024) | Supporting the goals and principles of Transge... | Jayapal, Pramila [Rep.-D-WA-7] | Democratic | 11/21/23 | House - Judiciary | Referred to the House Committee on the Judiciary. | 11/21/23 | ... | NaN | NaN | NaN | <p>This resolution expresses support for the ... | NaN | NaN | NaN | NaN | NaN | NaN |
3 | S.Res. 464 | https://www.congress.gov/bill/118th-congress/s... | 118th Congress (2023-2024) | A resolution supporting the goals and principl... | Hirono, Mazie K. [Sen.-D-HI] | Democratic | 11/15/23 | Senate - Judiciary | Star Print ordered on resolution. | 12/4/23 | ... | NaN | NaN | NaN | <p>This resolution expresses support for the ... | NaN | NaN | NaN | NaN | NaN | NaN |
4 | H.Res. 269 | https://www.congress.gov/bill/118th-congress/h... | 118th Congress (2023-2024) | Recognizing that it is the duty of the Federal... | Jayapal, Pramila [Rep.-D-WA-7] | Democratic | 3/30/23 | House - Judiciary, Education and the Workforce... | Sponsor introductory remarks on measure. (CR H... | 4/19/23 | ... | NaN | NaN | NaN | <p>This resolution expresses support for impl... | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
281 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | I welcome that discussion.\n Mr. Chair, I yie... | NaN |
282 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | biotechnology equipment or service produced or... | NaN |
283 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | United States assistance has \n been provi... | NaN |
284 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Director of National Intelligence, shall submi... | NaN |
285 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | demanded.\n A recorded vote was ordered.\n T... | NaN |
286 rows × 650 columns
What is a dataframe
object?#
tabular format
dictionary structure, with a particular syntax for accessing elements
key-value pairs
df[‘column’]
filipa = {
'name': ['filipa', 'da gama', 'calado'],
'age': 34,
'degree': 'literature',
'job': 'digital scholarship specialist',
}
type(filipa)
dict
filipa['name']
['filipa', 'da gama', 'calado']
filipa['age']
34
filipa['job']
'digital scholarship specialist'
# within a DF is a Series, a list type of object. Use list indexing to pull out items
filipa['name'][0]
'filipa'
filipa['name'][2]
'calado'
viewing data: info()
head()
tail()
#
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Columns: 650 entries, Legislation Number to Amends Amendment
dtypes: float64(4), object(646)
memory usage: 1.4+ MB
df.head()
Legislation Number | URL | Congress | Title | Sponsor | Party of Sponsor | Date of Introduction | Committees | Latest Action | Latest Action Date | ... | Related Bill.211 | Related Bill.212 | Related Bill.213 | Latest Summary | Amends Bill | Date Offered | Date Submitted | Date Proposed | Amendment Text (Latest) | Amends Amendment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | H.R. 1112 | https://www.congress.gov/bill/118th-congress/h... | 118th Congress (2023-2024) | Ensuring Military Readiness Act of 2023 | Banks, Jim [Rep.-R-IN-3] | Republican | 2/21/23 | House - Armed Services | Referred to the House Committee on Armed Servi... | 2/21/23 | ... | NaN | NaN | NaN | <p><b>Ensuring Military Readiness Act of 2023... | NaN | NaN | NaN | NaN | NaN | NaN |
1 | S. 435 | https://www.congress.gov/bill/118th-congress/s... | 118th Congress (2023-2024) | Ensuring Military Readiness Act of 2023 | Rubio, Marco [Sen.-R-FL] | Republican | 2/15/23 | Senate - Armed Services | Read twice and referred to the Committee on Ar... | 2/15/23 | ... | NaN | NaN | NaN | <p><b>Ensuring Military Readiness Act of 2023... | NaN | NaN | NaN | NaN | NaN | NaN |
2 | H.Res. 886 | https://www.congress.gov/bill/118th-congress/h... | 118th Congress (2023-2024) | Supporting the goals and principles of Transge... | Jayapal, Pramila [Rep.-D-WA-7] | Democratic | 11/21/23 | House - Judiciary | Referred to the House Committee on the Judiciary. | 11/21/23 | ... | NaN | NaN | NaN | <p>This resolution expresses support for the ... | NaN | NaN | NaN | NaN | NaN | NaN |
3 | S.Res. 464 | https://www.congress.gov/bill/118th-congress/s... | 118th Congress (2023-2024) | A resolution supporting the goals and principl... | Hirono, Mazie K. [Sen.-D-HI] | Democratic | 11/15/23 | Senate - Judiciary | Star Print ordered on resolution. | 12/4/23 | ... | NaN | NaN | NaN | <p>This resolution expresses support for the ... | NaN | NaN | NaN | NaN | NaN | NaN |
4 | H.Res. 269 | https://www.congress.gov/bill/118th-congress/h... | 118th Congress (2023-2024) | Recognizing that it is the duty of the Federal... | Jayapal, Pramila [Rep.-D-WA-7] | Democratic | 3/30/23 | House - Judiciary, Education and the Workforce... | Sponsor introductory remarks on measure. (CR H... | 4/19/23 | ... | NaN | NaN | NaN | <p>This resolution expresses support for impl... | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 650 columns
df.tail()
Legislation Number | URL | Congress | Title | Sponsor | Party of Sponsor | Date of Introduction | Committees | Latest Action | Latest Action Date | ... | Related Bill.211 | Related Bill.212 | Related Bill.213 | Latest Summary | Amends Bill | Date Offered | Date Submitted | Date Proposed | Amendment Text (Latest) | Amends Amendment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
281 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | I welcome that discussion.\n Mr. Chair, I yie... | NaN |
282 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | biotechnology equipment or service produced or... | NaN |
283 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | United States assistance has \n been provi... | NaN |
284 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Director of National Intelligence, shall submi... | NaN |
285 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | demanded.\n A recorded vote was ordered.\n T... | NaN |
5 rows × 650 columns
We can also explore columns using the dictionary syntax to access each column
df['Date of Introduction']
0 2/21/23
1 2/15/23
2 11/21/23
3 11/15/23
4 3/30/23
...
281 NaN
282 NaN
283 NaN
284 NaN
285 NaN
Name: Date of Introduction, Length: 286, dtype: object
cleaning rows & columns: dropna()
.notna()
drop()
#
# only drops rows containing all NaN: missing value
# this doesn't work for our dataset. Why?
df.dropna()
Legislation Number | URL | Congress | Title | Sponsor | Party of Sponsor | Date of Introduction | Committees | Latest Action | Latest Action Date | ... | Related Bill.211 | Related Bill.212 | Related Bill.213 | Latest Summary | Amends Bill | Date Offered | Date Submitted | Date Proposed | Amendment Text (Latest) | Amends Amendment |
---|
0 rows × 650 columns
# check the tail. There are still some values there.
df.tail()
Legislation Number | URL | Congress | Title | Sponsor | Party of Sponsor | Date of Introduction | Committees | Latest Action | Latest Action Date | ... | Related Bill.211 | Related Bill.212 | Related Bill.213 | Latest Summary | Amends Bill | Date Offered | Date Submitted | Date Proposed | Amendment Text (Latest) | Amends Amendment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
281 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | I welcome that discussion.\n Mr. Chair, I yie... | NaN |
282 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | biotechnology equipment or service produced or... | NaN |
283 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | United States assistance has \n been provi... | NaN |
284 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Director of National Intelligence, shall submi... | NaN |
285 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | demanded.\n A recorded vote was ordered.\n T... | NaN |
5 rows × 650 columns
# check df info. We still have the same number of entries (rows)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Columns: 650 entries, Legislation Number to Amends Amendment
dtypes: float64(4), object(646)
memory usage: 1.4+ MB
# try isolating a column, dropping a row if it has any NaN within
# uses syntax for accessing columns from DF
df = df[df['Legislation Number'].notna()]
# notice the line with "entries"
# we have 148 rows, but our index goes up to 280
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 148 entries, 0 to 280
Columns: 650 entries, Legislation Number to Amends Amendment
dtypes: float64(4), object(646)
memory usage: 752.7+ KB
# what's in the columns?
# googled how to print out all of the column names
df.columns.values
array(['Legislation Number', 'URL', 'Congress', 'Title', 'Sponsor',
'Party of Sponsor', 'Date of Introduction', 'Committees',
'Latest Action', 'Latest Action Date', 'Cosponsor', 'Cosponsor.1',
'Cosponsor.2', 'Cosponsor.3', 'Cosponsor.4', 'Cosponsor.5',
'Cosponsor.6', 'Cosponsor.7', 'Cosponsor.8', 'Cosponsor.9',
'Cosponsor.10', 'Cosponsor.11', 'Cosponsor.12', 'Cosponsor.13',
'Cosponsor.14', 'Cosponsor.15', 'Cosponsor.16', 'Cosponsor.17',
'Cosponsor.18', 'Cosponsor.19', 'Cosponsor.20', 'Cosponsor.21',
'Cosponsor.22', 'Cosponsor.23', 'Cosponsor.24', 'Cosponsor.25',
'Cosponsor.26', 'Cosponsor.27', 'Cosponsor.28', 'Cosponsor.29',
'Cosponsor.30', 'Cosponsor.31', 'Cosponsor.32', 'Cosponsor.33',
'Cosponsor.34', 'Cosponsor.35', 'Cosponsor.36', 'Cosponsor.37',
'Cosponsor.38', 'Cosponsor.39', 'Cosponsor.40', 'Cosponsor.41',
'Cosponsor.42', 'Cosponsor.43', 'Cosponsor.44', 'Cosponsor.45',
'Cosponsor.46', 'Cosponsor.47', 'Cosponsor.48', 'Cosponsor.49',
'Cosponsor.50', 'Cosponsor.51', 'Cosponsor.52', 'Cosponsor.53',
'Cosponsor.54', 'Cosponsor.55', 'Cosponsor.56', 'Cosponsor.57',
'Cosponsor.58', 'Cosponsor.59', 'Cosponsor.60', 'Cosponsor.61',
'Cosponsor.62', 'Cosponsor.63', 'Cosponsor.64', 'Cosponsor.65',
'Cosponsor.66', 'Cosponsor.67', 'Cosponsor.68', 'Cosponsor.69',
'Cosponsor.70', 'Cosponsor.71', 'Cosponsor.72', 'Cosponsor.73',
'Cosponsor.74', 'Cosponsor.75', 'Cosponsor.76', 'Cosponsor.77',
'Cosponsor.78', 'Cosponsor.79', 'Cosponsor.80', 'Cosponsor.81',
'Cosponsor.82', 'Cosponsor.83', 'Cosponsor.84', 'Cosponsor.85',
'Cosponsor.86', 'Cosponsor.87', 'Cosponsor.88', 'Cosponsor.89',
'Cosponsor.90', 'Cosponsor.91', 'Cosponsor.92', 'Cosponsor.93',
'Cosponsor.94', 'Cosponsor.95', 'Cosponsor.96', 'Cosponsor.97',
'Cosponsor.98', 'Cosponsor.99', 'Cosponsor.100', 'Cosponsor.101',
'Cosponsor.102', 'Cosponsor.103', 'Cosponsor.104', 'Cosponsor.105',
'Cosponsor.106', 'Cosponsor.107', 'Cosponsor.108', 'Cosponsor.109',
'Cosponsor.110', 'Cosponsor.111', 'Cosponsor.112', 'Cosponsor.113',
'Cosponsor.114', 'Cosponsor.115', 'Cosponsor.116', 'Cosponsor.117',
'Cosponsor.118', 'Cosponsor.119', 'Cosponsor.120', 'Cosponsor.121',
'Cosponsor.122', 'Cosponsor.123', 'Cosponsor.124', 'Cosponsor.125',
'Cosponsor.126', 'Cosponsor.127', 'Cosponsor.128', 'Cosponsor.129',
'Cosponsor.130', 'Cosponsor.131', 'Cosponsor.132', 'Cosponsor.133',
'Cosponsor.134', 'Cosponsor.135', 'Cosponsor.136', 'Cosponsor.137',
'Cosponsor.138', 'Cosponsor.139', 'Cosponsor.140', 'Cosponsor.141',
'Cosponsor.142', 'Cosponsor.143', 'Cosponsor.144', 'Cosponsor.145',
'Cosponsor.146', 'Cosponsor.147', 'Cosponsor.148', 'Cosponsor.149',
'Cosponsor.150', 'Cosponsor.151', 'Cosponsor.152', 'Cosponsor.153',
'Cosponsor.154', 'Cosponsor.155', 'Cosponsor.156', 'Cosponsor.157',
'Cosponsor.158', 'Cosponsor.159', 'Cosponsor.160', 'Cosponsor.161',
'Cosponsor.162', 'Cosponsor.163', 'Cosponsor.164', 'Cosponsor.165',
'Cosponsor.166', 'Cosponsor.167', 'Cosponsor.168', 'Cosponsor.169',
'Cosponsor.170', 'Cosponsor.171', 'Cosponsor.172', 'Cosponsor.173',
'Cosponsor.174', 'Cosponsor.175', 'Cosponsor.176', 'Cosponsor.177',
'Cosponsor.178', 'Cosponsor.179', 'Cosponsor.180', 'Cosponsor.181',
'Cosponsor.182', 'Cosponsor.183', 'Cosponsor.184', 'Cosponsor.185',
'Cosponsor.186', 'Cosponsor.187', 'Cosponsor.188', 'Cosponsor.189',
'Cosponsor.190', 'Cosponsor.191', 'Cosponsor.192', 'Cosponsor.193',
'Cosponsor.194', 'Cosponsor.195', 'Cosponsor.196', 'Cosponsor.197',
'Cosponsor.198', 'Cosponsor.199', 'Cosponsor.200', 'Cosponsor.201',
'Cosponsor.202', 'Cosponsor.203', 'Cosponsor.204', 'Cosponsor.205',
'Cosponsor.206', 'Cosponsor.207', 'Cosponsor.208', 'Cosponsor.209',
'Cosponsor.210', 'Cosponsor.211', 'Cosponsor.212', 'Cosponsor.213',
'Cosponsor.214', 'Number of Cosponsors', 'Subject', 'Subject.1',
'Subject.2', 'Subject.3', 'Subject.4', 'Subject.5', 'Subject.6',
'Subject.7', 'Subject.8', 'Subject.9', 'Subject.10', 'Subject.11',
'Subject.12', 'Subject.13', 'Subject.14', 'Subject.15',
'Subject.16', 'Subject.17', 'Subject.18', 'Subject.19',
'Subject.20', 'Subject.21', 'Subject.22', 'Subject.23',
'Subject.24', 'Subject.25', 'Subject.26', 'Subject.27',
'Subject.28', 'Subject.29', 'Subject.30', 'Subject.31',
'Subject.32', 'Subject.33', 'Subject.34', 'Subject.35',
'Subject.36', 'Subject.37', 'Subject.38', 'Subject.39',
'Subject.40', 'Subject.41', 'Subject.42', 'Subject.43',
'Subject.44', 'Subject.45', 'Subject.46', 'Subject.47',
'Subject.48', 'Subject.49', 'Subject.50', 'Subject.51',
'Subject.52', 'Subject.53', 'Subject.54', 'Subject.55',
'Subject.56', 'Subject.57', 'Subject.58', 'Subject.59',
'Subject.60', 'Subject.61', 'Subject.62', 'Subject.63',
'Subject.64', 'Subject.65', 'Subject.66', 'Subject.67',
'Subject.68', 'Subject.69', 'Subject.70', 'Subject.71',
'Subject.72', 'Subject.73', 'Subject.74', 'Subject.75',
'Subject.76', 'Subject.77', 'Subject.78', 'Subject.79',
'Subject.80', 'Subject.81', 'Subject.82', 'Subject.83',
'Subject.84', 'Subject.85', 'Subject.86', 'Subject.87',
'Subject.88', 'Subject.89', 'Subject.90', 'Subject.91',
'Subject.92', 'Subject.93', 'Subject.94', 'Subject.95',
'Subject.96', 'Subject.97', 'Subject.98', 'Subject.99',
'Subject.100', 'Subject.101', 'Subject.102', 'Subject.103',
'Subject.104', 'Subject.105', 'Subject.106', 'Subject.107',
'Subject.108', 'Subject.109', 'Subject.110', 'Subject.111',
'Subject.112', 'Subject.113', 'Subject.114', 'Subject.115',
'Subject.116', 'Subject.117', 'Subject.118', 'Subject.119',
'Subject.120', 'Subject.121', 'Subject.122', 'Subject.123',
'Subject.124', 'Subject.125', 'Subject.126', 'Subject.127',
'Subject.128', 'Subject.129', 'Subject.130', 'Subject.131',
'Subject.132', 'Subject.133', 'Subject.134', 'Subject.135',
'Subject.136', 'Subject.137', 'Subject.138', 'Subject.139',
'Subject.140', 'Subject.141', 'Subject.142', 'Subject.143',
'Subject.144', 'Subject.145', 'Subject.146', 'Subject.147',
'Subject.148', 'Subject.149', 'Subject.150', 'Subject.151',
'Subject.152', 'Subject.153', 'Subject.154', 'Subject.155',
'Subject.156', 'Subject.157', 'Subject.158', 'Subject.159',
'Subject.160', 'Subject.161', 'Subject.162', 'Subject.163',
'Subject.164', 'Subject.165', 'Subject.166', 'Subject.167',
'Subject.168', 'Subject.169', 'Subject.170', 'Subject.171',
'Subject.172', 'Subject.173', 'Subject.174', 'Subject.175',
'Subject.176', 'Subject.177', 'Subject.178', 'Subject.179',
'Subject.180', 'Subject.181', 'Subject.182', 'Subject.183',
'Subject.184', 'Subject.185', 'Subject.186', 'Subject.187',
'Subject.188', 'Subject.189', 'Subject.190', 'Subject.191',
'Subject.192', 'Subject.193', 'Subject.194', 'Subject.195',
'Subject.196', 'Subject.197', 'Subject.198', 'Subject.199',
'Subject.200', 'Subject.201', 'Number of Related Bills',
'Related Bill', 'Related Bill.1', 'Related Bill.2',
'Related Bill.3', 'Related Bill.4', 'Related Bill.5',
'Related Bill.6', 'Related Bill.7', 'Related Bill.8',
'Related Bill.9', 'Related Bill.10', 'Related Bill.11',
'Related Bill.12', 'Related Bill.13', 'Related Bill.14',
'Related Bill.15', 'Related Bill.16', 'Related Bill.17',
'Related Bill.18', 'Related Bill.19', 'Related Bill.20',
'Related Bill.21', 'Related Bill.22', 'Related Bill.23',
'Related Bill.24', 'Related Bill.25', 'Related Bill.26',
'Related Bill.27', 'Related Bill.28', 'Related Bill.29',
'Related Bill.30', 'Related Bill.31', 'Related Bill.32',
'Related Bill.33', 'Related Bill.34', 'Related Bill.35',
'Related Bill.36', 'Related Bill.37', 'Related Bill.38',
'Related Bill.39', 'Related Bill.40', 'Related Bill.41',
'Related Bill.42', 'Related Bill.43', 'Related Bill.44',
'Related Bill.45', 'Related Bill.46', 'Related Bill.47',
'Related Bill.48', 'Related Bill.49', 'Related Bill.50',
'Related Bill.51', 'Related Bill.52', 'Related Bill.53',
'Related Bill.54', 'Related Bill.55', 'Related Bill.56',
'Related Bill.57', 'Related Bill.58', 'Related Bill.59',
'Related Bill.60', 'Related Bill.61', 'Related Bill.62',
'Related Bill.63', 'Related Bill.64', 'Related Bill.65',
'Related Bill.66', 'Related Bill.67', 'Related Bill.68',
'Related Bill.69', 'Related Bill.70', 'Related Bill.71',
'Related Bill.72', 'Related Bill.73', 'Related Bill.74',
'Related Bill.75', 'Related Bill.76', 'Related Bill.77',
'Related Bill.78', 'Related Bill.79', 'Related Bill.80',
'Related Bill.81', 'Related Bill.82', 'Related Bill.83',
'Related Bill.84', 'Related Bill.85', 'Related Bill.86',
'Related Bill.87', 'Related Bill.88', 'Related Bill.89',
'Related Bill.90', 'Related Bill.91', 'Related Bill.92',
'Related Bill.93', 'Related Bill.94', 'Related Bill.95',
'Related Bill.96', 'Related Bill.97', 'Related Bill.98',
'Related Bill.99', 'Related Bill.100', 'Related Bill.101',
'Related Bill.102', 'Related Bill.103', 'Related Bill.104',
'Related Bill.105', 'Related Bill.106', 'Related Bill.107',
'Related Bill.108', 'Related Bill.109', 'Related Bill.110',
'Related Bill.111', 'Related Bill.112', 'Related Bill.113',
'Related Bill.114', 'Related Bill.115', 'Related Bill.116',
'Related Bill.117', 'Related Bill.118', 'Related Bill.119',
'Related Bill.120', 'Related Bill.121', 'Related Bill.122',
'Related Bill.123', 'Related Bill.124', 'Related Bill.125',
'Related Bill.126', 'Related Bill.127', 'Related Bill.128',
'Related Bill.129', 'Related Bill.130', 'Related Bill.131',
'Related Bill.132', 'Related Bill.133', 'Related Bill.134',
'Related Bill.135', 'Related Bill.136', 'Related Bill.137',
'Related Bill.138', 'Related Bill.139', 'Related Bill.140',
'Related Bill.141', 'Related Bill.142', 'Related Bill.143',
'Related Bill.144', 'Related Bill.145', 'Related Bill.146',
'Related Bill.147', 'Related Bill.148', 'Related Bill.149',
'Related Bill.150', 'Related Bill.151', 'Related Bill.152',
'Related Bill.153', 'Related Bill.154', 'Related Bill.155',
'Related Bill.156', 'Related Bill.157', 'Related Bill.158',
'Related Bill.159', 'Related Bill.160', 'Related Bill.161',
'Related Bill.162', 'Related Bill.163', 'Related Bill.164',
'Related Bill.165', 'Related Bill.166', 'Related Bill.167',
'Related Bill.168', 'Related Bill.169', 'Related Bill.170',
'Related Bill.171', 'Related Bill.172', 'Related Bill.173',
'Related Bill.174', 'Related Bill.175', 'Related Bill.176',
'Related Bill.177', 'Related Bill.178', 'Related Bill.179',
'Related Bill.180', 'Related Bill.181', 'Related Bill.182',
'Related Bill.183', 'Related Bill.184', 'Related Bill.185',
'Related Bill.186', 'Related Bill.187', 'Related Bill.188',
'Related Bill.189', 'Related Bill.190', 'Related Bill.191',
'Related Bill.192', 'Related Bill.193', 'Related Bill.194',
'Related Bill.195', 'Related Bill.196', 'Related Bill.197',
'Related Bill.198', 'Related Bill.199', 'Related Bill.200',
'Related Bill.201', 'Related Bill.202', 'Related Bill.203',
'Related Bill.204', 'Related Bill.205', 'Related Bill.206',
'Related Bill.207', 'Related Bill.208', 'Related Bill.209',
'Related Bill.210', 'Related Bill.211', 'Related Bill.212',
'Related Bill.213', 'Latest Summary', 'Amends Bill',
'Date Offered', 'Date Submitted', 'Date Proposed',
'Amendment Text (Latest)', 'Amends Amendment'], dtype=object)
for item in df.columns.values:
if 'Cosponsor' in item:
# reassigning DF in each step of the loop
# axis = 1 means columns; axis = 0 means rows
df = df.drop(item, axis=1)
# We are down to about 400. let's do the rest
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 148 entries, 0 to 280
Columns: 434 entries, Legislation Number to Amends Amendment
dtypes: float64(3), object(431)
memory usage: 503.0+ KB
# see there is no ellipses anymore on the column headers
df.head()
Legislation Number | URL | Congress | Title | Sponsor | Party of Sponsor | Date of Introduction | Committees | Latest Action | Latest Action Date | ... | Related Bill.211 | Related Bill.212 | Related Bill.213 | Latest Summary | Amends Bill | Date Offered | Date Submitted | Date Proposed | Amendment Text (Latest) | Amends Amendment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | H.R. 1112 | https://www.congress.gov/bill/118th-congress/h... | 118th Congress (2023-2024) | Ensuring Military Readiness Act of 2023 | Banks, Jim [Rep.-R-IN-3] | Republican | 2/21/23 | House - Armed Services | Referred to the House Committee on Armed Servi... | 2/21/23 | ... | NaN | NaN | NaN | <p><b>Ensuring Military Readiness Act of 2023... | NaN | NaN | NaN | NaN | NaN | NaN |
1 | S. 435 | https://www.congress.gov/bill/118th-congress/s... | 118th Congress (2023-2024) | Ensuring Military Readiness Act of 2023 | Rubio, Marco [Sen.-R-FL] | Republican | 2/15/23 | Senate - Armed Services | Read twice and referred to the Committee on Ar... | 2/15/23 | ... | NaN | NaN | NaN | <p><b>Ensuring Military Readiness Act of 2023... | NaN | NaN | NaN | NaN | NaN | NaN |
2 | H.Res. 886 | https://www.congress.gov/bill/118th-congress/h... | 118th Congress (2023-2024) | Supporting the goals and principles of Transge... | Jayapal, Pramila [Rep.-D-WA-7] | Democratic | 11/21/23 | House - Judiciary | Referred to the House Committee on the Judiciary. | 11/21/23 | ... | NaN | NaN | NaN | <p>This resolution expresses support for the ... | NaN | NaN | NaN | NaN | NaN | NaN |
3 | S.Res. 464 | https://www.congress.gov/bill/118th-congress/s... | 118th Congress (2023-2024) | A resolution supporting the goals and principl... | Hirono, Mazie K. [Sen.-D-HI] | Democratic | 11/15/23 | Senate - Judiciary | Star Print ordered on resolution. | 12/4/23 | ... | NaN | NaN | NaN | <p>This resolution expresses support for the ... | NaN | NaN | NaN | NaN | NaN | NaN |
4 | H.Res. 269 | https://www.congress.gov/bill/118th-congress/h... | 118th Congress (2023-2024) | Recognizing that it is the duty of the Federal... | Jayapal, Pramila [Rep.-D-WA-7] | Democratic | 3/30/23 | House - Judiciary, Education and the Workforce... | Sponsor introductory remarks on measure. (CR H... | 4/19/23 | ... | NaN | NaN | NaN | <p>This resolution expresses support for impl... | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 434 columns
How would we get rid of the rest of the columns that are mostly NaN?
for item in df.columns.values:
if 'Subject' in item:
df = df.drop(item, axis=1)
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 148 entries, 0 to 280
Columns: 232 entries, Legislation Number to Amends Amendment
dtypes: float64(3), object(229)
memory usage: 269.4+ KB
for item in df.columns.values:
if 'Related' in item:
df = df.drop(item, axis=1)
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 148 entries, 0 to 280
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Legislation Number 148 non-null object
1 URL 148 non-null object
2 Congress 148 non-null object
3 Title 146 non-null object
4 Sponsor 148 non-null object
5 Party of Sponsor 148 non-null object
6 Date of Introduction 115 non-null object
7 Committees 114 non-null object
8 Latest Action 146 non-null object
9 Latest Action Date 146 non-null object
10 Latest Summary 61 non-null object
11 Amends Bill 33 non-null object
12 Date Offered 31 non-null object
13 Date Submitted 2 non-null object
14 Date Proposed 0 non-null float64
15 Amendment Text (Latest) 33 non-null object
16 Amends Amendment 0 non-null float64
dtypes: float64(2), object(15)
memory usage: 20.8+ KB
And now we have a much cleaner csv file. We can save it to our workspace with the following line:
df.to_csv('congress_clean.csv')