2010 Census U.S. Gazetteer file layout changed.

* weather.py (correlate): The United States Census Bureau altered the format of its 2010 Gazetteer files on August 22, 2012, adding and reordering a few fields. The previous version of the parser assumed a fixed field order and stopped working with the updated data files; the field order is now inferred from the column headings in the first line of each file instead.
2012-09-10 03:28:49 +00:00
parent 13eb635aca
commit e7256def4f
1 changed file with 44 additions and 23 deletions
--- a/weather.py
+++ b/weather.py
@@ -1328,15 +1328,21 @@ def correlate():
    sys.stdout.flush()
    count = 0
    gcounties = zipfile.ZipFile(gcounties_an).open(gcounties_fn, "rU")
+    columns = gcounties.readline().decode("latin1").strip().split("\t")
    for line in gcounties:
        fields = line.decode("latin1").strip().split("\t")
-        if len(fields) == 10 and fields[0] != "STUSPS":
-            fips = "fips%s" % fields[1]
-            description = "%s, %s" % ( fields[3], fields[0] )
-            centroid = gecos( ",".join( fields[8:10] ) )
+        f_geoid = fields[ columns.index("GEOID") ].strip()
+        f_name = fields[ columns.index("NAME") ].strip()
+        f_usps = fields[ columns.index("USPS") ].strip()
+        f_intptlat = fields[ columns.index("INTPTLAT") ].strip()
+        f_intptlong = fields[ columns.index("INTPTLONG") ].strip()
+        if f_geoid and f_name and f_usps and f_intptlat and f_intptlong:
+            fips = "fips%s" % f_geoid
            if fips not in places: places[fips] = {}
-            places[fips]["centroid"] = centroid
-            places[fips]["description"] = description
+            places[fips]["centroid"] = gecos(
+                "%s,%s" % (f_intptlat, f_intptlong)
+            )
+            places[fips]["description"] = "%s, %s" % (f_name, f_usps)
            count += 1
    gcounties.close()
    print("done (%s lines)." % count)
@@ -1345,15 +1351,21 @@ def correlate():
    sys.stdout.flush()
    count = 0
    gcousubs = zipfile.ZipFile(gcousubs_an).open(gcousubs_fn, "rU")
+    columns = gcousubs.readline().decode("latin1").strip().split("\t")
    for line in gcousubs:
        fields = line.decode("latin1").strip().split("\t")
-        if len(fields) == 10 and fields[0] != "STUSPS":
-            fips = "fips%s" % fields[1]
-            description = "%s, %s" % ( fields[3], fields[0] )
-            centroid = gecos( ",".join( fields[8:10] ) )
+        f_geoid = fields[ columns.index("GEOID") ].strip()
+        f_name = fields[ columns.index("NAME") ].strip()
+        f_usps = fields[ columns.index("USPS") ].strip()
+        f_intptlat = fields[ columns.index("INTPTLAT") ].strip()
+        f_intptlong = fields[ columns.index("INTPTLONG") ].strip()
+        if f_geoid and f_name and f_usps and f_intptlat and f_intptlong:
+            fips = "fips%s" % f_geoid
            if fips not in places: places[fips] = {}
-            places[fips]["centroid"] = centroid
-            places[fips]["description"] = description
+            places[fips]["centroid"] = gecos(
+                "%s,%s" % (f_intptlat, f_intptlong)
+            )
+            places[fips]["description"] = "%s, %s" % (f_name, f_usps)
            count += 1
    gcousubs.close()
    print("done (%s lines)." % count)
@@ -1362,15 +1374,21 @@ def correlate():
    sys.stdout.flush()
    count = 0
    gplaces = zipfile.ZipFile(gplaces_an).open(gplaces_fn, "rU")
+    columns = gplaces.readline().decode("latin1").strip().split("\t")
    for line in gplaces:
        fields = line.decode("latin1").strip().split("\t")
-        if len(fields) == 10 and fields[0] != "STUSPS":
-            fips = "fips%s" % fields[1]
-            description = "%s, %s" % ( fields[3], fields[0] )
-            centroid = gecos( ",".join( fields[8:10] ) )
+        f_geoid = fields[ columns.index("GEOID") ].strip()
+        f_name = fields[ columns.index("NAME") ].strip()
+        f_usps = fields[ columns.index("USPS") ].strip()
+        f_intptlat = fields[ columns.index("INTPTLAT") ].strip()
+        f_intptlong = fields[ columns.index("INTPTLONG") ].strip()
+        if f_geoid and f_name and f_usps and f_intptlat and f_intptlong:
+            fips = "fips%s" % f_geoid
            if fips not in places: places[fips] = {}
-            places[fips]["centroid"] = centroid
-            places[fips]["description"] = description
+            places[fips]["centroid"] = gecos(
+                "%s,%s" % (f_intptlat, f_intptlong)
+            )
+            places[fips]["description"] = "%s, %s" % (f_name, f_usps)
            count += 1
    gplaces.close()
    print("done (%s lines)." % count)
@@ -1562,13 +1580,16 @@ def correlate():
    sys.stdout.flush()
    count = 0
    gzcta = zipfile.ZipFile(gzcta_an).open(gzcta_fn, "rU")
+    columns = gzcta.readline().decode("latin1").strip().split("\t")
    for line in gzcta:
        fields = line.decode("latin1").strip().split("\t")
-        if len(fields) == 7 and fields[0] != "GEOID":
-            zcta = fields[0]
-            if zcta not in zctas: zctas[zcta] = {}
-            zctas[zcta]["centroid"] = gecos(
-                ",".join( ( fields[6], fields[5] ) )
+        f_geoid = fields[ columns.index("GEOID") ].strip()
+        f_intptlat = fields[ columns.index("INTPTLAT") ].strip()
+        f_intptlong = fields[ columns.index("INTPTLONG") ].strip()
+        if f_geoid and f_intptlat and f_intptlong:
+            if f_geoid not in zctas: zctas[f_geoid] = {}
+            zctas[f_geoid]["centroid"] = gecos(
+                "%s,%s" % (f_intptlat, f_intptlong)
            )
            count += 1
    gzcta.close()