diff --git a/datasets/irs_990/pipelines/irs_990_2014/irs_990_2014_dag.py b/datasets/irs_990/pipelines/irs_990_2014/irs_990_2014_dag.py index 480bae74d..ec90cf0a1 100644 --- a/datasets/irs_990/pipelines/irs_990_2014/irs_990_2014_dag.py +++ b/datasets/irs_990/pipelines/irs_990_2014/irs_990_2014_dag.py @@ -14,7 +14,7 @@ from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.operators import kubernetes_engine from airflow.providers.google.cloud.transfers import gcs_to_bigquery default_args = { @@ -32,14 +32,33 @@ catchup=False, default_view="graph", ) as dag: + create_cluster = kubernetes_engine.GKECreateClusterOperator( + task_id="create_cluster", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + body={ + "name": "pdp-irs-990-2014", + "initial_node_count": 1, + "network": "{{ var.value.vpc_network }}", + "node_config": { + "machine_type": "e2-standard-16", + "oauth_scopes": [ + "https://www.googleapis.com/auth/devstorage.read_write", + "https://www.googleapis.com/auth/cloud-platform", + ], + }, + }, + ) # Run CSV transform within kubernetes pod - irs_990_transform_csv = kubernetes_pod.KubernetesPodOperator( + irs_990_transform_csv = kubernetes_engine.GKEStartPodOperator( task_id="irs_990_transform_csv", startup_timeout_seconds=600, name="irs_990_2014", - service_account_name="datasets", - namespace="composer", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="pdp-irs-990-2014", image_pull_policy="Always", image="{{ var.json.irs_990.container_registry.run_csv_transform_kub }}", env_vars={ @@ -52,7 +71,17 @@ "CSV_HEADERS": '["ein","tax_pd","subseccd","s501c3or4947a1cd","schdbind","politicalactvtscd","lbbyingactvtscd","subjto6033cd","dnradvisedfundscd","prptyintrcvdcd","maintwrkofartcd","crcounselingqstncd","hldassetsintermpermcd","rptlndbldgeqptcd","rptinvstothsecd","rptinvstprgrelcd","rptothasstcd","rptothliabcd","sepcnsldtfinstmtcd","sepindaudfinstmtcd","inclinfinstmtcd","operateschools170cd","frgnofficecd","frgnrevexpnscd","frgngrntscd","frgnaggragrntscd","rptprofndrsngfeescd","rptincfnndrsngcd","rptincgamingcd","operatehosptlcd","hospaudfinstmtcd","rptgrntstogovtcd","rptgrntstoindvcd","rptyestocompnstncd","txexmptbndcd","invstproceedscd","maintescrwaccntcd","actonbehalfcd","engageexcessbnftcd","awarexcessbnftcd","loantofficercd","grantoofficercd","dirbusnreltdcd","fmlybusnreltdcd","servasofficercd","recvnoncashcd","recvartcd","ceaseoperationscd","sellorexchcd","ownsepentcd","reltdorgcd","intincntrlcd","orgtrnsfrcd","conduct5percentcd","compltschocd","f1096cnt","fw2gcnt","wthldngrulescd","noemplyeesw3cnt","filerqrdrtnscd","unrelbusinccd","filedf990tcd","frgnacctcd","prohibtdtxshltrcd","prtynotifyorgcd","filedf8886tcd","solicitcntrbcd","exprstmntcd","providegoodscd","notfydnrvalcd","filedf8282cd","f8282cnt","fndsrcvdcd","premiumspaidcd","filedf8899cd","filedf1098ccd","excbushldngscd","s4966distribcd","distribtodonorcd","initiationfees","grsrcptspublicuse","grsincmembers","grsincother","filedlieuf1041cd","txexmptint","qualhlthplncd","qualhlthreqmntn","qualhlthonhnd","rcvdpdtngcd","filedf720cd","totreprtabled","totcomprelatede","totestcompf","noindiv100kcnt","nocontractor100kcnt","totcntrbgfts","prgmservcode2acd","totrev2acola","prgmservcode2bcd","totrev2bcola","prgmservcode2ccd","totrev2ccola","prgmservcode2dcd","totrev2dcola","prgmservcode2ecd","totrev2ecola","totrev2fcola","totprgmrevnue","invstmntinc","txexmptbndsproceeds","royaltsinc","grsrntsreal","grsrntsprsnl","rntlexpnsreal","rntlexpnsprsnl","rntlincreal","rntlincprsnl","netrntlinc","grsalesecur","grsalesothr","cstbasisecur","cstbasisothr","gnlsecur","gnlsothr","netgnls","grsincfndrsng","lessdirfndrsng","netincfndrsng","grsincgaming","lessdirgaming","netincgaming","grsalesinvent","lesscstofgoods","netincsales","miscrev11acd","miscrevtota","miscrev11bcd","miscrevtot11b","miscrev11ccd","miscrevtot11c","miscrevtot11d","miscrevtot11e","totrevenue","grntstogovt","grnsttoindiv","grntstofrgngovt","benifitsmembrs","compnsatncurrofcr","compnsatnandothr","othrsalwages","pensionplancontrb","othremplyeebenef","payrolltx","feesforsrvcmgmt","legalfees","accntingfees","feesforsrvclobby","profndraising","feesforsrvcinvstmgmt","feesforsrvcothr","advrtpromo","officexpns","infotech","royaltsexpns","occupancy","travel","travelofpublicoffcl","converconventmtng","interestamt","pymtoaffiliates","deprcatndepletn","insurance","othrexpnsa","othrexpnsb","othrexpnsc","othrexpnsd","othrexpnse","othrexpnsf","totfuncexpns","nonintcashend","svngstempinvend","pldgegrntrcvblend","accntsrcvblend","currfrmrcvblend","rcvbldisqualend","notesloansrcvblend","invntriesalesend","prepaidexpnsend","lndbldgsequipend","invstmntsend","invstmntsothrend","invstmntsprgmend","intangibleassetsend","othrassetsend","totassetsend","accntspayableend","grntspayableend","deferedrevnuend","txexmptbndsend","escrwaccntliabend","paybletoffcrsend","secrdmrtgsend","unsecurednotesend","othrliabend","totliabend","unrstrctnetasstsend","temprstrctnetasstsend","permrstrctnetasstsend","capitalstktrstend","paidinsurplusend","retainedearnend","totnetassetend","totnetliabastend","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', "RENAME_MAPPINGS": '{"elf": "elf","EIN": "ein","tax_prd": "tax_pd","subseccd": "subseccd","s50Yc3or4947aYcd": "s501c3or4947a1cd","schdbind": "schdbind","politicalactvtscd": "politicalactvtscd","lbbyingactvtscd": "lbbyingactvtscd","subjto6033cd": "subjto6033cd","dnradvisedfundscd": "dnradvisedfundscd","prptyintrcvdcd": "prptyintrcvdcd","maintwrkofartcd": "maintwrkofartcd","crcounselingqstncd": "crcounselingqstncd","hldassetsintermpermcd": "hldassetsintermpermcd","rptlndbldgeqptcd": "rptlndbldgeqptcd","rptinvstothsecd": "rptinvstothsecd","rptinvstprgrelcd": "rptinvstprgrelcd","rptothasstcd": "rptothasstcd","rptothliabcd": "rptothliabcd","sepcnsldtfinstmtcd": "sepcnsldtfinstmtcd","sepindaudfinstmtcd": "sepindaudfinstmtcd","inclinfinstmtcd": "inclinfinstmtcd","operateschoolsY70cd": "operateschools170cd","frgnofficecd": "frgnofficecd","frgnrevexpnscd": "frgnrevexpnscd","frgngrntscd": "frgngrntscd","frgnaggragrntscd": "frgnaggragrntscd","rptprofndrsngfeescd": "rptprofndrsngfeescd","rptincfnndrsngcd": "rptincfnndrsngcd","rptincgamingcd": "rptincgamingcd","operatehosptlcd": "operatehosptlcd","hospaudfinstmtcd": "hospaudfinstmtcd","rptgrntstogovtcd": "rptgrntstogovtcd","rptgrntstoindvcd": "rptgrntstoindvcd","rptyestocompnstncd": "rptyestocompnstncd","txexmptbndcd": "txexmptbndcd","invstproceedscd": "invstproceedscd","maintescrwaccntcd": "maintescrwaccntcd","actonbehalfcd": "actonbehalfcd","engageexcessbnftcd": "engageexcessbnftcd","awarexcessbnftcd": "awarexcessbnftcd","loantofficercd": "loantofficercd","grantoofficercd": "grantoofficercd","dirbusnreltdcd": "dirbusnreltdcd","fmlybusnreltdcd": "fmlybusnreltdcd","servasofficercd": "servasofficercd","recvnoncashcd": "recvnoncashcd","recvartcd": "recvartcd","ceaseoperationscd": "ceaseoperationscd","sellorexchcd": "sellorexchcd","ownsepentcd": "ownsepentcd","reltdorgcd": "reltdorgcd","intincntrlcd": "intincntrlcd","orgtrnsfrcd": "orgtrnsfrcd","conduct5percentcd": "conduct5percentcd","compltschocd": "compltschocd","f1096cnt": "f1096cnt","fw2gcnt": "fw2gcnt","wthldngrulescd": "wthldngrulescd","noemplyeesw3cnt": "noemplyeesw3cnt","filerqrdrtnscd": "filerqrdrtnscd","unrelbusinccd": "unrelbusinccd","filedf990tcd": "filedf990tcd","frgnacctcd": "frgnacctcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","prtynotifyorgcd": "prtynotifyorgcd","filedf8886tcd": "filedf8886tcd","solicitcntrbcd": "solicitcntrbcd","exprstmntcd": "exprstmntcd","providegoodscd": "providegoodscd","notfydnrvalcd": "notfydnrvalcd","filedf8N8Ncd": "filedf8282cd","f8282cnt": "f8282cnt","fndsrcvdcd": "fndsrcvdcd","premiumspaidcd": "premiumspaidcd","filedf8899cd": "filedf8899cd","filedfY098ccd": "filedf1098ccd","excbushldngscd": "excbushldngscd","s4966distribcd": "s4966distribcd","distribtodonorcd": "distribtodonorcd","initiationfees": "initiationfees","grsrcptspublicuse": "grsrcptspublicuse","grsincmembers": "grsincmembers","grsincother": "grsincother","filedlieufY04Ycd": "filedlieuf1041cd","txexmptint": "txexmptint","qualhlthplncd": "qualhlthplncd","qualhlthreqmntn": "qualhlthreqmntn","qualhlthonhnd": "qualhlthonhnd","rcvdpdtngcd": "rcvdpdtngcd","filedf7N0cd": "filedf720cd","totreprtabled": "totreprtabled","totcomprelatede": "totcomprelatede","totestcompf": "totestcompf","noindiv100kcnt": "noindiv100kcnt","nocontractor100kcnt": "nocontractor100kcnt","totcntrbgfts": "totcntrbgfts","prgmservcode2acd": "prgmservcode2acd","totrev2acola": "totrev2acola","prgmservcode2bcd": "prgmservcode2bcd","totrev2bcola": "totrev2bcola","prgmservcode2ccd": "prgmservcode2ccd","totrev2ccola": "totrev2ccola","prgmservcode2dcd": "prgmservcode2dcd","totrev2dcola": "totrev2dcola","prgmservcode2ecd": "prgmservcode2ecd","totrev2ecola": "totrev2ecola","totrev2fcola": "totrev2fcola","totprgmrevnue": "totprgmrevnue","invstmntinc": "invstmntinc","txexmptbndsproceeds": "txexmptbndsproceeds","royaltsinc": "royaltsinc","grsrntsreal": "grsrntsreal","grsrntsprsnl": "grsrntsprsnl","rntlexpnsreal": "rntlexpnsreal","rntlexpnsprsnl": "rntlexpnsprsnl","rntlincreal": "rntlincreal","rntlincprsnl": "rntlincprsnl","netrntlinc": "netrntlinc","grsalesecur": "grsalesecur","grsalesothr": "grsalesothr","cstbasisecur": "cstbasisecur","cstbasisothr": "cstbasisothr","gnlsecur": "gnlsecur","gnlsothr": "gnlsothr","netgnls": "netgnls","grsincfndrsng": "grsincfndrsng","lessdirfndrsng": "lessdirfndrsng","netincfndrsng": "netincfndrsng","grsincgaming": "grsincgaming","lessdirgaming": "lessdirgaming","netincgaming": "netincgaming","grsalesinvent": "grsalesinvent","lesscstofgoods": "lesscstofgoods","netincsales": "netincsales","miscrev11acd": "miscrev11acd","miscrevtota": "miscrevtota","miscrev11bcd": "miscrev11bcd","miscrevtot11b": "miscrevtot11b","miscrev11ccd": "miscrev11ccd","miscrevtot11c": "miscrevtot11c","miscrevtot11d": "miscrevtot11d","miscrevtot11e": "miscrevtot11e","totrevenue": "totrevenue","grntstogovt": "grntstogovt","grnsttoindiv": "grnsttoindiv","grntstofrgngovt": "grntstofrgngovt","benifitsmembrs": "benifitsmembrs","compnsatncurrofcr": "compnsatncurrofcr","compnsatnandothr": "compnsatnandothr","othrsalwages": "othrsalwages","pensionplancontrb": "pensionplancontrb","othremplyeebenef": "othremplyeebenef","payrolltx": "payrolltx","feesforsrvcmgmt": "feesforsrvcmgmt","legalfees": "legalfees","accntingfees": "accntingfees","feesforsrvclobby": "feesforsrvclobby","profndraising": "profndraising","feesforsrvcinvstmgmt": "feesforsrvcinvstmgmt","feesforsrvcothr": "feesforsrvcothr","advrtpromo": "advrtpromo","officexpns": "officexpns","infotech": "infotech","royaltsexpns": "royaltsexpns","occupancy": "occupancy","travel": "travel","travelofpublicoffcl": "travelofpublicoffcl","converconventmtng": "converconventmtng","interestamt": "interestamt","pymtoaffiliates": "pymtoaffiliates","deprcatndepletn": "deprcatndepletn","insurance": "insurance","othrexpnsa": "othrexpnsa","othrexpnsb": "othrexpnsb","othrexpnsc": "othrexpnsc","othrexpnsd": "othrexpnsd","othrexpnse": "othrexpnse","othrexpnsf": "othrexpnsf","totfuncexpns": "totfuncexpns","nonintcashend": "nonintcashend","svngstempinvend": "svngstempinvend","pldgegrntrcvblend": "pldgegrntrcvblend","accntsrcvblend": "accntsrcvblend","currfrmrcvblend": "currfrmrcvblend","rcvbldisqualend": "rcvbldisqualend","notesloansrcvblend": "notesloansrcvblend","invntriesalesend": "invntriesalesend","prepaidexpnsend": "prepaidexpnsend","lndbldgsequipend": "lndbldgsequipend","invstmntsend": "invstmntsend","invstmntsothrend": "invstmntsothrend","invstmntsprgmend": "invstmntsprgmend","intangibleassetsend": "intangibleassetsend","othrassetsend": "othrassetsend","totassetsend": "totassetsend","accntspayableend": "accntspayableend","grntspayableend": "grntspayableend","deferedrevnuend": "deferedrevnuend","txexmptbndsend": "txexmptbndsend","escrwaccntliabend": "escrwaccntliabend","paybletoffcrsend": "paybletoffcrsend","secrdmrtgsend": "secrdmrtgsend","unsecurednotesend": "unsecurednotesend","othrliabend": "othrliabend","totliabend": "totliabend","unrstrctnetasstsend": "unrstrctnetasstsend","temprstrctnetasstsend": "temprstrctnetasstsend","permrstrctnetasstsend": "permrstrctnetasstsend","capitalstktrstend": "capitalstktrstend","paidinsurplusend": "paidinsurplusend","retainedearnend": "retainedearnend","totnetassetend": "totnetassetend","totnetliabastend": "totnetliabastend","nonpfrea": "nonpfrea","totnooforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntsrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","exceeds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunreltd170": "netincunreltd170","othrinc170": "othrinc170","totsupp170": "totsupp170","grsrcptsrelated170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmissn509": "grsrcptsadmissn509","grsrcptsactivities509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","exceeds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunrelatd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509"}', }, - resources={"request_memory": "4G", "request_cpu": "1"}, + container_resources={ + "memory": {"request": "16Gi"}, + "cpu": {"request": "1"}, + "ephemeral-storage": {"request": "10Gi"}, + }, + ) + delete_cluster = kubernetes_engine.GKEDeleteClusterOperator( + task_id="delete_cluster", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + name="pdp-irs-990-2014", ) # Task to load CSV data to a BigQuery table @@ -313,4 +342,4 @@ ], ) - irs_990_transform_csv >> load_irs_990_to_bq + create_cluster >> irs_990_transform_csv >> delete_cluster >> load_irs_990_to_bq diff --git a/datasets/irs_990/pipelines/irs_990_2014/pipeline.yaml b/datasets/irs_990/pipelines/irs_990_2014/pipeline.yaml index 441bf97c0..fa9f23e48 100644 --- a/datasets/irs_990/pipelines/irs_990_2014/pipeline.yaml +++ b/datasets/irs_990/pipelines/irs_990_2014/pipeline.yaml @@ -38,30 +38,33 @@ dag: default_view: graph tasks: - - operator: "KubernetesPodOperator" + - operator: "GKECreateClusterOperator" + args: + task_id: "create_cluster" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + body: + name: pdp-irs-990-2014 + initial_node_count: 1 + network: "{{ var.value.vpc_network }}" + node_config: + machine_type: e2-standard-16 + oauth_scopes: + - https://www.googleapis.com/auth/devstorage.read_write + - https://www.googleapis.com/auth/cloud-platform - # Task description + - operator: "GKEStartPodOperator" description: "Run CSV transform within kubernetes pod" - args: - task_id: "irs_990_transform_csv" - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id name: "irs_990_2014" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - service_account_name: "datasets" - namespace: "composer" - + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: pdp-irs-990-2014 image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. image: "{{ var.json.irs_990.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. env_vars: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/14eofinextract990.zip" SOURCE_FILE: "files/data.zip" @@ -73,37 +76,31 @@ dag: ["ein","tax_pd","subseccd","s501c3or4947a1cd","schdbind","politicalactvtscd","lbbyingactvtscd","subjto6033cd","dnradvisedfundscd","prptyintrcvdcd","maintwrkofartcd","crcounselingqstncd","hldassetsintermpermcd","rptlndbldgeqptcd","rptinvstothsecd","rptinvstprgrelcd","rptothasstcd","rptothliabcd","sepcnsldtfinstmtcd","sepindaudfinstmtcd","inclinfinstmtcd","operateschools170cd","frgnofficecd","frgnrevexpnscd","frgngrntscd","frgnaggragrntscd","rptprofndrsngfeescd","rptincfnndrsngcd","rptincgamingcd","operatehosptlcd","hospaudfinstmtcd","rptgrntstogovtcd","rptgrntstoindvcd","rptyestocompnstncd","txexmptbndcd","invstproceedscd","maintescrwaccntcd","actonbehalfcd","engageexcessbnftcd","awarexcessbnftcd","loantofficercd","grantoofficercd","dirbusnreltdcd","fmlybusnreltdcd","servasofficercd","recvnoncashcd","recvartcd","ceaseoperationscd","sellorexchcd","ownsepentcd","reltdorgcd","intincntrlcd","orgtrnsfrcd","conduct5percentcd","compltschocd","f1096cnt","fw2gcnt","wthldngrulescd","noemplyeesw3cnt","filerqrdrtnscd","unrelbusinccd","filedf990tcd","frgnacctcd","prohibtdtxshltrcd","prtynotifyorgcd","filedf8886tcd","solicitcntrbcd","exprstmntcd","providegoodscd","notfydnrvalcd","filedf8282cd","f8282cnt","fndsrcvdcd","premiumspaidcd","filedf8899cd","filedf1098ccd","excbushldngscd","s4966distribcd","distribtodonorcd","initiationfees","grsrcptspublicuse","grsincmembers","grsincother","filedlieuf1041cd","txexmptint","qualhlthplncd","qualhlthreqmntn","qualhlthonhnd","rcvdpdtngcd","filedf720cd","totreprtabled","totcomprelatede","totestcompf","noindiv100kcnt","nocontractor100kcnt","totcntrbgfts","prgmservcode2acd","totrev2acola","prgmservcode2bcd","totrev2bcola","prgmservcode2ccd","totrev2ccola","prgmservcode2dcd","totrev2dcola","prgmservcode2ecd","totrev2ecola","totrev2fcola","totprgmrevnue","invstmntinc","txexmptbndsproceeds","royaltsinc","grsrntsreal","grsrntsprsnl","rntlexpnsreal","rntlexpnsprsnl","rntlincreal","rntlincprsnl","netrntlinc","grsalesecur","grsalesothr","cstbasisecur","cstbasisothr","gnlsecur","gnlsothr","netgnls","grsincfndrsng","lessdirfndrsng","netincfndrsng","grsincgaming","lessdirgaming","netincgaming","grsalesinvent","lesscstofgoods","netincsales","miscrev11acd","miscrevtota","miscrev11bcd","miscrevtot11b","miscrev11ccd","miscrevtot11c","miscrevtot11d","miscrevtot11e","totrevenue","grntstogovt","grnsttoindiv","grntstofrgngovt","benifitsmembrs","compnsatncurrofcr","compnsatnandothr","othrsalwages","pensionplancontrb","othremplyeebenef","payrolltx","feesforsrvcmgmt","legalfees","accntingfees","feesforsrvclobby","profndraising","feesforsrvcinvstmgmt","feesforsrvcothr","advrtpromo","officexpns","infotech","royaltsexpns","occupancy","travel","travelofpublicoffcl","converconventmtng","interestamt","pymtoaffiliates","deprcatndepletn","insurance","othrexpnsa","othrexpnsb","othrexpnsc","othrexpnsd","othrexpnse","othrexpnsf","totfuncexpns","nonintcashend","svngstempinvend","pldgegrntrcvblend","accntsrcvblend","currfrmrcvblend","rcvbldisqualend","notesloansrcvblend","invntriesalesend","prepaidexpnsend","lndbldgsequipend","invstmntsend","invstmntsothrend","invstmntsprgmend","intangibleassetsend","othrassetsend","totassetsend","accntspayableend","grntspayableend","deferedrevnuend","txexmptbndsend","escrwaccntliabend","paybletoffcrsend","secrdmrtgsend","unsecurednotesend","othrliabend","totliabend","unrstrctnetasstsend","temprstrctnetasstsend","permrstrctnetasstsend","capitalstktrstend","paidinsurplusend","retainedearnend","totnetassetend","totnetliabastend","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"] RENAME_MAPPINGS: >- {"elf": "elf","EIN": "ein","tax_prd": "tax_pd","subseccd": "subseccd","s50Yc3or4947aYcd": "s501c3or4947a1cd","schdbind": "schdbind","politicalactvtscd": "politicalactvtscd","lbbyingactvtscd": "lbbyingactvtscd","subjto6033cd": "subjto6033cd","dnradvisedfundscd": "dnradvisedfundscd","prptyintrcvdcd": "prptyintrcvdcd","maintwrkofartcd": "maintwrkofartcd","crcounselingqstncd": "crcounselingqstncd","hldassetsintermpermcd": "hldassetsintermpermcd","rptlndbldgeqptcd": "rptlndbldgeqptcd","rptinvstothsecd": "rptinvstothsecd","rptinvstprgrelcd": "rptinvstprgrelcd","rptothasstcd": "rptothasstcd","rptothliabcd": "rptothliabcd","sepcnsldtfinstmtcd": "sepcnsldtfinstmtcd","sepindaudfinstmtcd": "sepindaudfinstmtcd","inclinfinstmtcd": "inclinfinstmtcd","operateschoolsY70cd": "operateschools170cd","frgnofficecd": "frgnofficecd","frgnrevexpnscd": "frgnrevexpnscd","frgngrntscd": "frgngrntscd","frgnaggragrntscd": "frgnaggragrntscd","rptprofndrsngfeescd": "rptprofndrsngfeescd","rptincfnndrsngcd": "rptincfnndrsngcd","rptincgamingcd": "rptincgamingcd","operatehosptlcd": "operatehosptlcd","hospaudfinstmtcd": "hospaudfinstmtcd","rptgrntstogovtcd": "rptgrntstogovtcd","rptgrntstoindvcd": "rptgrntstoindvcd","rptyestocompnstncd": "rptyestocompnstncd","txexmptbndcd": "txexmptbndcd","invstproceedscd": "invstproceedscd","maintescrwaccntcd": "maintescrwaccntcd","actonbehalfcd": "actonbehalfcd","engageexcessbnftcd": "engageexcessbnftcd","awarexcessbnftcd": "awarexcessbnftcd","loantofficercd": "loantofficercd","grantoofficercd": "grantoofficercd","dirbusnreltdcd": "dirbusnreltdcd","fmlybusnreltdcd": "fmlybusnreltdcd","servasofficercd": "servasofficercd","recvnoncashcd": "recvnoncashcd","recvartcd": "recvartcd","ceaseoperationscd": "ceaseoperationscd","sellorexchcd": "sellorexchcd","ownsepentcd": "ownsepentcd","reltdorgcd": "reltdorgcd","intincntrlcd": "intincntrlcd","orgtrnsfrcd": "orgtrnsfrcd","conduct5percentcd": "conduct5percentcd","compltschocd": "compltschocd","f1096cnt": "f1096cnt","fw2gcnt": "fw2gcnt","wthldngrulescd": "wthldngrulescd","noemplyeesw3cnt": "noemplyeesw3cnt","filerqrdrtnscd": "filerqrdrtnscd","unrelbusinccd": "unrelbusinccd","filedf990tcd": "filedf990tcd","frgnacctcd": "frgnacctcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","prtynotifyorgcd": "prtynotifyorgcd","filedf8886tcd": "filedf8886tcd","solicitcntrbcd": "solicitcntrbcd","exprstmntcd": "exprstmntcd","providegoodscd": "providegoodscd","notfydnrvalcd": "notfydnrvalcd","filedf8N8Ncd": "filedf8282cd","f8282cnt": "f8282cnt","fndsrcvdcd": "fndsrcvdcd","premiumspaidcd": "premiumspaidcd","filedf8899cd": "filedf8899cd","filedfY098ccd": "filedf1098ccd","excbushldngscd": "excbushldngscd","s4966distribcd": "s4966distribcd","distribtodonorcd": "distribtodonorcd","initiationfees": "initiationfees","grsrcptspublicuse": "grsrcptspublicuse","grsincmembers": "grsincmembers","grsincother": "grsincother","filedlieufY04Ycd": "filedlieuf1041cd","txexmptint": "txexmptint","qualhlthplncd": "qualhlthplncd","qualhlthreqmntn": "qualhlthreqmntn","qualhlthonhnd": "qualhlthonhnd","rcvdpdtngcd": "rcvdpdtngcd","filedf7N0cd": "filedf720cd","totreprtabled": "totreprtabled","totcomprelatede": "totcomprelatede","totestcompf": "totestcompf","noindiv100kcnt": "noindiv100kcnt","nocontractor100kcnt": "nocontractor100kcnt","totcntrbgfts": "totcntrbgfts","prgmservcode2acd": "prgmservcode2acd","totrev2acola": "totrev2acola","prgmservcode2bcd": "prgmservcode2bcd","totrev2bcola": "totrev2bcola","prgmservcode2ccd": "prgmservcode2ccd","totrev2ccola": "totrev2ccola","prgmservcode2dcd": "prgmservcode2dcd","totrev2dcola": "totrev2dcola","prgmservcode2ecd": "prgmservcode2ecd","totrev2ecola": "totrev2ecola","totrev2fcola": "totrev2fcola","totprgmrevnue": "totprgmrevnue","invstmntinc": "invstmntinc","txexmptbndsproceeds": "txexmptbndsproceeds","royaltsinc": "royaltsinc","grsrntsreal": "grsrntsreal","grsrntsprsnl": "grsrntsprsnl","rntlexpnsreal": "rntlexpnsreal","rntlexpnsprsnl": "rntlexpnsprsnl","rntlincreal": "rntlincreal","rntlincprsnl": "rntlincprsnl","netrntlinc": "netrntlinc","grsalesecur": "grsalesecur","grsalesothr": "grsalesothr","cstbasisecur": "cstbasisecur","cstbasisothr": "cstbasisothr","gnlsecur": "gnlsecur","gnlsothr": "gnlsothr","netgnls": "netgnls","grsincfndrsng": "grsincfndrsng","lessdirfndrsng": "lessdirfndrsng","netincfndrsng": "netincfndrsng","grsincgaming": "grsincgaming","lessdirgaming": "lessdirgaming","netincgaming": "netincgaming","grsalesinvent": "grsalesinvent","lesscstofgoods": "lesscstofgoods","netincsales": "netincsales","miscrev11acd": "miscrev11acd","miscrevtota": "miscrevtota","miscrev11bcd": "miscrev11bcd","miscrevtot11b": "miscrevtot11b","miscrev11ccd": "miscrev11ccd","miscrevtot11c": "miscrevtot11c","miscrevtot11d": "miscrevtot11d","miscrevtot11e": "miscrevtot11e","totrevenue": "totrevenue","grntstogovt": "grntstogovt","grnsttoindiv": "grnsttoindiv","grntstofrgngovt": "grntstofrgngovt","benifitsmembrs": "benifitsmembrs","compnsatncurrofcr": "compnsatncurrofcr","compnsatnandothr": "compnsatnandothr","othrsalwages": "othrsalwages","pensionplancontrb": "pensionplancontrb","othremplyeebenef": "othremplyeebenef","payrolltx": "payrolltx","feesforsrvcmgmt": "feesforsrvcmgmt","legalfees": "legalfees","accntingfees": "accntingfees","feesforsrvclobby": "feesforsrvclobby","profndraising": "profndraising","feesforsrvcinvstmgmt": "feesforsrvcinvstmgmt","feesforsrvcothr": "feesforsrvcothr","advrtpromo": "advrtpromo","officexpns": "officexpns","infotech": "infotech","royaltsexpns": "royaltsexpns","occupancy": "occupancy","travel": "travel","travelofpublicoffcl": "travelofpublicoffcl","converconventmtng": "converconventmtng","interestamt": "interestamt","pymtoaffiliates": "pymtoaffiliates","deprcatndepletn": "deprcatndepletn","insurance": "insurance","othrexpnsa": "othrexpnsa","othrexpnsb": "othrexpnsb","othrexpnsc": "othrexpnsc","othrexpnsd": "othrexpnsd","othrexpnse": "othrexpnse","othrexpnsf": "othrexpnsf","totfuncexpns": "totfuncexpns","nonintcashend": "nonintcashend","svngstempinvend": "svngstempinvend","pldgegrntrcvblend": "pldgegrntrcvblend","accntsrcvblend": "accntsrcvblend","currfrmrcvblend": "currfrmrcvblend","rcvbldisqualend": "rcvbldisqualend","notesloansrcvblend": "notesloansrcvblend","invntriesalesend": "invntriesalesend","prepaidexpnsend": "prepaidexpnsend","lndbldgsequipend": "lndbldgsequipend","invstmntsend": "invstmntsend","invstmntsothrend": "invstmntsothrend","invstmntsprgmend": "invstmntsprgmend","intangibleassetsend": "intangibleassetsend","othrassetsend": "othrassetsend","totassetsend": "totassetsend","accntspayableend": "accntspayableend","grntspayableend": "grntspayableend","deferedrevnuend": "deferedrevnuend","txexmptbndsend": "txexmptbndsend","escrwaccntliabend": "escrwaccntliabend","paybletoffcrsend": "paybletoffcrsend","secrdmrtgsend": "secrdmrtgsend","unsecurednotesend": "unsecurednotesend","othrliabend": "othrliabend","totliabend": "totliabend","unrstrctnetasstsend": "unrstrctnetasstsend","temprstrctnetasstsend": "temprstrctnetasstsend","permrstrctnetasstsend": "permrstrctnetasstsend","capitalstktrstend": "capitalstktrstend","paidinsurplusend": "paidinsurplusend","retainedearnend": "retainedearnend","totnetassetend": "totnetassetend","totnetliabastend": "totnetliabastend","nonpfrea": "nonpfrea","totnooforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntsrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","exceeds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunreltd170": "netincunreltd170","othrinc170": "othrinc170","totsupp170": "totsupp170","grsrcptsrelated170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmissn509": "grsrcptsadmissn509","grsrcptsactivities509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","exceeds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunrelatd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "4G" - request_cpu: "1" + container_resources: + memory: + request: "16Gi" + cpu: + request: "1" + ephemeral-storage: + request: "10Gi" + + - operator: "GKEDeleteClusterOperator" + args: + task_id: "delete_cluster" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + name: pdp-irs-990-2014 - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" - args: task_id: "load_irs_990_to_bq" - - # The GCS bucket where the CSV file is located in. bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_2014/data_output.csv"] source_format: "CSV" destination_project_dataset_table: "irs_990.irs_990_2014" - - # Use this if your CSV file contains a header row skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - schema_fields: - name: "ein" type: "string" @@ -842,4 +839,4 @@ dag: mode: "nullable" graph_paths: - - "irs_990_transform_csv >> load_irs_990_to_bq" + - "create_cluster >> irs_990_transform_csv >> delete_cluster >> load_irs_990_to_bq" diff --git a/datasets/irs_990/pipelines/irs_990_2015/irs_990_2015_dag.py b/datasets/irs_990/pipelines/irs_990_2015/irs_990_2015_dag.py index 6d64bb9a4..b63e2cbe5 100644 --- a/datasets/irs_990/pipelines/irs_990_2015/irs_990_2015_dag.py +++ b/datasets/irs_990/pipelines/irs_990_2015/irs_990_2015_dag.py @@ -14,7 +14,7 @@ from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.operators import kubernetes_engine from airflow.providers.google.cloud.transfers import gcs_to_bigquery default_args = { @@ -32,14 +32,33 @@ catchup=False, default_view="graph", ) as dag: + create_cluster = kubernetes_engine.GKECreateClusterOperator( + task_id="create_cluster", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + body={ + "name": "pdp-irs-990-2015", + "initial_node_count": 1, + "network": "{{ var.value.vpc_network }}", + "node_config": { + "machine_type": "e2-standard-16", + "oauth_scopes": [ + "https://www.googleapis.com/auth/devstorage.read_write", + "https://www.googleapis.com/auth/cloud-platform", + ], + }, + }, + ) # Run CSV transform within kubernetes pod - irs_990_transform_csv = kubernetes_pod.KubernetesPodOperator( + irs_990_transform_csv = kubernetes_engine.GKEStartPodOperator( task_id="irs_990_transform_csv", startup_timeout_seconds=600, name="irs_990_2015", - service_account_name="datasets", - namespace="composer", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="pdp-irs-990-2015", image_pull_policy="Always", image="{{ var.json.irs_990.container_registry.run_csv_transform_kub }}", env_vars={ @@ -52,7 +71,17 @@ "CSV_HEADERS": '["ein","elf","tax_pd","subseccd","s501c3or4947a1cd","schdbind","politicalactvtscd","lbbyingactvtscd","subjto6033cd","dnradvisedfundscd","prptyintrcvdcd","maintwrkofartcd","crcounselingqstncd","hldassetsintermpermcd","rptlndbldgeqptcd","rptinvstothsecd","rptinvstprgrelcd","rptothasstcd","rptothliabcd","sepcnsldtfinstmtcd","sepindaudfinstmtcd","inclinfinstmtcd","operateschools170cd","frgnofficecd","frgnrevexpnscd","frgngrntscd","frgnaggragrntscd","rptprofndrsngfeescd","rptincfnndrsngcd","rptincgamingcd","operatehosptlcd","hospaudfinstmtcd","rptgrntstogovtcd","rptgrntstoindvcd","rptyestocompnstncd","txexmptbndcd","invstproceedscd","maintescrwaccntcd","actonbehalfcd","engageexcessbnftcd","awarexcessbnftcd","loantofficercd","grantoofficercd","dirbusnreltdcd","fmlybusnreltdcd","servasofficercd","recvnoncashcd","recvartcd","ceaseoperationscd","sellorexchcd","ownsepentcd","reltdorgcd","intincntrlcd","orgtrnsfrcd","conduct5percentcd","compltschocd","f1096cnt","fw2gcnt","wthldngrulescd","noemplyeesw3cnt","filerqrdrtnscd","unrelbusinccd","filedf990tcd","frgnacctcd","prohibtdtxshltrcd","prtynotifyorgcd","filedf8886tcd","solicitcntrbcd","exprstmntcd","providegoodscd","notfydnrvalcd","filedf8282cd","f8282cnt","fndsrcvdcd","premiumspaidcd","filedf8899cd","filedf1098ccd","excbushldngscd","s4966distribcd","distribtodonorcd","initiationfees","grsrcptspublicuse","grsincmembers","grsincother","filedlieuf1041cd","txexmptint","qualhlthplncd","qualhlthreqmntn","qualhlthonhnd","rcvdpdtngcd","filedf720cd","totreprtabled","totcomprelatede","totestcompf","noindiv100kcnt","nocontractor100kcnt","totcntrbgfts","prgmservcode2acd","totrev2acola","prgmservcode2bcd","totrev2bcola","prgmservcode2ccd","totrev2ccola","prgmservcode2dcd","totrev2dcola","prgmservcode2ecd","totrev2ecola","totrev2fcola","totprgmrevnue","invstmntinc","txexmptbndsproceeds","royaltsinc","grsrntsreal","grsrntsprsnl","rntlexpnsreal","rntlexpnsprsnl","rntlincreal","rntlincprsnl","netrntlinc","grsalesecur","grsalesothr","cstbasisecur","cstbasisothr","gnlsecur","gnlsothr","netgnls","grsincfndrsng","lessdirfndrsng","netincfndrsng","grsincgaming","lessdirgaming","netincgaming","grsalesinvent","lesscstofgoods","netincsales","miscrev11acd","miscrevtota","miscrev11bcd","miscrevtot11b","miscrev11ccd","miscrevtot11c","miscrevtot11d","miscrevtot11e","totrevenue","grntstogovt","grnsttoindiv","grntstofrgngovt","benifitsmembrs","compnsatncurrofcr","compnsatnandothr","othrsalwages","pensionplancontrb","othremplyeebenef","payrolltx","feesforsrvcmgmt","legalfees","accntingfees","feesforsrvclobby","profndraising","feesforsrvcinvstmgmt","feesforsrvcothr","advrtpromo","officexpns","infotech","royaltsexpns","occupancy","travel","travelofpublicoffcl","converconventmtng","interestamt","pymtoaffiliates","deprcatndepletn","insurance","othrexpnsa","othrexpnsb","othrexpnsc","othrexpnsd","othrexpnse","othrexpnsf","totfuncexpns","nonintcashend","svngstempinvend","pldgegrntrcvblend","accntsrcvblend","currfrmrcvblend","rcvbldisqualend","notesloansrcvblend","invntriesalesend","prepaidexpnsend","lndbldgsequipend","invstmntsend","invstmntsothrend","invstmntsprgmend","intangibleassetsend","othrassetsend","totassetsend","accntspayableend","grntspayableend","deferedrevnuend","txexmptbndsend","escrwaccntliabend","paybletoffcrsend","secrdmrtgsend","unsecurednotesend","othrliabend","totliabend","unrstrctnetasstsend","temprstrctnetasstsend","permrstrctnetasstsend","capitalstktrstend","paidinsurplusend","retainedearnend","totnetassetend","totnetliabastend","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', "RENAME_MAPPINGS": '{"elf": "elf","EIN": "ein","tax_prd": "tax_pd","subseccd": "subseccd","s50Yc3or4947aYcd": "s501c3or4947a1cd","schdbind": "schdbind","politicalactvtscd": "politicalactvtscd","lbbyingactvtscd": "lbbyingactvtscd","subjto6033cd": "subjto6033cd","dnradvisedfundscd": "dnradvisedfundscd","prptyintrcvdcd": "prptyintrcvdcd","maintwrkofartcd": "maintwrkofartcd","crcounselingqstncd": "crcounselingqstncd","hldassetsintermpermcd": "hldassetsintermpermcd","rptlndbldgeqptcd": "rptlndbldgeqptcd","rptinvstothsecd": "rptinvstothsecd","rptinvstprgrelcd": "rptinvstprgrelcd","rptothasstcd": "rptothasstcd","rptothliabcd": "rptothliabcd","sepcnsldtfinstmtcd": "sepcnsldtfinstmtcd","sepindaudfinstmtcd": "sepindaudfinstmtcd","inclinfinstmtcd": "inclinfinstmtcd","operateschoolsY70cd": "operateschools170cd","frgnofficecd": "frgnofficecd","frgnrevexpnscd": "frgnrevexpnscd","frgngrntscd": "frgngrntscd","frgnaggragrntscd": "frgnaggragrntscd","rptprofndrsngfeescd": "rptprofndrsngfeescd","rptincfnndrsngcd": "rptincfnndrsngcd","rptincgamingcd": "rptincgamingcd","operatehosptlcd": "operatehosptlcd","hospaudfinstmtcd": "hospaudfinstmtcd","rptgrntstogovtcd": "rptgrntstogovtcd","rptgrntstoindvcd": "rptgrntstoindvcd","rptyestocompnstncd": "rptyestocompnstncd","txexmptbndcd": "txexmptbndcd","invstproceedscd": "invstproceedscd","maintescrwaccntcd": "maintescrwaccntcd","actonbehalfcd": "actonbehalfcd","engageexcessbnftcd": "engageexcessbnftcd","awarexcessbnftcd": "awarexcessbnftcd","loantofficercd": "loantofficercd","grantoofficercd": "grantoofficercd","dirbusnreltdcd": "dirbusnreltdcd","fmlybusnreltdcd": "fmlybusnreltdcd","servasofficercd": "servasofficercd","recvnoncashcd": "recvnoncashcd","recvartcd": "recvartcd","ceaseoperationscd": "ceaseoperationscd","sellorexchcd": "sellorexchcd","ownsepentcd": "ownsepentcd","reltdorgcd": "reltdorgcd","intincntrlcd": "intincntrlcd","orgtrnsfrcd": "orgtrnsfrcd","conduct5percentcd": "conduct5percentcd","compltschocd": "compltschocd","f1096cnt": "f1096cnt","fw2gcnt": "fw2gcnt","wthldngrulescd": "wthldngrulescd","noemplyeesw3cnt": "noemplyeesw3cnt","filerqrdrtnscd": "filerqrdrtnscd","unrelbusinccd": "unrelbusinccd","filedf990tcd": "filedf990tcd","frgnacctcd": "frgnacctcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","prtynotifyorgcd": "prtynotifyorgcd","filedf8886tcd": "filedf8886tcd","solicitcntrbcd": "solicitcntrbcd","exprstmntcd": "exprstmntcd","providegoodscd": "providegoodscd","notfydnrvalcd": "notfydnrvalcd","filedf8N8Ncd": "filedf8282cd","f8282cnt": "f8282cnt","fndsrcvdcd": "fndsrcvdcd","premiumspaidcd": "premiumspaidcd","filedf8899cd": "filedf8899cd","filedfY098ccd": "filedf1098ccd","excbushldngscd": "excbushldngscd","s4966distribcd": "s4966distribcd","distribtodonorcd": "distribtodonorcd","initiationfees": "initiationfees","grsrcptspublicuse": "grsrcptspublicuse","grsincmembers": "grsincmembers","grsincother": "grsincother","filedlieufY04Ycd": "filedlieuf1041cd","txexmptint": "txexmptint","qualhlthplncd": "qualhlthplncd","qualhlthreqmntn": "qualhlthreqmntn","qualhlthonhnd": "qualhlthonhnd","rcvdpdtngcd": "rcvdpdtngcd","filedf7N0cd": "filedf720cd","totreprtabled": "totreprtabled","totcomprelatede": "totcomprelatede","totestcompf": "totestcompf","noindiv100kcnt": "noindiv100kcnt","nocontractor100kcnt": "nocontractor100kcnt","totcntrbgfts": "totcntrbgfts","prgmservcode2acd": "prgmservcode2acd","totrev2acola": "totrev2acola","prgmservcode2bcd": "prgmservcode2bcd","totrev2bcola": "totrev2bcola","prgmservcode2ccd": "prgmservcode2ccd","totrev2ccola": "totrev2ccola","prgmservcode2dcd": "prgmservcode2dcd","totrev2dcola": "totrev2dcola","prgmservcode2ecd": "prgmservcode2ecd","totrev2ecola": "totrev2ecola","totrev2fcola": "totrev2fcola","totprgmrevnue": "totprgmrevnue","invstmntinc": "invstmntinc","txexmptbndsproceeds": "txexmptbndsproceeds","royaltsinc": "royaltsinc","grsrntsreal": "grsrntsreal","grsrntsprsnl": "grsrntsprsnl","rntlexpnsreal": "rntlexpnsreal","rntlexpnsprsnl": "rntlexpnsprsnl","rntlincreal": "rntlincreal","rntlincprsnl": "rntlincprsnl","netrntlinc": "netrntlinc","grsalesecur": "grsalesecur","grsalesothr": "grsalesothr","cstbasisecur": "cstbasisecur","cstbasisothr": "cstbasisothr","gnlsecur": "gnlsecur","gnlsothr": "gnlsothr","netgnls": "netgnls","grsincfndrsng": "grsincfndrsng","lessdirfndrsng": "lessdirfndrsng","netincfndrsng": "netincfndrsng","grsincgaming": "grsincgaming","lessdirgaming": "lessdirgaming","netincgaming": "netincgaming","grsalesinvent": "grsalesinvent","lesscstofgoods": "lesscstofgoods","netincsales": "netincsales","miscrev11acd": "miscrev11acd","miscrevtota": "miscrevtota","miscrev11bcd": "miscrev11bcd","miscrevtot11b": "miscrevtot11b","miscrev11ccd": "miscrev11ccd","miscrevtot11c": "miscrevtot11c","miscrevtot11d": "miscrevtot11d","miscrevtot11e": "miscrevtot11e","totrevenue": "totrevenue","grntstogovt": "grntstogovt","grnsttoindiv": "grnsttoindiv","grntstofrgngovt": "grntstofrgngovt","benifitsmembrs": "benifitsmembrs","compnsatncurrofcr": "compnsatncurrofcr","compnsatnandothr": "compnsatnandothr","othrsalwages": "othrsalwages","pensionplancontrb": "pensionplancontrb","othremplyeebenef": "othremplyeebenef","payrolltx": "payrolltx","feesforsrvcmgmt": "feesforsrvcmgmt","legalfees": "legalfees","accntingfees": "accntingfees","feesforsrvclobby": "feesforsrvclobby","profndraising": "profndraising","feesforsrvcinvstmgmt": "feesforsrvcinvstmgmt","feesforsrvcothr": "feesforsrvcothr","advrtpromo": "advrtpromo","officexpns": "officexpns","infotech": "infotech","royaltsexpns": "royaltsexpns","occupancy": "occupancy","travel": "travel","travelofpublicoffcl": "travelofpublicoffcl","converconventmtng": "converconventmtng","interestamt": "interestamt","pymtoaffiliates": "pymtoaffiliates","deprcatndepletn": "deprcatndepletn","insurance": "insurance","othrexpnsa": "othrexpnsa","othrexpnsb": "othrexpnsb","othrexpnsc": "othrexpnsc","othrexpnsd": "othrexpnsd","othrexpnse": "othrexpnse","othrexpnsf": "othrexpnsf","totfuncexpns": "totfuncexpns","nonintcashend": "nonintcashend","svngstempinvend": "svngstempinvend","pldgegrntrcvblend": "pldgegrntrcvblend","accntsrcvblend": "accntsrcvblend","currfrmrcvblend": "currfrmrcvblend","rcvbldisqualend": "rcvbldisqualend","notesloansrcvblend": "notesloansrcvblend","invntriesalesend": "invntriesalesend","prepaidexpnsend": "prepaidexpnsend","lndbldgsequipend": "lndbldgsequipend","invstmntsend": "invstmntsend","invstmntsothrend": "invstmntsothrend","invstmntsprgmend": "invstmntsprgmend","intangibleassetsend": "intangibleassetsend","othrassetsend": "othrassetsend","totassetsend": "totassetsend","accntspayableend": "accntspayableend","grntspayableend": "grntspayableend","deferedrevnuend": "deferedrevnuend","txexmptbndsend": "txexmptbndsend","escrwaccntliabend": "escrwaccntliabend","paybletoffcrsend": "paybletoffcrsend","secrdmrtgsend": "secrdmrtgsend","unsecurednotesend": "unsecurednotesend","othrliabend": "othrliabend","totliabend": "totliabend","unrstrctnetasstsend": "unrstrctnetasstsend","temprstrctnetasstsend": "temprstrctnetasstsend","permrstrctnetasstsend": "permrstrctnetasstsend","capitalstktrstend": "capitalstktrstend","paidinsurplusend": "paidinsurplusend","retainedearnend": "retainedearnend","totnetassetend": "totnetassetend","totnetliabastend": "totnetliabastend","nonpfrea": "nonpfrea","totnooforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntsrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","exceeds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunreltd170": "netincunreltd170","othrinc170": "othrinc170","totsupp170": "totsupp170","grsrcptsrelated170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmissn509": "grsrcptsadmissn509","grsrcptsactivities509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","exceeds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunrelatd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509"}', }, - resources={"request_memory": "4G", "request_cpu": "1"}, + container_resources={ + "memory": {"request": "16Gi"}, + "cpu": {"request": "1"}, + "ephemeral-storage": {"request": "10Gi"}, + }, + ) + delete_cluster = kubernetes_engine.GKEDeleteClusterOperator( + task_id="delete_cluster", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + name="pdp-irs-990-2015", ) # Task to load CSV data to a BigQuery table @@ -314,4 +343,4 @@ ], ) - irs_990_transform_csv >> load_irs_990_to_bq + create_cluster >> irs_990_transform_csv >> delete_cluster >> load_irs_990_to_bq diff --git a/datasets/irs_990/pipelines/irs_990_2015/pipeline.yaml b/datasets/irs_990/pipelines/irs_990_2015/pipeline.yaml index c44ec138b..b6dca82ee 100644 --- a/datasets/irs_990/pipelines/irs_990_2015/pipeline.yaml +++ b/datasets/irs_990/pipelines/irs_990_2015/pipeline.yaml @@ -14,12 +14,8 @@ --- resources: - - type: bigquery_table - # Required Properties: table_id: irs_990_2015 - - # Description of the table description: "IRS 990 2015 dataset" dag: @@ -28,8 +24,6 @@ dag: dag_id: irs_990_2015 default_args: owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 @@ -38,30 +32,33 @@ dag: default_view: graph tasks: - - operator: "KubernetesPodOperator" + - operator: "GKECreateClusterOperator" + args: + task_id: "create_cluster" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + body: + name: pdp-irs-990-2015 + initial_node_count: 1 + network: "{{ var.value.vpc_network }}" + node_config: + machine_type: e2-standard-16 + oauth_scopes: + - https://www.googleapis.com/auth/devstorage.read_write + - https://www.googleapis.com/auth/cloud-platform - # Task description + - operator: "GKEStartPodOperator" description: "Run CSV transform within kubernetes pod" - args: - task_id: "irs_990_transform_csv" - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id name: "irs_990_2015" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - service_account_name: "datasets" - namespace: "composer" - + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: pdp-irs-990-2015 image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. image: "{{ var.json.irs_990.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. env_vars: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/15eofinextract990.dat.dat" SOURCE_FILE: "files/data.dat" @@ -73,37 +70,31 @@ dag: ["ein","elf","tax_pd","subseccd","s501c3or4947a1cd","schdbind","politicalactvtscd","lbbyingactvtscd","subjto6033cd","dnradvisedfundscd","prptyintrcvdcd","maintwrkofartcd","crcounselingqstncd","hldassetsintermpermcd","rptlndbldgeqptcd","rptinvstothsecd","rptinvstprgrelcd","rptothasstcd","rptothliabcd","sepcnsldtfinstmtcd","sepindaudfinstmtcd","inclinfinstmtcd","operateschools170cd","frgnofficecd","frgnrevexpnscd","frgngrntscd","frgnaggragrntscd","rptprofndrsngfeescd","rptincfnndrsngcd","rptincgamingcd","operatehosptlcd","hospaudfinstmtcd","rptgrntstogovtcd","rptgrntstoindvcd","rptyestocompnstncd","txexmptbndcd","invstproceedscd","maintescrwaccntcd","actonbehalfcd","engageexcessbnftcd","awarexcessbnftcd","loantofficercd","grantoofficercd","dirbusnreltdcd","fmlybusnreltdcd","servasofficercd","recvnoncashcd","recvartcd","ceaseoperationscd","sellorexchcd","ownsepentcd","reltdorgcd","intincntrlcd","orgtrnsfrcd","conduct5percentcd","compltschocd","f1096cnt","fw2gcnt","wthldngrulescd","noemplyeesw3cnt","filerqrdrtnscd","unrelbusinccd","filedf990tcd","frgnacctcd","prohibtdtxshltrcd","prtynotifyorgcd","filedf8886tcd","solicitcntrbcd","exprstmntcd","providegoodscd","notfydnrvalcd","filedf8282cd","f8282cnt","fndsrcvdcd","premiumspaidcd","filedf8899cd","filedf1098ccd","excbushldngscd","s4966distribcd","distribtodonorcd","initiationfees","grsrcptspublicuse","grsincmembers","grsincother","filedlieuf1041cd","txexmptint","qualhlthplncd","qualhlthreqmntn","qualhlthonhnd","rcvdpdtngcd","filedf720cd","totreprtabled","totcomprelatede","totestcompf","noindiv100kcnt","nocontractor100kcnt","totcntrbgfts","prgmservcode2acd","totrev2acola","prgmservcode2bcd","totrev2bcola","prgmservcode2ccd","totrev2ccola","prgmservcode2dcd","totrev2dcola","prgmservcode2ecd","totrev2ecola","totrev2fcola","totprgmrevnue","invstmntinc","txexmptbndsproceeds","royaltsinc","grsrntsreal","grsrntsprsnl","rntlexpnsreal","rntlexpnsprsnl","rntlincreal","rntlincprsnl","netrntlinc","grsalesecur","grsalesothr","cstbasisecur","cstbasisothr","gnlsecur","gnlsothr","netgnls","grsincfndrsng","lessdirfndrsng","netincfndrsng","grsincgaming","lessdirgaming","netincgaming","grsalesinvent","lesscstofgoods","netincsales","miscrev11acd","miscrevtota","miscrev11bcd","miscrevtot11b","miscrev11ccd","miscrevtot11c","miscrevtot11d","miscrevtot11e","totrevenue","grntstogovt","grnsttoindiv","grntstofrgngovt","benifitsmembrs","compnsatncurrofcr","compnsatnandothr","othrsalwages","pensionplancontrb","othremplyeebenef","payrolltx","feesforsrvcmgmt","legalfees","accntingfees","feesforsrvclobby","profndraising","feesforsrvcinvstmgmt","feesforsrvcothr","advrtpromo","officexpns","infotech","royaltsexpns","occupancy","travel","travelofpublicoffcl","converconventmtng","interestamt","pymtoaffiliates","deprcatndepletn","insurance","othrexpnsa","othrexpnsb","othrexpnsc","othrexpnsd","othrexpnse","othrexpnsf","totfuncexpns","nonintcashend","svngstempinvend","pldgegrntrcvblend","accntsrcvblend","currfrmrcvblend","rcvbldisqualend","notesloansrcvblend","invntriesalesend","prepaidexpnsend","lndbldgsequipend","invstmntsend","invstmntsothrend","invstmntsprgmend","intangibleassetsend","othrassetsend","totassetsend","accntspayableend","grntspayableend","deferedrevnuend","txexmptbndsend","escrwaccntliabend","paybletoffcrsend","secrdmrtgsend","unsecurednotesend","othrliabend","totliabend","unrstrctnetasstsend","temprstrctnetasstsend","permrstrctnetasstsend","capitalstktrstend","paidinsurplusend","retainedearnend","totnetassetend","totnetliabastend","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"] RENAME_MAPPINGS: >- {"elf": "elf","EIN": "ein","tax_prd": "tax_pd","subseccd": "subseccd","s50Yc3or4947aYcd": "s501c3or4947a1cd","schdbind": "schdbind","politicalactvtscd": "politicalactvtscd","lbbyingactvtscd": "lbbyingactvtscd","subjto6033cd": "subjto6033cd","dnradvisedfundscd": "dnradvisedfundscd","prptyintrcvdcd": "prptyintrcvdcd","maintwrkofartcd": "maintwrkofartcd","crcounselingqstncd": "crcounselingqstncd","hldassetsintermpermcd": "hldassetsintermpermcd","rptlndbldgeqptcd": "rptlndbldgeqptcd","rptinvstothsecd": "rptinvstothsecd","rptinvstprgrelcd": "rptinvstprgrelcd","rptothasstcd": "rptothasstcd","rptothliabcd": "rptothliabcd","sepcnsldtfinstmtcd": "sepcnsldtfinstmtcd","sepindaudfinstmtcd": "sepindaudfinstmtcd","inclinfinstmtcd": "inclinfinstmtcd","operateschoolsY70cd": "operateschools170cd","frgnofficecd": "frgnofficecd","frgnrevexpnscd": "frgnrevexpnscd","frgngrntscd": "frgngrntscd","frgnaggragrntscd": "frgnaggragrntscd","rptprofndrsngfeescd": "rptprofndrsngfeescd","rptincfnndrsngcd": "rptincfnndrsngcd","rptincgamingcd": "rptincgamingcd","operatehosptlcd": "operatehosptlcd","hospaudfinstmtcd": "hospaudfinstmtcd","rptgrntstogovtcd": "rptgrntstogovtcd","rptgrntstoindvcd": "rptgrntstoindvcd","rptyestocompnstncd": "rptyestocompnstncd","txexmptbndcd": "txexmptbndcd","invstproceedscd": "invstproceedscd","maintescrwaccntcd": "maintescrwaccntcd","actonbehalfcd": "actonbehalfcd","engageexcessbnftcd": "engageexcessbnftcd","awarexcessbnftcd": "awarexcessbnftcd","loantofficercd": "loantofficercd","grantoofficercd": "grantoofficercd","dirbusnreltdcd": "dirbusnreltdcd","fmlybusnreltdcd": "fmlybusnreltdcd","servasofficercd": "servasofficercd","recvnoncashcd": "recvnoncashcd","recvartcd": "recvartcd","ceaseoperationscd": "ceaseoperationscd","sellorexchcd": "sellorexchcd","ownsepentcd": "ownsepentcd","reltdorgcd": "reltdorgcd","intincntrlcd": "intincntrlcd","orgtrnsfrcd": "orgtrnsfrcd","conduct5percentcd": "conduct5percentcd","compltschocd": "compltschocd","f1096cnt": "f1096cnt","fw2gcnt": "fw2gcnt","wthldngrulescd": "wthldngrulescd","noemplyeesw3cnt": "noemplyeesw3cnt","filerqrdrtnscd": "filerqrdrtnscd","unrelbusinccd": "unrelbusinccd","filedf990tcd": "filedf990tcd","frgnacctcd": "frgnacctcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","prtynotifyorgcd": "prtynotifyorgcd","filedf8886tcd": "filedf8886tcd","solicitcntrbcd": "solicitcntrbcd","exprstmntcd": "exprstmntcd","providegoodscd": "providegoodscd","notfydnrvalcd": "notfydnrvalcd","filedf8N8Ncd": "filedf8282cd","f8282cnt": "f8282cnt","fndsrcvdcd": "fndsrcvdcd","premiumspaidcd": "premiumspaidcd","filedf8899cd": "filedf8899cd","filedfY098ccd": "filedf1098ccd","excbushldngscd": "excbushldngscd","s4966distribcd": "s4966distribcd","distribtodonorcd": "distribtodonorcd","initiationfees": "initiationfees","grsrcptspublicuse": "grsrcptspublicuse","grsincmembers": "grsincmembers","grsincother": "grsincother","filedlieufY04Ycd": "filedlieuf1041cd","txexmptint": "txexmptint","qualhlthplncd": "qualhlthplncd","qualhlthreqmntn": "qualhlthreqmntn","qualhlthonhnd": "qualhlthonhnd","rcvdpdtngcd": "rcvdpdtngcd","filedf7N0cd": "filedf720cd","totreprtabled": "totreprtabled","totcomprelatede": "totcomprelatede","totestcompf": "totestcompf","noindiv100kcnt": "noindiv100kcnt","nocontractor100kcnt": "nocontractor100kcnt","totcntrbgfts": "totcntrbgfts","prgmservcode2acd": "prgmservcode2acd","totrev2acola": "totrev2acola","prgmservcode2bcd": "prgmservcode2bcd","totrev2bcola": "totrev2bcola","prgmservcode2ccd": "prgmservcode2ccd","totrev2ccola": "totrev2ccola","prgmservcode2dcd": "prgmservcode2dcd","totrev2dcola": "totrev2dcola","prgmservcode2ecd": "prgmservcode2ecd","totrev2ecola": "totrev2ecola","totrev2fcola": "totrev2fcola","totprgmrevnue": "totprgmrevnue","invstmntinc": "invstmntinc","txexmptbndsproceeds": "txexmptbndsproceeds","royaltsinc": "royaltsinc","grsrntsreal": "grsrntsreal","grsrntsprsnl": "grsrntsprsnl","rntlexpnsreal": "rntlexpnsreal","rntlexpnsprsnl": "rntlexpnsprsnl","rntlincreal": "rntlincreal","rntlincprsnl": "rntlincprsnl","netrntlinc": "netrntlinc","grsalesecur": "grsalesecur","grsalesothr": "grsalesothr","cstbasisecur": "cstbasisecur","cstbasisothr": "cstbasisothr","gnlsecur": "gnlsecur","gnlsothr": "gnlsothr","netgnls": "netgnls","grsincfndrsng": "grsincfndrsng","lessdirfndrsng": "lessdirfndrsng","netincfndrsng": "netincfndrsng","grsincgaming": "grsincgaming","lessdirgaming": "lessdirgaming","netincgaming": "netincgaming","grsalesinvent": "grsalesinvent","lesscstofgoods": "lesscstofgoods","netincsales": "netincsales","miscrev11acd": "miscrev11acd","miscrevtota": "miscrevtota","miscrev11bcd": "miscrev11bcd","miscrevtot11b": "miscrevtot11b","miscrev11ccd": "miscrev11ccd","miscrevtot11c": "miscrevtot11c","miscrevtot11d": "miscrevtot11d","miscrevtot11e": "miscrevtot11e","totrevenue": "totrevenue","grntstogovt": "grntstogovt","grnsttoindiv": "grnsttoindiv","grntstofrgngovt": "grntstofrgngovt","benifitsmembrs": "benifitsmembrs","compnsatncurrofcr": "compnsatncurrofcr","compnsatnandothr": "compnsatnandothr","othrsalwages": "othrsalwages","pensionplancontrb": "pensionplancontrb","othremplyeebenef": "othremplyeebenef","payrolltx": "payrolltx","feesforsrvcmgmt": "feesforsrvcmgmt","legalfees": "legalfees","accntingfees": "accntingfees","feesforsrvclobby": "feesforsrvclobby","profndraising": "profndraising","feesforsrvcinvstmgmt": "feesforsrvcinvstmgmt","feesforsrvcothr": "feesforsrvcothr","advrtpromo": "advrtpromo","officexpns": "officexpns","infotech": "infotech","royaltsexpns": "royaltsexpns","occupancy": "occupancy","travel": "travel","travelofpublicoffcl": "travelofpublicoffcl","converconventmtng": "converconventmtng","interestamt": "interestamt","pymtoaffiliates": "pymtoaffiliates","deprcatndepletn": "deprcatndepletn","insurance": "insurance","othrexpnsa": "othrexpnsa","othrexpnsb": "othrexpnsb","othrexpnsc": "othrexpnsc","othrexpnsd": "othrexpnsd","othrexpnse": "othrexpnse","othrexpnsf": "othrexpnsf","totfuncexpns": "totfuncexpns","nonintcashend": "nonintcashend","svngstempinvend": "svngstempinvend","pldgegrntrcvblend": "pldgegrntrcvblend","accntsrcvblend": "accntsrcvblend","currfrmrcvblend": "currfrmrcvblend","rcvbldisqualend": "rcvbldisqualend","notesloansrcvblend": "notesloansrcvblend","invntriesalesend": "invntriesalesend","prepaidexpnsend": "prepaidexpnsend","lndbldgsequipend": "lndbldgsequipend","invstmntsend": "invstmntsend","invstmntsothrend": "invstmntsothrend","invstmntsprgmend": "invstmntsprgmend","intangibleassetsend": "intangibleassetsend","othrassetsend": "othrassetsend","totassetsend": "totassetsend","accntspayableend": "accntspayableend","grntspayableend": "grntspayableend","deferedrevnuend": "deferedrevnuend","txexmptbndsend": "txexmptbndsend","escrwaccntliabend": "escrwaccntliabend","paybletoffcrsend": "paybletoffcrsend","secrdmrtgsend": "secrdmrtgsend","unsecurednotesend": "unsecurednotesend","othrliabend": "othrliabend","totliabend": "totliabend","unrstrctnetasstsend": "unrstrctnetasstsend","temprstrctnetasstsend": "temprstrctnetasstsend","permrstrctnetasstsend": "permrstrctnetasstsend","capitalstktrstend": "capitalstktrstend","paidinsurplusend": "paidinsurplusend","retainedearnend": "retainedearnend","totnetassetend": "totnetassetend","totnetliabastend": "totnetliabastend","nonpfrea": "nonpfrea","totnooforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntsrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","exceeds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunreltd170": "netincunreltd170","othrinc170": "othrinc170","totsupp170": "totsupp170","grsrcptsrelated170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmissn509": "grsrcptsadmissn509","grsrcptsactivities509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","exceeds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunrelatd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509"} + container_resources: + memory: + request: "16Gi" + cpu: + request: "1" + ephemeral-storage: + request: "10Gi" - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "4G" - request_cpu: "1" + - operator: "GKEDeleteClusterOperator" + args: + task_id: "delete_cluster" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + name: pdp-irs-990-2015 - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" - args: task_id: "load_irs_990_to_bq" - - # The GCS bucket where the CSV file is located in. bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_2015/data_output.csv"] source_format: "CSV" destination_project_dataset_table: "irs_990.irs_990_2015" - - # Use this if your CSV file contains a header row skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. schema_fields: - name: "ein" type: "string" @@ -845,4 +836,4 @@ dag: mode: "nullable" graph_paths: - - "irs_990_transform_csv >> load_irs_990_to_bq" + - "create_cluster >> irs_990_transform_csv >> delete_cluster >> load_irs_990_to_bq" diff --git a/datasets/irs_990/pipelines/irs_990_2016/irs_990_2016_dag.py b/datasets/irs_990/pipelines/irs_990_2016/irs_990_2016_dag.py index 5a616cc83..495deeb68 100644 --- a/datasets/irs_990/pipelines/irs_990_2016/irs_990_2016_dag.py +++ b/datasets/irs_990/pipelines/irs_990_2016/irs_990_2016_dag.py @@ -14,7 +14,7 @@ from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.operators import kubernetes_engine from airflow.providers.google.cloud.transfers import gcs_to_bigquery default_args = { @@ -32,14 +32,33 @@ catchup=False, default_view="graph", ) as dag: + create_cluster = kubernetes_engine.GKECreateClusterOperator( + task_id="create_cluster", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + body={ + "name": "pdp-irs-990-2016", + "initial_node_count": 1, + "network": "{{ var.value.vpc_network }}", + "node_config": { + "machine_type": "e2-standard-16", + "oauth_scopes": [ + "https://www.googleapis.com/auth/devstorage.read_write", + "https://www.googleapis.com/auth/cloud-platform", + ], + }, + }, + ) # Run CSV transform within kubernetes pod - irs_990_2016_transform_csv = kubernetes_pod.KubernetesPodOperator( + irs_990_2016_transform_csv = kubernetes_engine.GKEStartPodOperator( task_id="irs_990_2016_transform_csv", startup_timeout_seconds=600, name="irs_990_2016", - service_account_name="datasets", - namespace="composer", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="pdp-irs-990-2016", image_pull_policy="Always", image="{{ var.json.irs_990.container_registry.run_csv_transform_kub }}", env_vars={ @@ -52,7 +71,17 @@ "CSV_HEADERS": '["ein","elf","tax_pd","subseccd","s501c3or4947a1cd","schdbind","politicalactvtscd","lbbyingactvtscd","subjto6033cd","dnradvisedfundscd","prptyintrcvdcd","maintwrkofartcd","crcounselingqstncd","hldassetsintermpermcd","rptlndbldgeqptcd","rptinvstothsecd","rptinvstprgrelcd","rptothasstcd","rptothliabcd","sepcnsldtfinstmtcd","sepindaudfinstmtcd","inclinfinstmtcd","operateschools170cd","frgnofficecd","frgnrevexpnscd","frgngrntscd","frgnaggragrntscd","rptprofndrsngfeescd","rptincfnndrsngcd","rptincgamingcd","operatehosptlcd","hospaudfinstmtcd","rptgrntstogovtcd","rptgrntstoindvcd","rptyestocompnstncd","txexmptbndcd","invstproceedscd","maintescrwaccntcd","actonbehalfcd","engageexcessbnftcd","awarexcessbnftcd","loantofficercd","grantoofficercd","dirbusnreltdcd","fmlybusnreltdcd","servasofficercd","recvnoncashcd","recvartcd","ceaseoperationscd","sellorexchcd","ownsepentcd","reltdorgcd","intincntrlcd","orgtrnsfrcd","conduct5percentcd","compltschocd","f1096cnt","fw2gcnt","wthldngrulescd","noemplyeesw3cnt","filerqrdrtnscd","unrelbusinccd","filedf990tcd","frgnacctcd","prohibtdtxshltrcd","prtynotifyorgcd","filedf8886tcd","solicitcntrbcd","exprstmntcd","providegoodscd","notfydnrvalcd","filedf8282cd","f8282cnt","fndsrcvdcd","premiumspaidcd","filedf8899cd","filedf1098ccd","excbushldngscd","s4966distribcd","distribtodonorcd","initiationfees","grsrcptspublicuse","grsincmembers","grsincother","filedlieuf1041cd","txexmptint","qualhlthplncd","qualhlthreqmntn","qualhlthonhnd","rcvdpdtngcd","filedf720cd","totreprtabled","totcomprelatede","totestcompf","noindiv100kcnt","nocontractor100kcnt","totcntrbgfts","prgmservcode2acd","totrev2acola","prgmservcode2bcd","totrev2bcola","prgmservcode2ccd","totrev2ccola","prgmservcode2dcd","totrev2dcola","prgmservcode2ecd","totrev2ecola","totrev2fcola","totprgmrevnue","invstmntinc","txexmptbndsproceeds","royaltsinc","grsrntsreal","grsrntsprsnl","rntlexpnsreal","rntlexpnsprsnl","rntlincreal","rntlincprsnl","netrntlinc","grsalesecur","grsalesothr","cstbasisecur","cstbasisothr","gnlsecur","gnlsothr","netgnls","grsincfndrsng","lessdirfndrsng","netincfndrsng","grsincgaming","lessdirgaming","netincgaming","grsalesinvent","lesscstofgoods","netincsales","miscrev11acd","miscrevtota","miscrev11bcd","miscrevtot11b","miscrev11ccd","miscrevtot11c","miscrevtot11d","miscrevtot11e","totrevenue","grntstogovt","grnsttoindiv","grntstofrgngovt","benifitsmembrs","compnsatncurrofcr","compnsatnandothr","othrsalwages","pensionplancontrb","othremplyeebenef","payrolltx","feesforsrvcmgmt","legalfees","accntingfees","feesforsrvclobby","profndraising","feesforsrvcinvstmgmt","feesforsrvcothr","advrtpromo","officexpns","infotech","royaltsexpns","occupancy","travel","travelofpublicoffcl","converconventmtng","interestamt","pymtoaffiliates","deprcatndepletn","insurance","othrexpnsa","othrexpnsb","othrexpnsc","othrexpnsd","othrexpnse","othrexpnsf","totfuncexpns","nonintcashend","svngstempinvend","pldgegrntrcvblend","accntsrcvblend","currfrmrcvblend","rcvbldisqualend","notesloansrcvblend","invntriesalesend","prepaidexpnsend","lndbldgsequipend","invstmntsend","invstmntsothrend","invstmntsprgmend","intangibleassetsend","othrassetsend","totassetsend","accntspayableend","grntspayableend","deferedrevnuend","txexmptbndsend","escrwaccntliabend","paybletoffcrsend","secrdmrtgsend","unsecurednotesend","othrliabend","totliabend","unrstrctnetasstsend","temprstrctnetasstsend","permrstrctnetasstsend","capitalstktrstend","paidinsurplusend","retainedearnend","totnetassetend","totnetliabastend","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', "RENAME_MAPPINGS": '{"elf": "elf","EIN": "ein","tax_prd": "tax_pd","subseccd": "subseccd","s50Yc3or4947aYcd": "s501c3or4947a1cd","schdbind": "schdbind","politicalactvtscd": "politicalactvtscd","lbbyingactvtscd": "lbbyingactvtscd","subjto6033cd": "subjto6033cd","dnradvisedfundscd": "dnradvisedfundscd","prptyintrcvdcd": "prptyintrcvdcd","maintwrkofartcd": "maintwrkofartcd","crcounselingqstncd": "crcounselingqstncd","hldassetsintermpermcd": "hldassetsintermpermcd","rptlndbldgeqptcd": "rptlndbldgeqptcd","rptinvstothsecd": "rptinvstothsecd","rptinvstprgrelcd": "rptinvstprgrelcd","rptothasstcd": "rptothasstcd","rptothliabcd": "rptothliabcd","sepcnsldtfinstmtcd": "sepcnsldtfinstmtcd","sepindaudfinstmtcd": "sepindaudfinstmtcd","inclinfinstmtcd": "inclinfinstmtcd","operateschoolsY70cd": "operateschools170cd","frgnofficecd": "frgnofficecd","frgnrevexpnscd": "frgnrevexpnscd","frgngrntscd": "frgngrntscd","frgnaggragrntscd": "frgnaggragrntscd","rptprofndrsngfeescd": "rptprofndrsngfeescd","rptincfnndrsngcd": "rptincfnndrsngcd","rptincgamingcd": "rptincgamingcd","operatehosptlcd": "operatehosptlcd","hospaudfinstmtcd": "hospaudfinstmtcd","rptgrntstogovtcd": "rptgrntstogovtcd","rptgrntstoindvcd": "rptgrntstoindvcd","rptyestocompnstncd": "rptyestocompnstncd","txexmptbndcd": "txexmptbndcd","invstproceedscd": "invstproceedscd","maintescrwaccntcd": "maintescrwaccntcd","actonbehalfcd": "actonbehalfcd","engageexcessbnftcd": "engageexcessbnftcd","awarexcessbnftcd": "awarexcessbnftcd","loantofficercd": "loantofficercd","grantoofficercd": "grantoofficercd","dirbusnreltdcd": "dirbusnreltdcd","fmlybusnreltdcd": "fmlybusnreltdcd","servasofficercd": "servasofficercd","recvnoncashcd": "recvnoncashcd","recvartcd": "recvartcd","ceaseoperationscd": "ceaseoperationscd","sellorexchcd": "sellorexchcd","ownsepentcd": "ownsepentcd","reltdorgcd": "reltdorgcd","intincntrlcd": "intincntrlcd","orgtrnsfrcd": "orgtrnsfrcd","conduct5percentcd": "conduct5percentcd","compltschocd": "compltschocd","f1096cnt": "f1096cnt","fw2gcnt": "fw2gcnt","wthldngrulescd": "wthldngrulescd","noemplyeesw3cnt": "noemplyeesw3cnt","filerqrdrtnscd": "filerqrdrtnscd","unrelbusinccd": "unrelbusinccd","filedf990tcd": "filedf990tcd","frgnacctcd": "frgnacctcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","prtynotifyorgcd": "prtynotifyorgcd","filedf8886tcd": "filedf8886tcd","solicitcntrbcd": "solicitcntrbcd","exprstmntcd": "exprstmntcd","providegoodscd": "providegoodscd","notfydnrvalcd": "notfydnrvalcd","filedf8N8Ncd": "filedf8282cd","f8282cnt": "f8282cnt","fndsrcvdcd": "fndsrcvdcd","premiumspaidcd": "premiumspaidcd","filedf8899cd": "filedf8899cd","filedfY098ccd": "filedf1098ccd","excbushldngscd": "excbushldngscd","s4966distribcd": "s4966distribcd","distribtodonorcd": "distribtodonorcd","initiationfees": "initiationfees","grsrcptspublicuse": "grsrcptspublicuse","grsincmembers": "grsincmembers","grsincother": "grsincother","filedlieufY04Ycd": "filedlieuf1041cd","txexmptint": "txexmptint","qualhlthplncd": "qualhlthplncd","qualhlthreqmntn": "qualhlthreqmntn","qualhlthonhnd": "qualhlthonhnd","rcvdpdtngcd": "rcvdpdtngcd","filedf7N0cd": "filedf720cd","totreprtabled": "totreprtabled","totcomprelatede": "totcomprelatede","totestcompf": "totestcompf","noindiv100kcnt": "noindiv100kcnt","nocontractor100kcnt": "nocontractor100kcnt","totcntrbgfts": "totcntrbgfts","prgmservcode2acd": "prgmservcode2acd","totrev2acola": "totrev2acola","prgmservcode2bcd": "prgmservcode2bcd","totrev2bcola": "totrev2bcola","prgmservcode2ccd": "prgmservcode2ccd","totrev2ccola": "totrev2ccola","prgmservcode2dcd": "prgmservcode2dcd","totrev2dcola": "totrev2dcola","prgmservcode2ecd": "prgmservcode2ecd","totrev2ecola": "totrev2ecola","totrev2fcola": "totrev2fcola","totprgmrevnue": "totprgmrevnue","invstmntinc": "invstmntinc","txexmptbndsproceeds": "txexmptbndsproceeds","royaltsinc": "royaltsinc","grsrntsreal": "grsrntsreal","grsrntsprsnl": "grsrntsprsnl","rntlexpnsreal": "rntlexpnsreal","rntlexpnsprsnl": "rntlexpnsprsnl","rntlincreal": "rntlincreal","rntlincprsnl": "rntlincprsnl","netrntlinc": "netrntlinc","grsalesecur": "grsalesecur","grsalesothr": "grsalesothr","cstbasisecur": "cstbasisecur","cstbasisothr": "cstbasisothr","gnlsecur": "gnlsecur","gnlsothr": "gnlsothr","netgnls": "netgnls","grsincfndrsng": "grsincfndrsng","lessdirfndrsng": "lessdirfndrsng","netincfndrsng": "netincfndrsng","grsincgaming": "grsincgaming","lessdirgaming": "lessdirgaming","netincgaming": "netincgaming","grsalesinvent": "grsalesinvent","lesscstofgoods": "lesscstofgoods","netincsales": "netincsales","miscrev11acd": "miscrev11acd","miscrevtota": "miscrevtota","miscrev11bcd": "miscrev11bcd","miscrevtot11b": "miscrevtot11b","miscrev11ccd": "miscrev11ccd","miscrevtot11c": "miscrevtot11c","miscrevtot11d": "miscrevtot11d","miscrevtot11e": "miscrevtot11e","totrevenue": "totrevenue","grntstogovt": "grntstogovt","grnsttoindiv": "grnsttoindiv","grntstofrgngovt": "grntstofrgngovt","benifitsmembrs": "benifitsmembrs","compnsatncurrofcr": "compnsatncurrofcr","compnsatnandothr": "compnsatnandothr","othrsalwages": "othrsalwages","pensionplancontrb": "pensionplancontrb","othremplyeebenef": "othremplyeebenef","payrolltx": "payrolltx","feesforsrvcmgmt": "feesforsrvcmgmt","legalfees": "legalfees","accntingfees": "accntingfees","feesforsrvclobby": "feesforsrvclobby","profndraising": "profndraising","feesforsrvcinvstmgmt": "feesforsrvcinvstmgmt","feesforsrvcothr": "feesforsrvcothr","advrtpromo": "advrtpromo","officexpns": "officexpns","infotech": "infotech","royaltsexpns": "royaltsexpns","occupancy": "occupancy","travel": "travel","travelofpublicoffcl": "travelofpublicoffcl","converconventmtng": "converconventmtng","interestamt": "interestamt","pymtoaffiliates": "pymtoaffiliates","deprcatndepletn": "deprcatndepletn","insurance": "insurance","othrexpnsa": "othrexpnsa","othrexpnsb": "othrexpnsb","othrexpnsc": "othrexpnsc","othrexpnsd": "othrexpnsd","othrexpnse": "othrexpnse","othrexpnsf": "othrexpnsf","totfuncexpns": "totfuncexpns","nonintcashend": "nonintcashend","svngstempinvend": "svngstempinvend","pldgegrntrcvblend": "pldgegrntrcvblend","accntsrcvblend": "accntsrcvblend","currfrmrcvblend": "currfrmrcvblend","rcvbldisqualend": "rcvbldisqualend","notesloansrcvblend": "notesloansrcvblend","invntriesalesend": "invntriesalesend","prepaidexpnsend": "prepaidexpnsend","lndbldgsequipend": "lndbldgsequipend","invstmntsend": "invstmntsend","invstmntsothrend": "invstmntsothrend","invstmntsprgmend": "invstmntsprgmend","intangibleassetsend": "intangibleassetsend","othrassetsend": "othrassetsend","totassetsend": "totassetsend","accntspayableend": "accntspayableend","grntspayableend": "grntspayableend","deferedrevnuend": "deferedrevnuend","txexmptbndsend": "txexmptbndsend","escrwaccntliabend": "escrwaccntliabend","paybletoffcrsend": "paybletoffcrsend","secrdmrtgsend": "secrdmrtgsend","unsecurednotesend": "unsecurednotesend","othrliabend": "othrliabend","totliabend": "totliabend","unrstrctnetasstsend": "unrstrctnetasstsend","temprstrctnetasstsend": "temprstrctnetasstsend","permrstrctnetasstsend": "permrstrctnetasstsend","capitalstktrstend": "capitalstktrstend","paidinsurplusend": "paidinsurplusend","retainedearnend": "retainedearnend","totnetassetend": "totnetassetend","totnetliabastend": "totnetliabastend","nonpfrea": "nonpfrea","totnooforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntsrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","exceeds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunreltd170": "netincunreltd170","othrinc170": "othrinc170","totsupp170": "totsupp170","grsrcptsrelated170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmissn509": "grsrcptsadmissn509","grsrcptsactivities509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","exceeds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunrelatd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509"}', }, - resources={"request_memory": "4G", "request_cpu": "1"}, + container_resources={ + "memory": {"request": "16Gi"}, + "cpu": {"request": "1"}, + "ephemeral-storage": {"request": "10Gi"}, + }, + ) + delete_cluster = kubernetes_engine.GKEDeleteClusterOperator( + task_id="delete_cluster", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + name="pdp-irs-990-2016", ) # Task to load CSV data to a BigQuery table @@ -314,4 +343,9 @@ ], ) - irs_990_2016_transform_csv >> load_irs_990_2016_to_bq + ( + create_cluster + >> irs_990_2016_transform_csv + >> delete_cluster + >> load_irs_990_2016_to_bq + ) diff --git a/datasets/irs_990/pipelines/irs_990_2016/pipeline.yaml b/datasets/irs_990/pipelines/irs_990_2016/pipeline.yaml index ae439e62f..d83d356f7 100644 --- a/datasets/irs_990/pipelines/irs_990_2016/pipeline.yaml +++ b/datasets/irs_990/pipelines/irs_990_2016/pipeline.yaml @@ -28,8 +28,6 @@ dag: dag_id: irs_990_2016 default_args: owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 @@ -38,30 +36,33 @@ dag: default_view: graph tasks: - - operator: "KubernetesPodOperator" + - operator: "GKECreateClusterOperator" + args: + task_id: "create_cluster" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + body: + name: pdp-irs-990-2016 + initial_node_count: 1 + network: "{{ var.value.vpc_network }}" + node_config: + machine_type: e2-standard-16 + oauth_scopes: + - https://www.googleapis.com/auth/devstorage.read_write + - https://www.googleapis.com/auth/cloud-platform - # Task description + - operator: "GKEStartPodOperator" description: "Run CSV transform within kubernetes pod" - args: - task_id: "irs_990_2016_transform_csv" - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id name: "irs_990_2016" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - service_account_name: "datasets" - namespace: "composer" - + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: pdp-irs-990-2016 image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. image: "{{ var.json.irs_990.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. env_vars: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/16eofinextract990.dat" SOURCE_FILE: "files/data.dat" @@ -73,36 +74,31 @@ dag: ["ein","elf","tax_pd","subseccd","s501c3or4947a1cd","schdbind","politicalactvtscd","lbbyingactvtscd","subjto6033cd","dnradvisedfundscd","prptyintrcvdcd","maintwrkofartcd","crcounselingqstncd","hldassetsintermpermcd","rptlndbldgeqptcd","rptinvstothsecd","rptinvstprgrelcd","rptothasstcd","rptothliabcd","sepcnsldtfinstmtcd","sepindaudfinstmtcd","inclinfinstmtcd","operateschools170cd","frgnofficecd","frgnrevexpnscd","frgngrntscd","frgnaggragrntscd","rptprofndrsngfeescd","rptincfnndrsngcd","rptincgamingcd","operatehosptlcd","hospaudfinstmtcd","rptgrntstogovtcd","rptgrntstoindvcd","rptyestocompnstncd","txexmptbndcd","invstproceedscd","maintescrwaccntcd","actonbehalfcd","engageexcessbnftcd","awarexcessbnftcd","loantofficercd","grantoofficercd","dirbusnreltdcd","fmlybusnreltdcd","servasofficercd","recvnoncashcd","recvartcd","ceaseoperationscd","sellorexchcd","ownsepentcd","reltdorgcd","intincntrlcd","orgtrnsfrcd","conduct5percentcd","compltschocd","f1096cnt","fw2gcnt","wthldngrulescd","noemplyeesw3cnt","filerqrdrtnscd","unrelbusinccd","filedf990tcd","frgnacctcd","prohibtdtxshltrcd","prtynotifyorgcd","filedf8886tcd","solicitcntrbcd","exprstmntcd","providegoodscd","notfydnrvalcd","filedf8282cd","f8282cnt","fndsrcvdcd","premiumspaidcd","filedf8899cd","filedf1098ccd","excbushldngscd","s4966distribcd","distribtodonorcd","initiationfees","grsrcptspublicuse","grsincmembers","grsincother","filedlieuf1041cd","txexmptint","qualhlthplncd","qualhlthreqmntn","qualhlthonhnd","rcvdpdtngcd","filedf720cd","totreprtabled","totcomprelatede","totestcompf","noindiv100kcnt","nocontractor100kcnt","totcntrbgfts","prgmservcode2acd","totrev2acola","prgmservcode2bcd","totrev2bcola","prgmservcode2ccd","totrev2ccola","prgmservcode2dcd","totrev2dcola","prgmservcode2ecd","totrev2ecola","totrev2fcola","totprgmrevnue","invstmntinc","txexmptbndsproceeds","royaltsinc","grsrntsreal","grsrntsprsnl","rntlexpnsreal","rntlexpnsprsnl","rntlincreal","rntlincprsnl","netrntlinc","grsalesecur","grsalesothr","cstbasisecur","cstbasisothr","gnlsecur","gnlsothr","netgnls","grsincfndrsng","lessdirfndrsng","netincfndrsng","grsincgaming","lessdirgaming","netincgaming","grsalesinvent","lesscstofgoods","netincsales","miscrev11acd","miscrevtota","miscrev11bcd","miscrevtot11b","miscrev11ccd","miscrevtot11c","miscrevtot11d","miscrevtot11e","totrevenue","grntstogovt","grnsttoindiv","grntstofrgngovt","benifitsmembrs","compnsatncurrofcr","compnsatnandothr","othrsalwages","pensionplancontrb","othremplyeebenef","payrolltx","feesforsrvcmgmt","legalfees","accntingfees","feesforsrvclobby","profndraising","feesforsrvcinvstmgmt","feesforsrvcothr","advrtpromo","officexpns","infotech","royaltsexpns","occupancy","travel","travelofpublicoffcl","converconventmtng","interestamt","pymtoaffiliates","deprcatndepletn","insurance","othrexpnsa","othrexpnsb","othrexpnsc","othrexpnsd","othrexpnse","othrexpnsf","totfuncexpns","nonintcashend","svngstempinvend","pldgegrntrcvblend","accntsrcvblend","currfrmrcvblend","rcvbldisqualend","notesloansrcvblend","invntriesalesend","prepaidexpnsend","lndbldgsequipend","invstmntsend","invstmntsothrend","invstmntsprgmend","intangibleassetsend","othrassetsend","totassetsend","accntspayableend","grntspayableend","deferedrevnuend","txexmptbndsend","escrwaccntliabend","paybletoffcrsend","secrdmrtgsend","unsecurednotesend","othrliabend","totliabend","unrstrctnetasstsend","temprstrctnetasstsend","permrstrctnetasstsend","capitalstktrstend","paidinsurplusend","retainedearnend","totnetassetend","totnetliabastend","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"] RENAME_MAPPINGS: >- {"elf": "elf","EIN": "ein","tax_prd": "tax_pd","subseccd": "subseccd","s50Yc3or4947aYcd": "s501c3or4947a1cd","schdbind": "schdbind","politicalactvtscd": "politicalactvtscd","lbbyingactvtscd": "lbbyingactvtscd","subjto6033cd": "subjto6033cd","dnradvisedfundscd": "dnradvisedfundscd","prptyintrcvdcd": "prptyintrcvdcd","maintwrkofartcd": "maintwrkofartcd","crcounselingqstncd": "crcounselingqstncd","hldassetsintermpermcd": "hldassetsintermpermcd","rptlndbldgeqptcd": "rptlndbldgeqptcd","rptinvstothsecd": "rptinvstothsecd","rptinvstprgrelcd": "rptinvstprgrelcd","rptothasstcd": "rptothasstcd","rptothliabcd": "rptothliabcd","sepcnsldtfinstmtcd": "sepcnsldtfinstmtcd","sepindaudfinstmtcd": "sepindaudfinstmtcd","inclinfinstmtcd": "inclinfinstmtcd","operateschoolsY70cd": "operateschools170cd","frgnofficecd": "frgnofficecd","frgnrevexpnscd": "frgnrevexpnscd","frgngrntscd": "frgngrntscd","frgnaggragrntscd": "frgnaggragrntscd","rptprofndrsngfeescd": "rptprofndrsngfeescd","rptincfnndrsngcd": "rptincfnndrsngcd","rptincgamingcd": "rptincgamingcd","operatehosptlcd": "operatehosptlcd","hospaudfinstmtcd": "hospaudfinstmtcd","rptgrntstogovtcd": "rptgrntstogovtcd","rptgrntstoindvcd": "rptgrntstoindvcd","rptyestocompnstncd": "rptyestocompnstncd","txexmptbndcd": "txexmptbndcd","invstproceedscd": "invstproceedscd","maintescrwaccntcd": "maintescrwaccntcd","actonbehalfcd": "actonbehalfcd","engageexcessbnftcd": "engageexcessbnftcd","awarexcessbnftcd": "awarexcessbnftcd","loantofficercd": "loantofficercd","grantoofficercd": "grantoofficercd","dirbusnreltdcd": "dirbusnreltdcd","fmlybusnreltdcd": "fmlybusnreltdcd","servasofficercd": "servasofficercd","recvnoncashcd": "recvnoncashcd","recvartcd": "recvartcd","ceaseoperationscd": "ceaseoperationscd","sellorexchcd": "sellorexchcd","ownsepentcd": "ownsepentcd","reltdorgcd": "reltdorgcd","intincntrlcd": "intincntrlcd","orgtrnsfrcd": "orgtrnsfrcd","conduct5percentcd": "conduct5percentcd","compltschocd": "compltschocd","f1096cnt": "f1096cnt","fw2gcnt": "fw2gcnt","wthldngrulescd": "wthldngrulescd","noemplyeesw3cnt": "noemplyeesw3cnt","filerqrdrtnscd": "filerqrdrtnscd","unrelbusinccd": "unrelbusinccd","filedf990tcd": "filedf990tcd","frgnacctcd": "frgnacctcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","prtynotifyorgcd": "prtynotifyorgcd","filedf8886tcd": "filedf8886tcd","solicitcntrbcd": "solicitcntrbcd","exprstmntcd": "exprstmntcd","providegoodscd": "providegoodscd","notfydnrvalcd": "notfydnrvalcd","filedf8N8Ncd": "filedf8282cd","f8282cnt": "f8282cnt","fndsrcvdcd": "fndsrcvdcd","premiumspaidcd": "premiumspaidcd","filedf8899cd": "filedf8899cd","filedfY098ccd": "filedf1098ccd","excbushldngscd": "excbushldngscd","s4966distribcd": "s4966distribcd","distribtodonorcd": "distribtodonorcd","initiationfees": "initiationfees","grsrcptspublicuse": "grsrcptspublicuse","grsincmembers": "grsincmembers","grsincother": "grsincother","filedlieufY04Ycd": "filedlieuf1041cd","txexmptint": "txexmptint","qualhlthplncd": "qualhlthplncd","qualhlthreqmntn": "qualhlthreqmntn","qualhlthonhnd": "qualhlthonhnd","rcvdpdtngcd": "rcvdpdtngcd","filedf7N0cd": "filedf720cd","totreprtabled": "totreprtabled","totcomprelatede": "totcomprelatede","totestcompf": "totestcompf","noindiv100kcnt": "noindiv100kcnt","nocontractor100kcnt": "nocontractor100kcnt","totcntrbgfts": "totcntrbgfts","prgmservcode2acd": "prgmservcode2acd","totrev2acola": "totrev2acola","prgmservcode2bcd": "prgmservcode2bcd","totrev2bcola": "totrev2bcola","prgmservcode2ccd": "prgmservcode2ccd","totrev2ccola": "totrev2ccola","prgmservcode2dcd": "prgmservcode2dcd","totrev2dcola": "totrev2dcola","prgmservcode2ecd": "prgmservcode2ecd","totrev2ecola": "totrev2ecola","totrev2fcola": "totrev2fcola","totprgmrevnue": "totprgmrevnue","invstmntinc": "invstmntinc","txexmptbndsproceeds": "txexmptbndsproceeds","royaltsinc": "royaltsinc","grsrntsreal": "grsrntsreal","grsrntsprsnl": "grsrntsprsnl","rntlexpnsreal": "rntlexpnsreal","rntlexpnsprsnl": "rntlexpnsprsnl","rntlincreal": "rntlincreal","rntlincprsnl": "rntlincprsnl","netrntlinc": "netrntlinc","grsalesecur": "grsalesecur","grsalesothr": "grsalesothr","cstbasisecur": "cstbasisecur","cstbasisothr": "cstbasisothr","gnlsecur": "gnlsecur","gnlsothr": "gnlsothr","netgnls": "netgnls","grsincfndrsng": "grsincfndrsng","lessdirfndrsng": "lessdirfndrsng","netincfndrsng": "netincfndrsng","grsincgaming": "grsincgaming","lessdirgaming": "lessdirgaming","netincgaming": "netincgaming","grsalesinvent": "grsalesinvent","lesscstofgoods": "lesscstofgoods","netincsales": "netincsales","miscrev11acd": "miscrev11acd","miscrevtota": "miscrevtota","miscrev11bcd": "miscrev11bcd","miscrevtot11b": "miscrevtot11b","miscrev11ccd": "miscrev11ccd","miscrevtot11c": "miscrevtot11c","miscrevtot11d": "miscrevtot11d","miscrevtot11e": "miscrevtot11e","totrevenue": "totrevenue","grntstogovt": "grntstogovt","grnsttoindiv": "grnsttoindiv","grntstofrgngovt": "grntstofrgngovt","benifitsmembrs": "benifitsmembrs","compnsatncurrofcr": "compnsatncurrofcr","compnsatnandothr": "compnsatnandothr","othrsalwages": "othrsalwages","pensionplancontrb": "pensionplancontrb","othremplyeebenef": "othremplyeebenef","payrolltx": "payrolltx","feesforsrvcmgmt": "feesforsrvcmgmt","legalfees": "legalfees","accntingfees": "accntingfees","feesforsrvclobby": "feesforsrvclobby","profndraising": "profndraising","feesforsrvcinvstmgmt": "feesforsrvcinvstmgmt","feesforsrvcothr": "feesforsrvcothr","advrtpromo": "advrtpromo","officexpns": "officexpns","infotech": "infotech","royaltsexpns": "royaltsexpns","occupancy": "occupancy","travel": "travel","travelofpublicoffcl": "travelofpublicoffcl","converconventmtng": "converconventmtng","interestamt": "interestamt","pymtoaffiliates": "pymtoaffiliates","deprcatndepletn": "deprcatndepletn","insurance": "insurance","othrexpnsa": "othrexpnsa","othrexpnsb": "othrexpnsb","othrexpnsc": "othrexpnsc","othrexpnsd": "othrexpnsd","othrexpnse": "othrexpnse","othrexpnsf": "othrexpnsf","totfuncexpns": "totfuncexpns","nonintcashend": "nonintcashend","svngstempinvend": "svngstempinvend","pldgegrntrcvblend": "pldgegrntrcvblend","accntsrcvblend": "accntsrcvblend","currfrmrcvblend": "currfrmrcvblend","rcvbldisqualend": "rcvbldisqualend","notesloansrcvblend": "notesloansrcvblend","invntriesalesend": "invntriesalesend","prepaidexpnsend": "prepaidexpnsend","lndbldgsequipend": "lndbldgsequipend","invstmntsend": "invstmntsend","invstmntsothrend": "invstmntsothrend","invstmntsprgmend": "invstmntsprgmend","intangibleassetsend": "intangibleassetsend","othrassetsend": "othrassetsend","totassetsend": "totassetsend","accntspayableend": "accntspayableend","grntspayableend": "grntspayableend","deferedrevnuend": "deferedrevnuend","txexmptbndsend": "txexmptbndsend","escrwaccntliabend": "escrwaccntliabend","paybletoffcrsend": "paybletoffcrsend","secrdmrtgsend": "secrdmrtgsend","unsecurednotesend": "unsecurednotesend","othrliabend": "othrliabend","totliabend": "totliabend","unrstrctnetasstsend": "unrstrctnetasstsend","temprstrctnetasstsend": "temprstrctnetasstsend","permrstrctnetasstsend": "permrstrctnetasstsend","capitalstktrstend": "capitalstktrstend","paidinsurplusend": "paidinsurplusend","retainedearnend": "retainedearnend","totnetassetend": "totnetassetend","totnetliabastend": "totnetliabastend","nonpfrea": "nonpfrea","totnooforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntsrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","exceeds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunreltd170": "netincunreltd170","othrinc170": "othrinc170","totsupp170": "totsupp170","grsrcptsrelated170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmissn509": "grsrcptsadmissn509","grsrcptsactivities509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","exceeds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunrelatd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "4G" - request_cpu: "1" + container_resources: + memory: + request: "16Gi" + cpu: + request: "1" + ephemeral-storage: + request: "10Gi" + + - operator: "GKEDeleteClusterOperator" + args: + task_id: "delete_cluster" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + name: pdp-irs-990-2016 - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" - args: task_id: "load_irs_990_2016_to_bq" - - # The GCS bucket where the CSV file is located in. bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_2016/data_output.csv"] source_format: "CSV" destination_project_dataset_table: "irs_990.irs_990_2016" - - # Use this if your CSV file contains a header row skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. schema_fields: - name: "ein" type: "string" @@ -844,4 +840,4 @@ dag: mode: "nullable" graph_paths: - - "irs_990_2016_transform_csv >> load_irs_990_2016_to_bq" + - "create_cluster >> irs_990_2016_transform_csv >> delete_cluster >> load_irs_990_2016_to_bq" diff --git a/datasets/irs_990/pipelines/irs_990_2017/irs_990_2017_dag.py b/datasets/irs_990/pipelines/irs_990_2017/irs_990_2017_dag.py index 9bec53539..0ed07b787 100644 --- a/datasets/irs_990/pipelines/irs_990_2017/irs_990_2017_dag.py +++ b/datasets/irs_990/pipelines/irs_990_2017/irs_990_2017_dag.py @@ -14,7 +14,7 @@ from airflow import DAG -from airflow.providers.cncf.kubernetes.operators import kubernetes_pod +from airflow.providers.google.cloud.operators import kubernetes_engine from airflow.providers.google.cloud.transfers import gcs_to_bigquery default_args = { @@ -32,14 +32,33 @@ catchup=False, default_view="graph", ) as dag: + create_cluster = kubernetes_engine.GKECreateClusterOperator( + task_id="create_cluster", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + body={ + "name": "pdp-irs-990-2017", + "initial_node_count": 1, + "network": "{{ var.value.vpc_network }}", + "node_config": { + "machine_type": "e2-standard-16", + "oauth_scopes": [ + "https://www.googleapis.com/auth/devstorage.read_write", + "https://www.googleapis.com/auth/cloud-platform", + ], + }, + }, + ) # Run CSV transform within kubernetes pod - irs_990_2017_transform_csv = kubernetes_pod.KubernetesPodOperator( + irs_990_2017_transform_csv = kubernetes_engine.GKEStartPodOperator( task_id="irs_990_2017_transform_csv", startup_timeout_seconds=600, name="irs_990_2017", - service_account_name="datasets", - namespace="composer", + namespace="default", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + cluster_name="pdp-irs-990-2017", image_pull_policy="Always", image="{{ var.json.irs_990.container_registry.run_csv_transform_kub }}", env_vars={ @@ -52,7 +71,17 @@ "CSV_HEADERS": '["ein","elf","tax_pd","subseccd","s501c3or4947a1cd","schdbind","politicalactvtscd","lbbyingactvtscd","subjto6033cd","dnradvisedfundscd","prptyintrcvdcd","maintwrkofartcd","crcounselingqstncd","hldassetsintermpermcd","rptlndbldgeqptcd","rptinvstothsecd","rptinvstprgrelcd","rptothasstcd","rptothliabcd","sepcnsldtfinstmtcd","sepindaudfinstmtcd","inclinfinstmtcd","operateschools170cd","frgnofficecd","frgnrevexpnscd","frgngrntscd","frgnaggragrntscd","rptprofndrsngfeescd","rptincfnndrsngcd","rptincgamingcd","operatehosptlcd","hospaudfinstmtcd","rptgrntstogovtcd","rptgrntstoindvcd","rptyestocompnstncd","txexmptbndcd","invstproceedscd","maintescrwaccntcd","actonbehalfcd","engageexcessbnftcd","awarexcessbnftcd","loantofficercd","grantoofficercd","dirbusnreltdcd","fmlybusnreltdcd","servasofficercd","recvnoncashcd","recvartcd","ceaseoperationscd","sellorexchcd","ownsepentcd","reltdorgcd","intincntrlcd","orgtrnsfrcd","conduct5percentcd","compltschocd","f1096cnt","fw2gcnt","wthldngrulescd","noemplyeesw3cnt","filerqrdrtnscd","unrelbusinccd","filedf990tcd","frgnacctcd","prohibtdtxshltrcd","prtynotifyorgcd","filedf8886tcd","solicitcntrbcd","exprstmntcd","providegoodscd","notfydnrvalcd","filedf8282cd","f8282cnt","fndsrcvdcd","premiumspaidcd","filedf8899cd","filedf1098ccd","excbushldngscd","s4966distribcd","distribtodonorcd","initiationfees","grsrcptspublicuse","grsincmembers","grsincother","filedlieuf1041cd","txexmptint","qualhlthplncd","qualhlthreqmntn","qualhlthonhnd","rcvdpdtngcd","filedf720cd","totreprtabled","totcomprelatede","totestcompf","noindiv100kcnt","nocontractor100kcnt","totcntrbgfts","prgmservcode2acd","totrev2acola","prgmservcode2bcd","totrev2bcola","prgmservcode2ccd","totrev2ccola","prgmservcode2dcd","totrev2dcola","prgmservcode2ecd","totrev2ecola","totrev2fcola","totprgmrevnue","invstmntinc","txexmptbndsproceeds","royaltsinc","grsrntsreal","grsrntsprsnl","rntlexpnsreal","rntlexpnsprsnl","rntlincreal","rntlincprsnl","netrntlinc","grsalesecur","grsalesothr","cstbasisecur","cstbasisothr","gnlsecur","gnlsothr","netgnls","grsincfndrsng","lessdirfndrsng","netincfndrsng","grsincgaming","lessdirgaming","netincgaming","grsalesinvent","lesscstofgoods","netincsales","miscrev11acd","miscrevtota","miscrev11bcd","miscrevtot11b","miscrev11ccd","miscrevtot11c","miscrevtot11d","miscrevtot11e","totrevenue","grntstogovt","grnsttoindiv","grntstofrgngovt","benifitsmembrs","compnsatncurrofcr","compnsatnandothr","othrsalwages","pensionplancontrb","othremplyeebenef","payrolltx","feesforsrvcmgmt","legalfees","accntingfees","feesforsrvclobby","profndraising","feesforsrvcinvstmgmt","feesforsrvcothr","advrtpromo","officexpns","infotech","royaltsexpns","occupancy","travel","travelofpublicoffcl","converconventmtng","interestamt","pymtoaffiliates","deprcatndepletn","insurance","othrexpnsa","othrexpnsb","othrexpnsc","othrexpnsd","othrexpnse","othrexpnsf","totfuncexpns","nonintcashend","svngstempinvend","pldgegrntrcvblend","accntsrcvblend","currfrmrcvblend","rcvbldisqualend","notesloansrcvblend","invntriesalesend","prepaidexpnsend","lndbldgsequipend","invstmntsend","invstmntsothrend","invstmntsprgmend","intangibleassetsend","othrassetsend","totassetsend","accntspayableend","grntspayableend","deferedrevnuend","txexmptbndsend","escrwaccntliabend","paybletoffcrsend","secrdmrtgsend","unsecurednotesend","othrliabend","totliabend","unrstrctnetasstsend","temprstrctnetasstsend","permrstrctnetasstsend","capitalstktrstend","paidinsurplusend","retainedearnend","totnetassetend","totnetliabastend","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', "RENAME_MAPPINGS": '{"elf": "elf","EIN": "ein","tax_prd": "tax_pd","subseccd": "subseccd","s50Yc3or4947aYcd": "s501c3or4947a1cd","schdbind": "schdbind","politicalactvtscd": "politicalactvtscd","lbbyingactvtscd": "lbbyingactvtscd","subjto6033cd": "subjto6033cd","dnradvisedfundscd": "dnradvisedfundscd","prptyintrcvdcd": "prptyintrcvdcd","maintwrkofartcd": "maintwrkofartcd","crcounselingqstncd": "crcounselingqstncd","hldassetsintermpermcd": "hldassetsintermpermcd","rptlndbldgeqptcd": "rptlndbldgeqptcd","rptinvstothsecd": "rptinvstothsecd","rptinvstprgrelcd": "rptinvstprgrelcd","rptothasstcd": "rptothasstcd","rptothliabcd": "rptothliabcd","sepcnsldtfinstmtcd": "sepcnsldtfinstmtcd","sepindaudfinstmtcd": "sepindaudfinstmtcd","inclinfinstmtcd": "inclinfinstmtcd","operateschoolsY70cd": "operateschools170cd","frgnofficecd": "frgnofficecd","frgnrevexpnscd": "frgnrevexpnscd","frgngrntscd": "frgngrntscd","frgnaggragrntscd": "frgnaggragrntscd","rptprofndrsngfeescd": "rptprofndrsngfeescd","rptincfnndrsngcd": "rptincfnndrsngcd","rptincgamingcd": "rptincgamingcd","operatehosptlcd": "operatehosptlcd","hospaudfinstmtcd": "hospaudfinstmtcd","rptgrntstogovtcd": "rptgrntstogovtcd","rptgrntstoindvcd": "rptgrntstoindvcd","rptyestocompnstncd": "rptyestocompnstncd","txexmptbndcd": "txexmptbndcd","invstproceedscd": "invstproceedscd","maintescrwaccntcd": "maintescrwaccntcd","actonbehalfcd": "actonbehalfcd","engageexcessbnftcd": "engageexcessbnftcd","awarexcessbnftcd": "awarexcessbnftcd","loantofficercd": "loantofficercd","grantoofficercd": "grantoofficercd","dirbusnreltdcd": "dirbusnreltdcd","fmlybusnreltdcd": "fmlybusnreltdcd","servasofficercd": "servasofficercd","recvnoncashcd": "recvnoncashcd","recvartcd": "recvartcd","ceaseoperationscd": "ceaseoperationscd","sellorexchcd": "sellorexchcd","ownsepentcd": "ownsepentcd","reltdorgcd": "reltdorgcd","intincntrlcd": "intincntrlcd","orgtrnsfrcd": "orgtrnsfrcd","conduct5percentcd": "conduct5percentcd","compltschocd": "compltschocd","f1096cnt": "f1096cnt","fw2gcnt": "fw2gcnt","wthldngrulescd": "wthldngrulescd","noemplyeesw3cnt": "noemplyeesw3cnt","filerqrdrtnscd": "filerqrdrtnscd","unrelbusinccd": "unrelbusinccd","filedf990tcd": "filedf990tcd","frgnacctcd": "frgnacctcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","prtynotifyorgcd": "prtynotifyorgcd","filedf8886tcd": "filedf8886tcd","solicitcntrbcd": "solicitcntrbcd","exprstmntcd": "exprstmntcd","providegoodscd": "providegoodscd","notfydnrvalcd": "notfydnrvalcd","filedf8N8Ncd": "filedf8282cd","f8282cnt": "f8282cnt","fndsrcvdcd": "fndsrcvdcd","premiumspaidcd": "premiumspaidcd","filedf8899cd": "filedf8899cd","filedfY098ccd": "filedf1098ccd","excbushldngscd": "excbushldngscd","s4966distribcd": "s4966distribcd","distribtodonorcd": "distribtodonorcd","initiationfees": "initiationfees","grsrcptspublicuse": "grsrcptspublicuse","grsincmembers": "grsincmembers","grsincother": "grsincother","filedlieufY04Ycd": "filedlieuf1041cd","txexmptint": "txexmptint","qualhlthplncd": "qualhlthplncd","qualhlthreqmntn": "qualhlthreqmntn","qualhlthonhnd": "qualhlthonhnd","rcvdpdtngcd": "rcvdpdtngcd","filedf7N0cd": "filedf720cd","totreprtabled": "totreprtabled","totcomprelatede": "totcomprelatede","totestcompf": "totestcompf","noindiv100kcnt": "noindiv100kcnt","nocontractor100kcnt": "nocontractor100kcnt","totcntrbgfts": "totcntrbgfts","prgmservcode2acd": "prgmservcode2acd","totrev2acola": "totrev2acola","prgmservcode2bcd": "prgmservcode2bcd","totrev2bcola": "totrev2bcola","prgmservcode2ccd": "prgmservcode2ccd","totrev2ccola": "totrev2ccola","prgmservcode2dcd": "prgmservcode2dcd","totrev2dcola": "totrev2dcola","prgmservcode2ecd": "prgmservcode2ecd","totrev2ecola": "totrev2ecola","totrev2fcola": "totrev2fcola","totprgmrevnue": "totprgmrevnue","invstmntinc": "invstmntinc","txexmptbndsproceeds": "txexmptbndsproceeds","royaltsinc": "royaltsinc","grsrntsreal": "grsrntsreal","grsrntsprsnl": "grsrntsprsnl","rntlexpnsreal": "rntlexpnsreal","rntlexpnsprsnl": "rntlexpnsprsnl","rntlincreal": "rntlincreal","rntlincprsnl": "rntlincprsnl","netrntlinc": "netrntlinc","grsalesecur": "grsalesecur","grsalesothr": "grsalesothr","cstbasisecur": "cstbasisecur","cstbasisothr": "cstbasisothr","gnlsecur": "gnlsecur","gnlsothr": "gnlsothr","netgnls": "netgnls","grsincfndrsng": "grsincfndrsng","lessdirfndrsng": "lessdirfndrsng","netincfndrsng": "netincfndrsng","grsincgaming": "grsincgaming","lessdirgaming": "lessdirgaming","netincgaming": "netincgaming","grsalesinvent": "grsalesinvent","lesscstofgoods": "lesscstofgoods","netincsales": "netincsales","miscrev11acd": "miscrev11acd","miscrevtota": "miscrevtota","miscrev11bcd": "miscrev11bcd","miscrevtot11b": "miscrevtot11b","miscrev11ccd": "miscrev11ccd","miscrevtot11c": "miscrevtot11c","miscrevtot11d": "miscrevtot11d","miscrevtot11e": "miscrevtot11e","totrevenue": "totrevenue","grntstogovt": "grntstogovt","grnsttoindiv": "grnsttoindiv","grntstofrgngovt": "grntstofrgngovt","benifitsmembrs": "benifitsmembrs","compnsatncurrofcr": "compnsatncurrofcr","compnsatnandothr": "compnsatnandothr","othrsalwages": "othrsalwages","pensionplancontrb": "pensionplancontrb","othremplyeebenef": "othremplyeebenef","payrolltx": "payrolltx","feesforsrvcmgmt": "feesforsrvcmgmt","legalfees": "legalfees","accntingfees": "accntingfees","feesforsrvclobby": "feesforsrvclobby","profndraising": "profndraising","feesforsrvcinvstmgmt": "feesforsrvcinvstmgmt","feesforsrvcothr": "feesforsrvcothr","advrtpromo": "advrtpromo","officexpns": "officexpns","infotech": "infotech","royaltsexpns": "royaltsexpns","occupancy": "occupancy","travel": "travel","travelofpublicoffcl": "travelofpublicoffcl","converconventmtng": "converconventmtng","interestamt": "interestamt","pymtoaffiliates": "pymtoaffiliates","deprcatndepletn": "deprcatndepletn","insurance": "insurance","othrexpnsa": "othrexpnsa","othrexpnsb": "othrexpnsb","othrexpnsc": "othrexpnsc","othrexpnsd": "othrexpnsd","othrexpnse": "othrexpnse","othrexpnsf": "othrexpnsf","totfuncexpns": "totfuncexpns","nonintcashend": "nonintcashend","svngstempinvend": "svngstempinvend","pldgegrntrcvblend": "pldgegrntrcvblend","accntsrcvblend": "accntsrcvblend","currfrmrcvblend": "currfrmrcvblend","rcvbldisqualend": "rcvbldisqualend","notesloansrcvblend": "notesloansrcvblend","invntriesalesend": "invntriesalesend","prepaidexpnsend": "prepaidexpnsend","lndbldgsequipend": "lndbldgsequipend","invstmntsend": "invstmntsend","invstmntsothrend": "invstmntsothrend","invstmntsprgmend": "invstmntsprgmend","intangibleassetsend": "intangibleassetsend","othrassetsend": "othrassetsend","totassetsend": "totassetsend","accntspayableend": "accntspayableend","grntspayableend": "grntspayableend","deferedrevnuend": "deferedrevnuend","txexmptbndsend": "txexmptbndsend","escrwaccntliabend": "escrwaccntliabend","paybletoffcrsend": "paybletoffcrsend","secrdmrtgsend": "secrdmrtgsend","unsecurednotesend": "unsecurednotesend","othrliabend": "othrliabend","totliabend": "totliabend","unrstrctnetasstsend": "unrstrctnetasstsend","temprstrctnetasstsend": "temprstrctnetasstsend","permrstrctnetasstsend": "permrstrctnetasstsend","capitalstktrstend": "capitalstktrstend","paidinsurplusend": "paidinsurplusend","retainedearnend": "retainedearnend","totnetassetend": "totnetassetend","totnetliabastend": "totnetliabastend","nonpfrea": "nonpfrea","totnooforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntsrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","exceeds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunreltd170": "netincunreltd170","othrinc170": "othrinc170","totsupp170": "totsupp170","grsrcptsrelated170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmissn509": "grsrcptsadmissn509","grsrcptsactivities509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","exceeds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunrelatd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509"}', }, - resources={"request_memory": "4G", "request_cpu": "1"}, + container_resources={ + "memory": {"request": "16Gi"}, + "cpu": {"request": "1"}, + "ephemeral-storage": {"request": "10Gi"}, + }, + ) + delete_cluster = kubernetes_engine.GKEDeleteClusterOperator( + task_id="delete_cluster", + project_id="{{ var.value.gcp_project }}", + location="us-central1-c", + name="pdp-irs-990-2017", ) # Task to load CSV data to a BigQuery table @@ -314,4 +343,9 @@ ], ) - irs_990_2017_transform_csv >> load_irs_990_2017_to_bq + ( + create_cluster + >> irs_990_2017_transform_csv + >> delete_cluster + >> load_irs_990_2017_to_bq + ) diff --git a/datasets/irs_990/pipelines/irs_990_2017/pipeline.yaml b/datasets/irs_990/pipelines/irs_990_2017/pipeline.yaml index f42a5f059..d87f3dcac 100644 --- a/datasets/irs_990/pipelines/irs_990_2017/pipeline.yaml +++ b/datasets/irs_990/pipelines/irs_990_2017/pipeline.yaml @@ -14,12 +14,8 @@ --- resources: - - type: bigquery_table - # Required Properties: table_id: irs_990_2017 - - # Description of the table description: "IRS 990 2017 dataset" dag: @@ -28,8 +24,6 @@ dag: dag_id: irs_990_2017 default_args: owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 @@ -38,30 +32,33 @@ dag: default_view: graph tasks: - - operator: "KubernetesPodOperator" + - operator: "GKECreateClusterOperator" + args: + task_id: "create_cluster" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + body: + name: pdp-irs-990-2017 + initial_node_count: 1 + network: "{{ var.value.vpc_network }}" + node_config: + machine_type: e2-standard-16 + oauth_scopes: + - https://www.googleapis.com/auth/devstorage.read_write + - https://www.googleapis.com/auth/cloud-platform - # Task description + - operator: "GKEStartPodOperator" description: "Run CSV transform within kubernetes pod" - args: - task_id: "irs_990_2017_transform_csv" - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id name: "irs_990_2017" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - service_account_name: "datasets" - namespace: "composer" - + namespace: "default" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + cluster_name: pdp-irs-990-2017 image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. image: "{{ var.json.irs_990.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. env_vars: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/17eofinextract990.dat" SOURCE_FILE: "files/data.dat" @@ -73,37 +70,31 @@ dag: ["ein","elf","tax_pd","subseccd","s501c3or4947a1cd","schdbind","politicalactvtscd","lbbyingactvtscd","subjto6033cd","dnradvisedfundscd","prptyintrcvdcd","maintwrkofartcd","crcounselingqstncd","hldassetsintermpermcd","rptlndbldgeqptcd","rptinvstothsecd","rptinvstprgrelcd","rptothasstcd","rptothliabcd","sepcnsldtfinstmtcd","sepindaudfinstmtcd","inclinfinstmtcd","operateschools170cd","frgnofficecd","frgnrevexpnscd","frgngrntscd","frgnaggragrntscd","rptprofndrsngfeescd","rptincfnndrsngcd","rptincgamingcd","operatehosptlcd","hospaudfinstmtcd","rptgrntstogovtcd","rptgrntstoindvcd","rptyestocompnstncd","txexmptbndcd","invstproceedscd","maintescrwaccntcd","actonbehalfcd","engageexcessbnftcd","awarexcessbnftcd","loantofficercd","grantoofficercd","dirbusnreltdcd","fmlybusnreltdcd","servasofficercd","recvnoncashcd","recvartcd","ceaseoperationscd","sellorexchcd","ownsepentcd","reltdorgcd","intincntrlcd","orgtrnsfrcd","conduct5percentcd","compltschocd","f1096cnt","fw2gcnt","wthldngrulescd","noemplyeesw3cnt","filerqrdrtnscd","unrelbusinccd","filedf990tcd","frgnacctcd","prohibtdtxshltrcd","prtynotifyorgcd","filedf8886tcd","solicitcntrbcd","exprstmntcd","providegoodscd","notfydnrvalcd","filedf8282cd","f8282cnt","fndsrcvdcd","premiumspaidcd","filedf8899cd","filedf1098ccd","excbushldngscd","s4966distribcd","distribtodonorcd","initiationfees","grsrcptspublicuse","grsincmembers","grsincother","filedlieuf1041cd","txexmptint","qualhlthplncd","qualhlthreqmntn","qualhlthonhnd","rcvdpdtngcd","filedf720cd","totreprtabled","totcomprelatede","totestcompf","noindiv100kcnt","nocontractor100kcnt","totcntrbgfts","prgmservcode2acd","totrev2acola","prgmservcode2bcd","totrev2bcola","prgmservcode2ccd","totrev2ccola","prgmservcode2dcd","totrev2dcola","prgmservcode2ecd","totrev2ecola","totrev2fcola","totprgmrevnue","invstmntinc","txexmptbndsproceeds","royaltsinc","grsrntsreal","grsrntsprsnl","rntlexpnsreal","rntlexpnsprsnl","rntlincreal","rntlincprsnl","netrntlinc","grsalesecur","grsalesothr","cstbasisecur","cstbasisothr","gnlsecur","gnlsothr","netgnls","grsincfndrsng","lessdirfndrsng","netincfndrsng","grsincgaming","lessdirgaming","netincgaming","grsalesinvent","lesscstofgoods","netincsales","miscrev11acd","miscrevtota","miscrev11bcd","miscrevtot11b","miscrev11ccd","miscrevtot11c","miscrevtot11d","miscrevtot11e","totrevenue","grntstogovt","grnsttoindiv","grntstofrgngovt","benifitsmembrs","compnsatncurrofcr","compnsatnandothr","othrsalwages","pensionplancontrb","othremplyeebenef","payrolltx","feesforsrvcmgmt","legalfees","accntingfees","feesforsrvclobby","profndraising","feesforsrvcinvstmgmt","feesforsrvcothr","advrtpromo","officexpns","infotech","royaltsexpns","occupancy","travel","travelofpublicoffcl","converconventmtng","interestamt","pymtoaffiliates","deprcatndepletn","insurance","othrexpnsa","othrexpnsb","othrexpnsc","othrexpnsd","othrexpnse","othrexpnsf","totfuncexpns","nonintcashend","svngstempinvend","pldgegrntrcvblend","accntsrcvblend","currfrmrcvblend","rcvbldisqualend","notesloansrcvblend","invntriesalesend","prepaidexpnsend","lndbldgsequipend","invstmntsend","invstmntsothrend","invstmntsprgmend","intangibleassetsend","othrassetsend","totassetsend","accntspayableend","grntspayableend","deferedrevnuend","txexmptbndsend","escrwaccntliabend","paybletoffcrsend","secrdmrtgsend","unsecurednotesend","othrliabend","totliabend","unrstrctnetasstsend","temprstrctnetasstsend","permrstrctnetasstsend","capitalstktrstend","paidinsurplusend","retainedearnend","totnetassetend","totnetliabastend","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"] RENAME_MAPPINGS: >- {"elf": "elf","EIN": "ein","tax_prd": "tax_pd","subseccd": "subseccd","s50Yc3or4947aYcd": "s501c3or4947a1cd","schdbind": "schdbind","politicalactvtscd": "politicalactvtscd","lbbyingactvtscd": "lbbyingactvtscd","subjto6033cd": "subjto6033cd","dnradvisedfundscd": "dnradvisedfundscd","prptyintrcvdcd": "prptyintrcvdcd","maintwrkofartcd": "maintwrkofartcd","crcounselingqstncd": "crcounselingqstncd","hldassetsintermpermcd": "hldassetsintermpermcd","rptlndbldgeqptcd": "rptlndbldgeqptcd","rptinvstothsecd": "rptinvstothsecd","rptinvstprgrelcd": "rptinvstprgrelcd","rptothasstcd": "rptothasstcd","rptothliabcd": "rptothliabcd","sepcnsldtfinstmtcd": "sepcnsldtfinstmtcd","sepindaudfinstmtcd": "sepindaudfinstmtcd","inclinfinstmtcd": "inclinfinstmtcd","operateschoolsY70cd": "operateschools170cd","frgnofficecd": "frgnofficecd","frgnrevexpnscd": "frgnrevexpnscd","frgngrntscd": "frgngrntscd","frgnaggragrntscd": "frgnaggragrntscd","rptprofndrsngfeescd": "rptprofndrsngfeescd","rptincfnndrsngcd": "rptincfnndrsngcd","rptincgamingcd": "rptincgamingcd","operatehosptlcd": "operatehosptlcd","hospaudfinstmtcd": "hospaudfinstmtcd","rptgrntstogovtcd": "rptgrntstogovtcd","rptgrntstoindvcd": "rptgrntstoindvcd","rptyestocompnstncd": "rptyestocompnstncd","txexmptbndcd": "txexmptbndcd","invstproceedscd": "invstproceedscd","maintescrwaccntcd": "maintescrwaccntcd","actonbehalfcd": "actonbehalfcd","engageexcessbnftcd": "engageexcessbnftcd","awarexcessbnftcd": "awarexcessbnftcd","loantofficercd": "loantofficercd","grantoofficercd": "grantoofficercd","dirbusnreltdcd": "dirbusnreltdcd","fmlybusnreltdcd": "fmlybusnreltdcd","servasofficercd": "servasofficercd","recvnoncashcd": "recvnoncashcd","recvartcd": "recvartcd","ceaseoperationscd": "ceaseoperationscd","sellorexchcd": "sellorexchcd","ownsepentcd": "ownsepentcd","reltdorgcd": "reltdorgcd","intincntrlcd": "intincntrlcd","orgtrnsfrcd": "orgtrnsfrcd","conduct5percentcd": "conduct5percentcd","compltschocd": "compltschocd","f1096cnt": "f1096cnt","fw2gcnt": "fw2gcnt","wthldngrulescd": "wthldngrulescd","noemplyeesw3cnt": "noemplyeesw3cnt","filerqrdrtnscd": "filerqrdrtnscd","unrelbusinccd": "unrelbusinccd","filedf990tcd": "filedf990tcd","frgnacctcd": "frgnacctcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","prtynotifyorgcd": "prtynotifyorgcd","filedf8886tcd": "filedf8886tcd","solicitcntrbcd": "solicitcntrbcd","exprstmntcd": "exprstmntcd","providegoodscd": "providegoodscd","notfydnrvalcd": "notfydnrvalcd","filedf8N8Ncd": "filedf8282cd","f8282cnt": "f8282cnt","fndsrcvdcd": "fndsrcvdcd","premiumspaidcd": "premiumspaidcd","filedf8899cd": "filedf8899cd","filedfY098ccd": "filedf1098ccd","excbushldngscd": "excbushldngscd","s4966distribcd": "s4966distribcd","distribtodonorcd": "distribtodonorcd","initiationfees": "initiationfees","grsrcptspublicuse": "grsrcptspublicuse","grsincmembers": "grsincmembers","grsincother": "grsincother","filedlieufY04Ycd": "filedlieuf1041cd","txexmptint": "txexmptint","qualhlthplncd": "qualhlthplncd","qualhlthreqmntn": "qualhlthreqmntn","qualhlthonhnd": "qualhlthonhnd","rcvdpdtngcd": "rcvdpdtngcd","filedf7N0cd": "filedf720cd","totreprtabled": "totreprtabled","totcomprelatede": "totcomprelatede","totestcompf": "totestcompf","noindiv100kcnt": "noindiv100kcnt","nocontractor100kcnt": "nocontractor100kcnt","totcntrbgfts": "totcntrbgfts","prgmservcode2acd": "prgmservcode2acd","totrev2acola": "totrev2acola","prgmservcode2bcd": "prgmservcode2bcd","totrev2bcola": "totrev2bcola","prgmservcode2ccd": "prgmservcode2ccd","totrev2ccola": "totrev2ccola","prgmservcode2dcd": "prgmservcode2dcd","totrev2dcola": "totrev2dcola","prgmservcode2ecd": "prgmservcode2ecd","totrev2ecola": "totrev2ecola","totrev2fcola": "totrev2fcola","totprgmrevnue": "totprgmrevnue","invstmntinc": "invstmntinc","txexmptbndsproceeds": "txexmptbndsproceeds","royaltsinc": "royaltsinc","grsrntsreal": "grsrntsreal","grsrntsprsnl": "grsrntsprsnl","rntlexpnsreal": "rntlexpnsreal","rntlexpnsprsnl": "rntlexpnsprsnl","rntlincreal": "rntlincreal","rntlincprsnl": "rntlincprsnl","netrntlinc": "netrntlinc","grsalesecur": "grsalesecur","grsalesothr": "grsalesothr","cstbasisecur": "cstbasisecur","cstbasisothr": "cstbasisothr","gnlsecur": "gnlsecur","gnlsothr": "gnlsothr","netgnls": "netgnls","grsincfndrsng": "grsincfndrsng","lessdirfndrsng": "lessdirfndrsng","netincfndrsng": "netincfndrsng","grsincgaming": "grsincgaming","lessdirgaming": "lessdirgaming","netincgaming": "netincgaming","grsalesinvent": "grsalesinvent","lesscstofgoods": "lesscstofgoods","netincsales": "netincsales","miscrev11acd": "miscrev11acd","miscrevtota": "miscrevtota","miscrev11bcd": "miscrev11bcd","miscrevtot11b": "miscrevtot11b","miscrev11ccd": "miscrev11ccd","miscrevtot11c": "miscrevtot11c","miscrevtot11d": "miscrevtot11d","miscrevtot11e": "miscrevtot11e","totrevenue": "totrevenue","grntstogovt": "grntstogovt","grnsttoindiv": "grnsttoindiv","grntstofrgngovt": "grntstofrgngovt","benifitsmembrs": "benifitsmembrs","compnsatncurrofcr": "compnsatncurrofcr","compnsatnandothr": "compnsatnandothr","othrsalwages": "othrsalwages","pensionplancontrb": "pensionplancontrb","othremplyeebenef": "othremplyeebenef","payrolltx": "payrolltx","feesforsrvcmgmt": "feesforsrvcmgmt","legalfees": "legalfees","accntingfees": "accntingfees","feesforsrvclobby": "feesforsrvclobby","profndraising": "profndraising","feesforsrvcinvstmgmt": "feesforsrvcinvstmgmt","feesforsrvcothr": "feesforsrvcothr","advrtpromo": "advrtpromo","officexpns": "officexpns","infotech": "infotech","royaltsexpns": "royaltsexpns","occupancy": "occupancy","travel": "travel","travelofpublicoffcl": "travelofpublicoffcl","converconventmtng": "converconventmtng","interestamt": "interestamt","pymtoaffiliates": "pymtoaffiliates","deprcatndepletn": "deprcatndepletn","insurance": "insurance","othrexpnsa": "othrexpnsa","othrexpnsb": "othrexpnsb","othrexpnsc": "othrexpnsc","othrexpnsd": "othrexpnsd","othrexpnse": "othrexpnse","othrexpnsf": "othrexpnsf","totfuncexpns": "totfuncexpns","nonintcashend": "nonintcashend","svngstempinvend": "svngstempinvend","pldgegrntrcvblend": "pldgegrntrcvblend","accntsrcvblend": "accntsrcvblend","currfrmrcvblend": "currfrmrcvblend","rcvbldisqualend": "rcvbldisqualend","notesloansrcvblend": "notesloansrcvblend","invntriesalesend": "invntriesalesend","prepaidexpnsend": "prepaidexpnsend","lndbldgsequipend": "lndbldgsequipend","invstmntsend": "invstmntsend","invstmntsothrend": "invstmntsothrend","invstmntsprgmend": "invstmntsprgmend","intangibleassetsend": "intangibleassetsend","othrassetsend": "othrassetsend","totassetsend": "totassetsend","accntspayableend": "accntspayableend","grntspayableend": "grntspayableend","deferedrevnuend": "deferedrevnuend","txexmptbndsend": "txexmptbndsend","escrwaccntliabend": "escrwaccntliabend","paybletoffcrsend": "paybletoffcrsend","secrdmrtgsend": "secrdmrtgsend","unsecurednotesend": "unsecurednotesend","othrliabend": "othrliabend","totliabend": "totliabend","unrstrctnetasstsend": "unrstrctnetasstsend","temprstrctnetasstsend": "temprstrctnetasstsend","permrstrctnetasstsend": "permrstrctnetasstsend","capitalstktrstend": "capitalstktrstend","paidinsurplusend": "paidinsurplusend","retainedearnend": "retainedearnend","totnetassetend": "totnetassetend","totnetliabastend": "totnetliabastend","nonpfrea": "nonpfrea","totnooforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntsrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","exceeds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunreltd170": "netincunreltd170","othrinc170": "othrinc170","totsupp170": "totsupp170","grsrcptsrelated170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmissn509": "grsrcptsadmissn509","grsrcptsactivities509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","exceeds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunrelatd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "4G" - request_cpu: "1" + container_resources: + memory: + request: "16Gi" + cpu: + request: "1" + ephemeral-storage: + request: "10Gi" + + - operator: "GKEDeleteClusterOperator" + args: + task_id: "delete_cluster" + project_id: "{{ var.value.gcp_project }}" + location: "us-central1-c" + name: pdp-irs-990-2017 - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" - args: task_id: "load_irs_990_2017_to_bq" - - # The GCS bucket where the CSV file is located in. bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_2017/data_output.csv"] source_format: "CSV" destination_project_dataset_table: "irs_990.irs_990_2017" - - # Use this if your CSV file contains a header row skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - schema_fields: - name: "ein" type: "string" @@ -845,4 +836,4 @@ dag: mode: "nullable" graph_paths: - - "irs_990_2017_transform_csv >> load_irs_990_2017_to_bq" + - "create_cluster >> irs_990_2017_transform_csv >> delete_cluster >> load_irs_990_2017_to_bq" diff --git a/datasets/irs_990/pipelines/irs_990_ez_2014/irs_990_ez_2014_dag.py b/datasets/irs_990/pipelines/irs_990_ez_2014/irs_990_ez_2014_dag.py index 650df7880..e86675658 100644 --- a/datasets/irs_990/pipelines/irs_990_ez_2014/irs_990_ez_2014_dag.py +++ b/datasets/irs_990/pipelines/irs_990_ez_2014/irs_990_ez_2014_dag.py @@ -38,8 +38,9 @@ task_id="irs_990_ez_2014_transform_csv", startup_timeout_seconds=600, name="irs_990_ez_2014", - service_account_name="datasets", - namespace="composer", + namespace="composer-user-workloads", + service_account_name="default", + config_file="/home/airflow/composer_kube_config", image_pull_policy="Always", image="{{ var.json.irs_990.container_registry.run_csv_transform_kub }}", env_vars={ @@ -52,7 +53,6 @@ "CSV_HEADERS": '["ein","tax_pd","subseccd","totcntrbs","prgmservrev","duesassesmnts","othrinvstinc","grsamtsalesastothr","basisalesexpnsothr","gnsaleofastothr","grsincgaming","grsrevnuefndrsng","direxpns","netincfndrsng","grsalesminusret","costgoodsold","grsprft","othrevnue","totrevnue","totexpns","totexcessyr","othrchgsnetassetfnd","networthend","totassetsend","totliabend","totnetassetsend","actvtynotprevrptcd","chngsinorgcd","unrelbusincd","filedf990tcd","contractioncd","politicalexpend","filedf1120polcd","loanstoofficerscd","loanstoofficers","initiationfee","grspublicrcpts","s4958excessbenefcd","prohibtdtxshltrcd","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', "RENAME_MAPPINGS": '{"EIN": "ein","a_tax_prd": "tax_pd","taxpd": "tax_pd","taxprd": "tax_pd","subseccd": "subseccd","prgmservrev": "prgmservrev","duesassesmnts": "duesassesmnts","othrinvstinc": "othrinvstinc","grsamtsalesastothr": "grsamtsalesastothr","basisalesexpnsothr": "basisalesexpnsothr","gnsaleofastothr": "gnsaleofastothr","grsincgaming": "grsincgaming","grsrevnuefndrsng": "grsrevnuefndrsng","direxpns": "direxpns","netincfndrsng": "netincfndrsng","grsalesminusret": "grsalesminusret","costgoodsold": "costgoodsold","grsprft": "grsprft","othrevnue": "othrevnue","totrevnue": "totrevnue","totexpns": "totexpns","totexcessyr": "totexcessyr","othrchgsnetassetfnd": "othrchgsnetassetfnd","networthend": "networthend","totassetsend": "totassetsend","totliabend": "totliabend","totnetassetsend": "totnetassetsend","actvtynotprevrptcd": "actvtynotprevrptcd","chngsinorgcd": "chngsinorgcd","unrelbusincd": "unrelbusincd","filedf990tcd": "filedf990tcd","contractioncd": "contractioncd","politicalexpend": "politicalexpend","filedfYYN0polcd": "filedf1120polcd","loanstoofficerscd": "loanstoofficerscd","loanstoofficers": "loanstoofficers","initiationfee": "initiationfee","grspublicrcpts": "grspublicrcpts","s4958excessbenefcd": "s4958excessbenefcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","nonpfrea": "nonpfrea","totnoforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","excds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunrelatd170": "netincunreltd170","othrinc170": "othrinc170","totsupport170": "totsupp170","grsrcptsrelatd170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmiss509": "grsrcptsadmissn509","grsrcptsactvts509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","excds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesssub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunreltd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509","elf": "elf","totcntrbs": "totcntrbs"}', }, - resources={"request_memory": "2G", "request_cpu": "1"}, ) # Task to load CSV data to a BigQuery table diff --git a/datasets/irs_990/pipelines/irs_990_ez_2014/pipeline.yaml b/datasets/irs_990/pipelines/irs_990_ez_2014/pipeline.yaml index ed8ee23bd..46e43840a 100644 --- a/datasets/irs_990/pipelines/irs_990_ez_2014/pipeline.yaml +++ b/datasets/irs_990/pipelines/irs_990_ez_2014/pipeline.yaml @@ -14,12 +14,8 @@ --- resources: - - type: bigquery_table - # Required Properties: table_id: irs_990_ez_2014 - - # Description of the table description: "IRS 990 EZ 2014 dataset" dag: @@ -28,8 +24,6 @@ dag: dag_id: irs_990_ez_2014 default_args: owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 @@ -39,29 +33,16 @@ dag: tasks: - operator: "KubernetesPodOperator" - - # Task description description: "Run CSV transform within kubernetes pod" - args: - task_id: "irs_990_ez_2014_transform_csv" - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id name: "irs_990_ez_2014" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - service_account_name: "datasets" - namespace: "composer" - + namespace: "composer-user-workloads" + service_account_name: "default" + config_file: "/home/airflow/composer_kube_config" image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. image: "{{ var.json.irs_990.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. env_vars: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/14eofinextract990ez.zip" SOURCE_FILE: "files/data.dat" @@ -73,38 +54,17 @@ dag: ["ein","tax_pd","subseccd","totcntrbs","prgmservrev","duesassesmnts","othrinvstinc","grsamtsalesastothr","basisalesexpnsothr","gnsaleofastothr","grsincgaming","grsrevnuefndrsng","direxpns","netincfndrsng","grsalesminusret","costgoodsold","grsprft","othrevnue","totrevnue","totexpns","totexcessyr","othrchgsnetassetfnd","networthend","totassetsend","totliabend","totnetassetsend","actvtynotprevrptcd","chngsinorgcd","unrelbusincd","filedf990tcd","contractioncd","politicalexpend","filedf1120polcd","loanstoofficerscd","loanstoofficers","initiationfee","grspublicrcpts","s4958excessbenefcd","prohibtdtxshltrcd","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"] RENAME_MAPPINGS: >- {"EIN": "ein","a_tax_prd": "tax_pd","taxpd": "tax_pd","taxprd": "tax_pd","subseccd": "subseccd","prgmservrev": "prgmservrev","duesassesmnts": "duesassesmnts","othrinvstinc": "othrinvstinc","grsamtsalesastothr": "grsamtsalesastothr","basisalesexpnsothr": "basisalesexpnsothr","gnsaleofastothr": "gnsaleofastothr","grsincgaming": "grsincgaming","grsrevnuefndrsng": "grsrevnuefndrsng","direxpns": "direxpns","netincfndrsng": "netincfndrsng","grsalesminusret": "grsalesminusret","costgoodsold": "costgoodsold","grsprft": "grsprft","othrevnue": "othrevnue","totrevnue": "totrevnue","totexpns": "totexpns","totexcessyr": "totexcessyr","othrchgsnetassetfnd": "othrchgsnetassetfnd","networthend": "networthend","totassetsend": "totassetsend","totliabend": "totliabend","totnetassetsend": "totnetassetsend","actvtynotprevrptcd": "actvtynotprevrptcd","chngsinorgcd": "chngsinorgcd","unrelbusincd": "unrelbusincd","filedf990tcd": "filedf990tcd","contractioncd": "contractioncd","politicalexpend": "politicalexpend","filedfYYN0polcd": "filedf1120polcd","loanstoofficerscd": "loanstoofficerscd","loanstoofficers": "loanstoofficers","initiationfee": "initiationfee","grspublicrcpts": "grspublicrcpts","s4958excessbenefcd": "s4958excessbenefcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","nonpfrea": "nonpfrea","totnoforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","excds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunrelatd170": "netincunreltd170","othrinc170": "othrinc170","totsupport170": "totsupp170","grsrcptsrelatd170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmiss509": "grsrcptsadmissn509","grsrcptsactvts509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","excds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesssub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunreltd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509","elf": "elf","totcntrbs": "totcntrbs"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" - args: - task_id: "load_irs_990_ez_2014_to_bq" - - # The GCS bucket where the CSV file is located in. bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_ez_2014/data_output.csv"] source_format: "CSV" destination_project_dataset_table: "irs_990.irs_990_ez_2014" - - # Use this if your CSV file contains a header row skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - schema_fields: - name: "ein" type: "string" diff --git a/datasets/irs_990/pipelines/irs_990_ez_2015/irs_990_ez_2015_dag.py b/datasets/irs_990/pipelines/irs_990_ez_2015/irs_990_ez_2015_dag.py index 350e1bd72..e77a51446 100644 --- a/datasets/irs_990/pipelines/irs_990_ez_2015/irs_990_ez_2015_dag.py +++ b/datasets/irs_990/pipelines/irs_990_ez_2015/irs_990_ez_2015_dag.py @@ -38,8 +38,9 @@ task_id="irs_990_ez_2015_transform_csv", startup_timeout_seconds=600, name="irs_990_ez_2015", - service_account_name="datasets", - namespace="composer", + namespace="composer-user-workloads", + service_account_name="default", + config_file="/home/airflow/composer_kube_config", image_pull_policy="Always", image="{{ var.json.irs_990.container_registry.run_csv_transform_kub }}", env_vars={ @@ -52,7 +53,6 @@ "CSV_HEADERS": '["ein","elf","tax_pd","subseccd","totcntrbs","prgmservrev","duesassesmnts","othrinvstinc","grsamtsalesastothr","basisalesexpnsothr","gnsaleofastothr","grsincgaming","grsrevnuefndrsng","direxpns","netincfndrsng","grsalesminusret","costgoodsold","grsprft","othrevnue","totrevnue","totexpns","totexcessyr","othrchgsnetassetfnd","networthend","totassetsend","totliabend","totnetassetsend","actvtynotprevrptcd","chngsinorgcd","unrelbusincd","filedf990tcd","contractioncd","politicalexpend","filedf1120polcd","loanstoofficerscd","loanstoofficers","initiationfee","grspublicrcpts","s4958excessbenefcd","prohibtdtxshltrcd","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', "RENAME_MAPPINGS": '{"EIN": "ein","a_tax_prd": "tax_pd","taxpd": "tax_pd","taxprd": "tax_pd","subseccd": "subseccd","prgmservrev": "prgmservrev","duesassesmnts": "duesassesmnts","othrinvstinc": "othrinvstinc","grsamtsalesastothr": "grsamtsalesastothr","basisalesexpnsothr": "basisalesexpnsothr","gnsaleofastothr": "gnsaleofastothr","grsincgaming": "grsincgaming","grsrevnuefndrsng": "grsrevnuefndrsng","direxpns": "direxpns","netincfndrsng": "netincfndrsng","grsalesminusret": "grsalesminusret","costgoodsold": "costgoodsold","grsprft": "grsprft","othrevnue": "othrevnue","totrevnue": "totrevnue","totexpns": "totexpns","totexcessyr": "totexcessyr","othrchgsnetassetfnd": "othrchgsnetassetfnd","networthend": "networthend","totassetsend": "totassetsend","totliabend": "totliabend","totnetassetsend": "totnetassetsend","actvtynotprevrptcd": "actvtynotprevrptcd","chngsinorgcd": "chngsinorgcd","unrelbusincd": "unrelbusincd","filedf990tcd": "filedf990tcd","contractioncd": "contractioncd","politicalexpend": "politicalexpend","filedfYYN0polcd": "filedf1120polcd","loanstoofficerscd": "loanstoofficerscd","loanstoofficers": "loanstoofficers","initiationfee": "initiationfee","grspublicrcpts": "grspublicrcpts","s4958excessbenefcd": "s4958excessbenefcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","nonpfrea": "nonpfrea","totnoforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","excds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunrelatd170": "netincunreltd170","othrinc170": "othrinc170","totsupport170": "totsupp170","grsrcptsrelatd170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmiss509": "grsrcptsadmissn509","grsrcptsactvts509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","excds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesssub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunreltd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509","elf": "elf","totcntrbs": "totcntrbs"}', }, - resources={"request_memory": "2G", "request_cpu": "1"}, ) # Task to load CSV data to a BigQuery table diff --git a/datasets/irs_990/pipelines/irs_990_ez_2015/pipeline.yaml b/datasets/irs_990/pipelines/irs_990_ez_2015/pipeline.yaml index 3c0476290..9b3d3a29f 100644 --- a/datasets/irs_990/pipelines/irs_990_ez_2015/pipeline.yaml +++ b/datasets/irs_990/pipelines/irs_990_ez_2015/pipeline.yaml @@ -14,12 +14,8 @@ --- resources: - - type: bigquery_table - # Required Properties: table_id: irs_990_ez_2015 - - # Description of the table description: "IRS 990 EZ 2015 dataset" dag: @@ -28,8 +24,6 @@ dag: dag_id: irs_990_ez_2015 default_args: owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 @@ -39,29 +33,16 @@ dag: tasks: - operator: "KubernetesPodOperator" - - # Task description description: "Run CSV transform within kubernetes pod" - args: - task_id: "irs_990_ez_2015_transform_csv" - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id name: "irs_990_ez_2015" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - service_account_name: "datasets" - namespace: "composer" - + namespace: "composer-user-workloads" + service_account_name: "default" + config_file: "/home/airflow/composer_kube_config" image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. image: "{{ var.json.irs_990.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. env_vars: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/15eofinextractEZ.dat" SOURCE_FILE: "files/data.dat" @@ -73,38 +54,17 @@ dag: ["ein","elf","tax_pd","subseccd","totcntrbs","prgmservrev","duesassesmnts","othrinvstinc","grsamtsalesastothr","basisalesexpnsothr","gnsaleofastothr","grsincgaming","grsrevnuefndrsng","direxpns","netincfndrsng","grsalesminusret","costgoodsold","grsprft","othrevnue","totrevnue","totexpns","totexcessyr","othrchgsnetassetfnd","networthend","totassetsend","totliabend","totnetassetsend","actvtynotprevrptcd","chngsinorgcd","unrelbusincd","filedf990tcd","contractioncd","politicalexpend","filedf1120polcd","loanstoofficerscd","loanstoofficers","initiationfee","grspublicrcpts","s4958excessbenefcd","prohibtdtxshltrcd","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"] RENAME_MAPPINGS: >- {"EIN": "ein","a_tax_prd": "tax_pd","taxpd": "tax_pd","taxprd": "tax_pd","subseccd": "subseccd","prgmservrev": "prgmservrev","duesassesmnts": "duesassesmnts","othrinvstinc": "othrinvstinc","grsamtsalesastothr": "grsamtsalesastothr","basisalesexpnsothr": "basisalesexpnsothr","gnsaleofastothr": "gnsaleofastothr","grsincgaming": "grsincgaming","grsrevnuefndrsng": "grsrevnuefndrsng","direxpns": "direxpns","netincfndrsng": "netincfndrsng","grsalesminusret": "grsalesminusret","costgoodsold": "costgoodsold","grsprft": "grsprft","othrevnue": "othrevnue","totrevnue": "totrevnue","totexpns": "totexpns","totexcessyr": "totexcessyr","othrchgsnetassetfnd": "othrchgsnetassetfnd","networthend": "networthend","totassetsend": "totassetsend","totliabend": "totliabend","totnetassetsend": "totnetassetsend","actvtynotprevrptcd": "actvtynotprevrptcd","chngsinorgcd": "chngsinorgcd","unrelbusincd": "unrelbusincd","filedf990tcd": "filedf990tcd","contractioncd": "contractioncd","politicalexpend": "politicalexpend","filedfYYN0polcd": "filedf1120polcd","loanstoofficerscd": "loanstoofficerscd","loanstoofficers": "loanstoofficers","initiationfee": "initiationfee","grspublicrcpts": "grspublicrcpts","s4958excessbenefcd": "s4958excessbenefcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","nonpfrea": "nonpfrea","totnoforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","excds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunrelatd170": "netincunreltd170","othrinc170": "othrinc170","totsupport170": "totsupp170","grsrcptsrelatd170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmiss509": "grsrcptsadmissn509","grsrcptsactvts509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","excds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesssub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunreltd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509","elf": "elf","totcntrbs": "totcntrbs"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" - args: - task_id: "load_irs_990_ez_2015_to_bq" - - # The GCS bucket where the CSV file is located in. bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_ez_2015/data_output.csv"] source_format: "CSV" destination_project_dataset_table: "irs_990.irs_990_ez_2015" - - # Use this if your CSV file contains a header row skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - schema_fields: - name: "ein" type: "string" diff --git a/datasets/irs_990/pipelines/irs_990_ez_2016/irs_990_ez_2016_dag.py b/datasets/irs_990/pipelines/irs_990_ez_2016/irs_990_ez_2016_dag.py index 7fa5e1360..84394646f 100644 --- a/datasets/irs_990/pipelines/irs_990_ez_2016/irs_990_ez_2016_dag.py +++ b/datasets/irs_990/pipelines/irs_990_ez_2016/irs_990_ez_2016_dag.py @@ -38,8 +38,9 @@ task_id="irs_990_ez_2016_transform_csv", startup_timeout_seconds=600, name="irs_990_ez_2016", - service_account_name="datasets", - namespace="composer", + namespace="composer-user-workloads", + service_account_name="default", + config_file="/home/airflow/composer_kube_config", image_pull_policy="Always", image="{{ var.json.irs_990.container_registry.run_csv_transform_kub }}", env_vars={ @@ -52,7 +53,6 @@ "CSV_HEADERS": '["ein","elf","tax_pd","subseccd","totcntrbs","prgmservrev","duesassesmnts","othrinvstinc","grsamtsalesastothr","basisalesexpnsothr","gnsaleofastothr","grsincgaming","grsrevnuefndrsng","direxpns","netincfndrsng","grsalesminusret","costgoodsold","grsprft","othrevnue","totrevnue","totexpns","totexcessyr","othrchgsnetassetfnd","networthend","totassetsend","totliabend","totnetassetsend","actvtynotprevrptcd","chngsinorgcd","unrelbusincd","filedf990tcd","contractioncd","politicalexpend","filedf1120polcd","loanstoofficerscd","loanstoofficers","initiationfee","grspublicrcpts","s4958excessbenefcd","prohibtdtxshltrcd","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', "RENAME_MAPPINGS": '{"EIN": "ein","a_tax_prd": "tax_pd","taxpd": "tax_pd","taxprd": "tax_pd","subseccd": "subseccd","prgmservrev": "prgmservrev","duesassesmnts": "duesassesmnts","othrinvstinc": "othrinvstinc","grsamtsalesastothr": "grsamtsalesastothr","basisalesexpnsothr": "basisalesexpnsothr","gnsaleofastothr": "gnsaleofastothr","grsincgaming": "grsincgaming","grsrevnuefndrsng": "grsrevnuefndrsng","direxpns": "direxpns","netincfndrsng": "netincfndrsng","grsalesminusret": "grsalesminusret","costgoodsold": "costgoodsold","grsprft": "grsprft","othrevnue": "othrevnue","totrevnue": "totrevnue","totexpns": "totexpns","totexcessyr": "totexcessyr","othrchgsnetassetfnd": "othrchgsnetassetfnd","networthend": "networthend","totassetsend": "totassetsend","totliabend": "totliabend","totnetassetsend": "totnetassetsend","actvtynotprevrptcd": "actvtynotprevrptcd","chngsinorgcd": "chngsinorgcd","unrelbusincd": "unrelbusincd","filedf990tcd": "filedf990tcd","contractioncd": "contractioncd","politicalexpend": "politicalexpend","filedfYYN0polcd": "filedf1120polcd","loanstoofficerscd": "loanstoofficerscd","loanstoofficers": "loanstoofficers","initiationfee": "initiationfee","grspublicrcpts": "grspublicrcpts","s4958excessbenefcd": "s4958excessbenefcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","nonpfrea": "nonpfrea","totnoforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","excds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunrelatd170": "netincunreltd170","othrinc170": "othrinc170","totsupport170": "totsupp170","grsrcptsrelatd170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmiss509": "grsrcptsadmissn509","grsrcptsactvts509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","excds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesssub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunreltd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509","elf": "elf","totcntrbs": "totcntrbs"}', }, - resources={"request_memory": "2G", "request_cpu": "1"}, ) # Task to load CSV data to a BigQuery table diff --git a/datasets/irs_990/pipelines/irs_990_ez_2016/pipeline.yaml b/datasets/irs_990/pipelines/irs_990_ez_2016/pipeline.yaml index 590bc9ac4..1ddbf52ff 100644 --- a/datasets/irs_990/pipelines/irs_990_ez_2016/pipeline.yaml +++ b/datasets/irs_990/pipelines/irs_990_ez_2016/pipeline.yaml @@ -14,12 +14,8 @@ --- resources: - - type: bigquery_table - # Required Properties: table_id: irs_990_ez_2016 - - # Description of the table description: "IRS 990 EZ 2016 dataset" dag: @@ -28,8 +24,6 @@ dag: dag_id: irs_990_ez_2016 default_args: owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 @@ -39,29 +33,16 @@ dag: tasks: - operator: "KubernetesPodOperator" - - # Task description description: "Run CSV transform within kubernetes pod" - args: - task_id: "irs_990_ez_2016_transform_csv" - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id name: "irs_990_ez_2016" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - service_account_name: "datasets" - namespace: "composer" - + namespace: "composer-user-workloads" + service_account_name: "default" + config_file: "/home/airflow/composer_kube_config" image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. image: "{{ var.json.irs_990.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. env_vars: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/16eofinextractez.dat" SOURCE_FILE: "files/data.dat" @@ -73,38 +54,17 @@ dag: ["ein","elf","tax_pd","subseccd","totcntrbs","prgmservrev","duesassesmnts","othrinvstinc","grsamtsalesastothr","basisalesexpnsothr","gnsaleofastothr","grsincgaming","grsrevnuefndrsng","direxpns","netincfndrsng","grsalesminusret","costgoodsold","grsprft","othrevnue","totrevnue","totexpns","totexcessyr","othrchgsnetassetfnd","networthend","totassetsend","totliabend","totnetassetsend","actvtynotprevrptcd","chngsinorgcd","unrelbusincd","filedf990tcd","contractioncd","politicalexpend","filedf1120polcd","loanstoofficerscd","loanstoofficers","initiationfee","grspublicrcpts","s4958excessbenefcd","prohibtdtxshltrcd","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"] RENAME_MAPPINGS: >- {"EIN": "ein","a_tax_prd": "tax_pd","taxpd": "tax_pd","taxprd": "tax_pd","subseccd": "subseccd","prgmservrev": "prgmservrev","duesassesmnts": "duesassesmnts","othrinvstinc": "othrinvstinc","grsamtsalesastothr": "grsamtsalesastothr","basisalesexpnsothr": "basisalesexpnsothr","gnsaleofastothr": "gnsaleofastothr","grsincgaming": "grsincgaming","grsrevnuefndrsng": "grsrevnuefndrsng","direxpns": "direxpns","netincfndrsng": "netincfndrsng","grsalesminusret": "grsalesminusret","costgoodsold": "costgoodsold","grsprft": "grsprft","othrevnue": "othrevnue","totrevnue": "totrevnue","totexpns": "totexpns","totexcessyr": "totexcessyr","othrchgsnetassetfnd": "othrchgsnetassetfnd","networthend": "networthend","totassetsend": "totassetsend","totliabend": "totliabend","totnetassetsend": "totnetassetsend","actvtynotprevrptcd": "actvtynotprevrptcd","chngsinorgcd": "chngsinorgcd","unrelbusincd": "unrelbusincd","filedf990tcd": "filedf990tcd","contractioncd": "contractioncd","politicalexpend": "politicalexpend","filedfYYN0polcd": "filedf1120polcd","loanstoofficerscd": "loanstoofficerscd","loanstoofficers": "loanstoofficers","initiationfee": "initiationfee","grspublicrcpts": "grspublicrcpts","s4958excessbenefcd": "s4958excessbenefcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","nonpfrea": "nonpfrea","totnoforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","excds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunrelatd170": "netincunreltd170","othrinc170": "othrinc170","totsupport170": "totsupp170","grsrcptsrelatd170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmiss509": "grsrcptsadmissn509","grsrcptsactvts509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","excds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesssub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunreltd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509","elf": "elf","totcntrbs": "totcntrbs"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" - args: - task_id: "load_irs_990_ez_2016_to_bq" - - # The GCS bucket where the CSV file is located in. bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_ez_2016/data_output.csv"] source_format: "CSV" destination_project_dataset_table: "irs_990.irs_990_ez_2016" - - # Use this if your CSV file contains a header row skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - schema_fields: - name: "ein" type: "string" diff --git a/datasets/irs_990/pipelines/irs_990_ez_2017/irs_990_ez_2017_dag.py b/datasets/irs_990/pipelines/irs_990_ez_2017/irs_990_ez_2017_dag.py index 25159681d..a08a15b96 100644 --- a/datasets/irs_990/pipelines/irs_990_ez_2017/irs_990_ez_2017_dag.py +++ b/datasets/irs_990/pipelines/irs_990_ez_2017/irs_990_ez_2017_dag.py @@ -38,8 +38,9 @@ task_id="irs_990_ez_2017_transform_csv", startup_timeout_seconds=600, name="irs_990_ez_2017", - service_account_name="datasets", - namespace="composer", + namespace="composer-user-workloads", + service_account_name="default", + config_file="/home/airflow/composer_kube_config", image_pull_policy="Always", image="{{ var.json.irs_990.container_registry.run_csv_transform_kub }}", env_vars={ @@ -52,7 +53,6 @@ "CSV_HEADERS": '["ein","elf","tax_pd","subseccd","totcntrbs","prgmservrev","duesassesmnts","othrinvstinc","grsamtsalesastothr","basisalesexpnsothr","gnsaleofastothr","grsincgaming","grsrevnuefndrsng","direxpns","netincfndrsng","grsalesminusret","costgoodsold","grsprft","othrevnue","totrevnue","totexpns","totexcessyr","othrchgsnetassetfnd","networthend","totassetsend","totliabend","totnetassetsend","actvtynotprevrptcd","chngsinorgcd","unrelbusincd","filedf990tcd","contractioncd","politicalexpend","filedf1120polcd","loanstoofficerscd","loanstoofficers","initiationfee","grspublicrcpts","s4958excessbenefcd","prohibtdtxshltrcd","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"]', "RENAME_MAPPINGS": '{"EIN": "ein","a_tax_prd": "tax_pd","taxpd": "tax_pd","taxprd": "tax_pd","subseccd": "subseccd","prgmservrev": "prgmservrev","duesassesmnts": "duesassesmnts","othrinvstinc": "othrinvstinc","grsamtsalesastothr": "grsamtsalesastothr","basisalesexpnsothr": "basisalesexpnsothr","gnsaleofastothr": "gnsaleofastothr","grsincgaming": "grsincgaming","grsrevnuefndrsng": "grsrevnuefndrsng","direxpns": "direxpns","netincfndrsng": "netincfndrsng","grsalesminusret": "grsalesminusret","costgoodsold": "costgoodsold","grsprft": "grsprft","othrevnue": "othrevnue","totrevnue": "totrevnue","totexpns": "totexpns","totexcessyr": "totexcessyr","othrchgsnetassetfnd": "othrchgsnetassetfnd","networthend": "networthend","totassetsend": "totassetsend","totliabend": "totliabend","totnetassetsend": "totnetassetsend","actvtynotprevrptcd": "actvtynotprevrptcd","chngsinorgcd": "chngsinorgcd","unrelbusincd": "unrelbusincd","filedf990tcd": "filedf990tcd","contractioncd": "contractioncd","politicalexpend": "politicalexpend","filedfYYN0polcd": "filedf1120polcd","loanstoofficerscd": "loanstoofficerscd","loanstoofficers": "loanstoofficers","initiationfee": "initiationfee","grspublicrcpts": "grspublicrcpts","s4958excessbenefcd": "s4958excessbenefcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","nonpfrea": "nonpfrea","totnoforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","excds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunrelatd170": "netincunreltd170","othrinc170": "othrinc170","totsupport170": "totsupp170","grsrcptsrelatd170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmiss509": "grsrcptsadmissn509","grsrcptsactvts509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","excds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesssub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunreltd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509","elf": "elf","totcntrbs": "totcntrbs"}', }, - resources={"request_memory": "2G", "request_cpu": "1"}, ) # Task to load CSV data to a BigQuery table diff --git a/datasets/irs_990/pipelines/irs_990_ez_2017/pipeline.yaml b/datasets/irs_990/pipelines/irs_990_ez_2017/pipeline.yaml index bb822a7ee..59f1bf6eb 100644 --- a/datasets/irs_990/pipelines/irs_990_ez_2017/pipeline.yaml +++ b/datasets/irs_990/pipelines/irs_990_ez_2017/pipeline.yaml @@ -14,12 +14,8 @@ --- resources: - - type: bigquery_table - # Required Properties: table_id: irs_990_ez_2017 - - # Description of the table description: "IRS 990 EZ 2017 dataset" dag: @@ -28,8 +24,6 @@ dag: dag_id: irs_990_ez_2017 default_args: owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 @@ -39,29 +33,16 @@ dag: tasks: - operator: "KubernetesPodOperator" - - # Task description description: "Run CSV transform within kubernetes pod" - args: - task_id: "irs_990_ez_2017_transform_csv" - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id name: "irs_990_ez_2017" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - service_account_name: "datasets" - namespace: "composer" - + namespace: "composer-user-workloads" + service_account_name: "default" + config_file: "/home/airflow/composer_kube_config" image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. image: "{{ var.json.irs_990.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. env_vars: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/17eofinextractEZ.dat" SOURCE_FILE: "files/data.dat" @@ -73,38 +54,17 @@ dag: ["ein","elf","tax_pd","subseccd","totcntrbs","prgmservrev","duesassesmnts","othrinvstinc","grsamtsalesastothr","basisalesexpnsothr","gnsaleofastothr","grsincgaming","grsrevnuefndrsng","direxpns","netincfndrsng","grsalesminusret","costgoodsold","grsprft","othrevnue","totrevnue","totexpns","totexcessyr","othrchgsnetassetfnd","networthend","totassetsend","totliabend","totnetassetsend","actvtynotprevrptcd","chngsinorgcd","unrelbusincd","filedf990tcd","contractioncd","politicalexpend","filedf1120polcd","loanstoofficerscd","loanstoofficers","initiationfee","grspublicrcpts","s4958excessbenefcd","prohibtdtxshltrcd","nonpfrea","totnooforgscnt","totsupport","gftgrntsrcvd170","txrevnuelevied170","srvcsval170","pubsuppsubtot170","exceeds2pct170","pubsupplesspct170","samepubsuppsubtot170","grsinc170","netincunreltd170","othrinc170","totsupp170","grsrcptsrelated170","totgftgrntrcvd509","grsrcptsadmissn509","grsrcptsactivities509","txrevnuelevied509","srvcsval509","pubsuppsubtot509","rcvdfrmdisqualsub509","exceeds1pct509","subtotpub509","pubsupplesub509","samepubsuppsubtot509","grsinc509","unreltxincls511tx509","subtotsuppinc509","netincunrelatd509","othrinc509","totsupp509"] RENAME_MAPPINGS: >- {"EIN": "ein","a_tax_prd": "tax_pd","taxpd": "tax_pd","taxprd": "tax_pd","subseccd": "subseccd","prgmservrev": "prgmservrev","duesassesmnts": "duesassesmnts","othrinvstinc": "othrinvstinc","grsamtsalesastothr": "grsamtsalesastothr","basisalesexpnsothr": "basisalesexpnsothr","gnsaleofastothr": "gnsaleofastothr","grsincgaming": "grsincgaming","grsrevnuefndrsng": "grsrevnuefndrsng","direxpns": "direxpns","netincfndrsng": "netincfndrsng","grsalesminusret": "grsalesminusret","costgoodsold": "costgoodsold","grsprft": "grsprft","othrevnue": "othrevnue","totrevnue": "totrevnue","totexpns": "totexpns","totexcessyr": "totexcessyr","othrchgsnetassetfnd": "othrchgsnetassetfnd","networthend": "networthend","totassetsend": "totassetsend","totliabend": "totliabend","totnetassetsend": "totnetassetsend","actvtynotprevrptcd": "actvtynotprevrptcd","chngsinorgcd": "chngsinorgcd","unrelbusincd": "unrelbusincd","filedf990tcd": "filedf990tcd","contractioncd": "contractioncd","politicalexpend": "politicalexpend","filedfYYN0polcd": "filedf1120polcd","loanstoofficerscd": "loanstoofficerscd","loanstoofficers": "loanstoofficers","initiationfee": "initiationfee","grspublicrcpts": "grspublicrcpts","s4958excessbenefcd": "s4958excessbenefcd","prohibtdtxshltrcd": "prohibtdtxshltrcd","nonpfrea": "nonpfrea","totnoforgscnt": "totnooforgscnt","totsupport": "totsupport","gftgrntrcvd170": "gftgrntsrcvd170","txrevnuelevied170": "txrevnuelevied170","srvcsval170": "srvcsval170","pubsuppsubtot170": "pubsuppsubtot170","excds2pct170": "exceeds2pct170","pubsupplesspct170": "pubsupplesspct170","samepubsuppsubtot170": "samepubsuppsubtot170","grsinc170": "grsinc170","netincunrelatd170": "netincunreltd170","othrinc170": "othrinc170","totsupport170": "totsupp170","grsrcptsrelatd170": "grsrcptsrelated170","totgftgrntrcvd509": "totgftgrntrcvd509","grsrcptsadmiss509": "grsrcptsadmissn509","grsrcptsactvts509": "grsrcptsactivities509","txrevnuelevied509": "txrevnuelevied509","srvcsval509": "srvcsval509","pubsuppsubtot509": "pubsuppsubtot509","rcvdfrmdisqualsub509": "rcvdfrmdisqualsub509","excds1pct509": "exceeds1pct509","subtotpub509": "subtotpub509","pubsupplesssub509": "pubsupplesub509","samepubsuppsubtot509": "samepubsuppsubtot509","grsinc509": "grsinc509","unreltxincls511tx509": "unreltxincls511tx509","subtotsuppinc509": "subtotsuppinc509","netincunreltd509": "netincunrelatd509","othrinc509": "othrinc509","totsupp509": "totsupp509","elf": "elf","totcntrbs": "totcntrbs"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" - args: - task_id: "load_irs_990_ez_2017_to_bq" - - # The GCS bucket where the CSV file is located in. bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_ez_2017/data_output.csv"] source_format: "CSV" destination_project_dataset_table: "irs_990.irs_990_ez_2017" - - # Use this if your CSV file contains a header row skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - schema_fields: - name: "ein" type: "string" diff --git a/datasets/irs_990/pipelines/irs_990_pf_2014/irs_990_pf_2014_dag.py b/datasets/irs_990/pipelines/irs_990_pf_2014/irs_990_pf_2014_dag.py index f78a05796..4f2f6d187 100644 --- a/datasets/irs_990/pipelines/irs_990_pf_2014/irs_990_pf_2014_dag.py +++ b/datasets/irs_990/pipelines/irs_990_pf_2014/irs_990_pf_2014_dag.py @@ -38,8 +38,9 @@ task_id="irs_990_pf_2014_transform_csv", startup_timeout_seconds=600, name="irs_990_pf_2014", - service_account_name="datasets", - namespace="composer", + namespace="composer-user-workloads", + service_account_name="default", + config_file="/home/airflow/composer_kube_config", image_pull_policy="Always", image="{{ var.json.irs_990.container_registry.run_csv_transform_kub }}", env_vars={ @@ -52,7 +53,6 @@ "CSV_HEADERS": '["ein","tax_prd","eostatus","tax_yr","operatingcd","subcd","fairmrktvalamt","grscontrgifts","schedbind","intrstrvnue","dividndsamt","grsrents","grsslspramt","costsold","grsprofitbus","otherincamt","totrcptperbks","compofficers","pensplemplbenf","legalfeesamt","accountingfees","interestamt","depreciationamt","occupancyamt","travlconfmtngs","printingpubl","topradmnexpnsa","contrpdpbks","totexpnspbks","excessrcpts","totrcptnetinc","topradmnexpnsb","totexpnsnetinc","netinvstinc","trcptadjnetinc","totexpnsadjnet","adjnetinc","topradmnexpnsd","totexpnsexempt","othrcashamt","invstgovtoblig","invstcorpstk","invstcorpbnd","totinvstsec","mrtgloans","othrinvstend","othrassetseoy","totassetsend","mrtgnotespay","othrliabltseoy","totliabend","tfundnworth","fairmrktvaleoy","totexcapgnls","totexcapgn","totexcapls","invstexcisetx","sec4940notxcd","sec4940redtxcd","sect511tx","subtitleatx","totaxpyr","esttaxcr","txwithldsrc","txpaidf2758","erronbkupwthld","estpnlty","taxdue","overpay","crelamt","infleg","actnotpr","chgnprvrptcd","filedf990tcd","contractncd","furnishcpycd","claimstatcd","cntrbtrstxyrcd","acqdrindrintcd","orgcmplypubcd","filedlf1041ind","propexchcd","brwlndmnycd","furngoodscd","paidcmpncd","transfercd","agremkpaycd","exceptactsind","prioractvcd","undistrinccd","applyprovind","dirindirintcd","excesshldcd","invstjexmptcd","prevjexmptcd","propgndacd","ipubelectcd","grntindivcd","nchrtygrntcd","nreligiouscd","excptransind","rfprsnlbnftind","pyprsnlbnftind","tfairmrktunuse","valncharitassets","cmpmininvstret","distribamt","undistribincyr","adjnetinccola","adjnetinccolb","adjnetinccolc","adjnetinccold","adjnetinctot","qlfydistriba","qlfydistribb","qlfydistribc","qlfydistribd","qlfydistribtot","valassetscola","valassetscolb","valassetscolc","valassetscold","valassetstot","qlfyasseta","qlfyassetb","qlfyassetc","qlfyassetd","qlfyassettot","endwmntscola","endwmntscolb","endwmntscolc","endwmntscold","endwmntstot","totsuprtcola","totsuprtcolb","totsuprtcolc","totsuprtcold","totsuprttot","pubsuprtcola","pubsuprtcolb","pubsuprtcolc","pubsuprtcold","pubsuprttot","grsinvstinca","grsinvstincb","grsinvstincc","grsinvstincd","grsinvstinctot","grntapprvfut","progsrvcacold","progsrvcacole","progsrvcbcold","progsrvcbcole","progsrvcccold","progsrvcccole","progsrvcdcold","progsrvcdcole","progsrvcecold","progsrvcecole","progsrvcfcold","progsrvcfcole","progsrvcgcold","progsrvcgcole","membershpduesd","membershpduese","intonsvngsd","intonsvngse","dvdndsintd","dvdndsinte","trnsfrcashcd","trnsothasstscd","salesasstscd","prchsasstscd","rentlsfacltscd","reimbrsmntscd","loansguarcd","perfservicescd","sharngasstscd"]', "RENAME_MAPPINGS": '{"EIN": "ein","TAX_PRD": "tax_prd","EOSTATUS": "eostatus","TAX_YR": "tax_yr","OPERATINGCD": "operatingcd","SUBCD": "subcd","FAIRMRKTVALAMT": "fairmrktvalamt","GRSCONTRGIFTS": "grscontrgifts","SCHEDBIND": "schedbind","INTRSTRVNUE": "intrstrvnue","DIVIDNDSAMT": "dividndsamt","GRSRENTS": "grsrents","GRSSLSPRAMT": "grsslspramt","COSTSOLD": "costsold","GRSPROFITBUS": "grsprofitbus","OTHERINCAMT": "otherincamt","TOTRCPTPERBKS": "totrcptperbks","COMPOFFICERS": "compofficers","PENSPLEMPLBENF": "pensplemplbenf","LEGALFEESAMT": "legalfeesamt","ACCOUNTINGFEES": "accountingfees","INTERESTAMT": "interestamt","DEPRECIATIONAMT": "depreciationamt","OCCUPANCYAMT": "occupancyamt","TRAVLCONFMTNGS": "travlconfmtngs","PRINTINGPUBL": "printingpubl","TOPRADMNEXPNSA": "topradmnexpnsa","CONTRPDPBKS": "contrpdpbks","TOTEXPNSPBKS": "totexpnspbks","EXCESSRCPTS": "excessrcpts","TOTRCPTNETINC": "totrcptnetinc","TOPRADMNEXPNSB": "topradmnexpnsb","TOTEXPNSNETINC": "totexpnsnetinc","NETINVSTINC": "netinvstinc","TRCPTADJNETINC": "trcptadjnetinc","TOTEXPNSADJNET": "totexpnsadjnet","ADJNETINC": "adjnetinc","TOPRADMNEXPNSD": "topradmnexpnsd","TOTEXPNSEXEMPT": "totexpnsexempt","OTHRCASHAMT": "othrcashamt","INVSTGOVTOBLIG": "invstgovtoblig","INVSTCORPSTK": "invstcorpstk","INVSTCORPBND": "invstcorpbnd","TOTINVSTSEC": "totinvstsec","MRTGLOANS": "mrtgloans","OTHRINVSTEND": "othrinvstend","OTHRASSETSEOY": "othrassetseoy","TOTASSETSEND": "totassetsend","MRTGNOTESPAY": "mrtgnotespay","OTHRLIABLTSEOY": "othrliabltseoy","TOTLIABEND": "totliabend","TFUNDNWORTH": "tfundnworth","FAIRMRKTVALEOY": "fairmrktvaleoy","TOTEXCAPGNLS": "totexcapgnls","TOTEXCAPGN": "totexcapgn","TOTEXCAPLS": "totexcapls","INVSTEXCISETX": "invstexcisetx","SEC4940NOTXCD": "sec4940notxcd","SEC4940REDTXCD": "sec4940redtxcd","SECT511TX": "sect511tx","SUBTITLEATX": "subtitleatx","TOTAXPYR": "totaxpyr","ESTTAXCR": "esttaxcr","TXWITHLDSRC": "txwithldsrc","TXPAIDF2758": "txpaidf2758","ERRONBKUPWTHLD": "erronbkupwthld","ESTPNLTY": "estpnlty","TAXDUE": "taxdue","OVERPAY": "overpay","CRELAMT": "crelamt","INFLEG": "infleg","ACTNOTPR": "actnotpr","CHGNPRVRPTCD": "chgnprvrptcd","FILEDF990TCD": "filedf990tcd","CONTRACTNCD": "contractncd","FURNISHCPYCD": "furnishcpycd","CLAIMSTATCD": "claimstatcd","CNTRBTRSTXYRCD": "cntrbtrstxyrcd","ACQDRINDRINTCD": "acqdrindrintcd","ORGCMPLYPUBCD": "orgcmplypubcd","FILEDLF1041IND": "filedlf1041ind","PROPEXCHCD": "propexchcd","BRWLNDMNYCD": "brwlndmnycd","FURNGOODSCD": "furngoodscd","PAIDCMPNCD": "paidcmpncd","TRANSFERCD": "transfercd","AGREMKPAYCD": "agremkpaycd","EXCEPTACTSIND": "exceptactsind","PRIORACTVCD": "prioractvcd","UNDISTRINCCD": "undistrinccd","APPLYPROVIND": "applyprovind","DIRINDIRINTCD": "dirindirintcd","EXCESSHLDCD": "excesshldcd","INVSTJEXMPTCD": "invstjexmptcd","PREVJEXMPTCD": "prevjexmptcd","PROPGNDACD": "propgndacd","IPUBELECTCD": "ipubelectcd","GRNTINDIVCD": "grntindivcd","NCHRTYGRNTCD": "nchrtygrntcd","NRELIGIOUSCD": "nreligiouscd","EXCPTRANSIND": "excptransind","RFPRSNLBNFTIND": "rfprsnlbnftind","PYPRSNLBNFTIND": "pyprsnlbnftind","TFAIRMRKTUNUSE": "tfairmrktunuse","VALNCHARITASSETS": "valncharitassets","CMPMININVSTRET": "cmpmininvstret","DISTRIBAMT": "distribamt","UNDISTRIBINCYR": "undistribincyr","ADJNETINCCOLA": "adjnetinccola","ADJNETINCCOLB": "adjnetinccolb","ADJNETINCCOLC": "adjnetinccolc","ADJNETINCCOLD": "adjnetinccold","ADJNETINCTOT": "adjnetinctot","QLFYDISTRIBA": "qlfydistriba","QLFYDISTRIBB": "qlfydistribb","QLFYDISTRIBC": "qlfydistribc","QLFYDISTRIBD": "qlfydistribd","QLFYDISTRIBTOT": "qlfydistribtot","VALASSETSCOLA": "valassetscola","VALASSETSCOLB": "valassetscolb","VALASSETSCOLC": "valassetscolc","VALASSETSCOLD": "valassetscold","VALASSETSTOT": "valassetstot","QLFYASSETA": "qlfyasseta","QLFYASSETB": "qlfyassetb","QLFYASSETC": "qlfyassetc","QLFYASSETD": "qlfyassetd","QLFYASSETTOT": "qlfyassettot","ENDWMNTSCOLA": "endwmntscola","ENDWMNTSCOLB": "endwmntscolb","ENDWMNTSCOLC": "endwmntscolc","ENDWMNTSCOLD": "endwmntscold","ENDWMNTSTOT": "endwmntstot","TOTSUPRTCOLA": "totsuprtcola","TOTSUPRTCOLB": "totsuprtcolb","TOTSUPRTCOLC": "totsuprtcolc","TOTSUPRTCOLD": "totsuprtcold","TOTSUPRTTOT": "totsuprttot","PUBSUPRTCOLA": "pubsuprtcola","PUBSUPRTCOLB": "pubsuprtcolb","PUBSUPRTCOLC": "pubsuprtcolc","PUBSUPRTCOLD": "pubsuprtcold","PUBSUPRTTOT": "pubsuprttot","GRSINVSTINCA": "grsinvstinca","GRSINVSTINCB": "grsinvstincb","GRSINVSTINCC": "grsinvstincc","GRSINVSTINCD": "grsinvstincd","GRSINVSTINCTOT": "grsinvstinctot","GRNTAPPRVFUT": "grntapprvfut","PROGSRVCACOLD": "progsrvcacold","PROGSRVCACOLE": "progsrvcacole","PROGSRVCBCOLD": "progsrvcbcold","PROGSRVCBCOLE": "progsrvcbcole","PROGSRVCCCOLD": "progsrvcccold","PROGSRVCCCOLE": "progsrvcccole","PROGSRVCDCOLD": "progsrvcdcold","PROGSRVCDCOLE": "progsrvcdcole","PROGSRVCECOLD": "progsrvcecold","PROGSRVCECOLE": "progsrvcecole","PROGSRVCFCOLD": "progsrvcfcold","PROGSRVCFCOLE": "progsrvcfcole","PROGSRVCGCOLD": "progsrvcgcold","PROGSRVCGCOLE": "progsrvcgcole","MEMBERSHPDUESD": "membershpduesd","MEMBERSHPDUESE": "membershpduese","INTONSVNGSD": "intonsvngsd","INTONSVNGSE": "intonsvngse","DVDNDSINTD": "dvdndsintd","DVDNDSINTE": "dvdndsinte","TRNSFRCASHCD": "trnsfrcashcd","TRNSOTHASSTSCD": "trnsothasstscd","SALESASSTSCD": "salesasstscd","PRCHSASSTSCD": "prchsasstscd","RENTLSFACLTSCD": "rentlsfacltscd","REIMBRSMNTSCD": "reimbrsmntscd","LOANSGUARCD": "loansguarcd","PERFSERVICESCD": "perfservicescd","SHARNGASSTSCD": "sharngasstscd"}', }, - resources={"request_memory": "2G", "request_cpu": "1"}, ) # Task to load CSV data to a BigQuery table diff --git a/datasets/irs_990/pipelines/irs_990_pf_2014/pipeline.yaml b/datasets/irs_990/pipelines/irs_990_pf_2014/pipeline.yaml index a545fd911..81d5431c0 100644 --- a/datasets/irs_990/pipelines/irs_990_pf_2014/pipeline.yaml +++ b/datasets/irs_990/pipelines/irs_990_pf_2014/pipeline.yaml @@ -14,12 +14,8 @@ --- resources: - - type: bigquery_table - # Required Properties: table_id: irs_990_pf_2014 - - # Description of the table description: "IRS 990 PF 2014 dataset" dag: @@ -28,8 +24,6 @@ dag: dag_id: irs_990_pf_2014 default_args: owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 @@ -39,29 +33,16 @@ dag: tasks: - operator: "KubernetesPodOperator" - - # Task description description: "Run CSV transform within kubernetes pod" - args: - task_id: "irs_990_pf_2014_transform_csv" - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id name: "irs_990_pf_2014" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - service_account_name: "datasets" - namespace: "composer" - + namespace: "composer-user-workloads" + service_account_name: "default" + config_file: "/home/airflow/composer_kube_config" image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. image: "{{ var.json.irs_990.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. env_vars: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/14eofinextract990pf.zip" SOURCE_FILE: "files/data.zip" @@ -74,37 +55,16 @@ dag: RENAME_MAPPINGS: >- {"EIN": "ein","TAX_PRD": "tax_prd","EOSTATUS": "eostatus","TAX_YR": "tax_yr","OPERATINGCD": "operatingcd","SUBCD": "subcd","FAIRMRKTVALAMT": "fairmrktvalamt","GRSCONTRGIFTS": "grscontrgifts","SCHEDBIND": "schedbind","INTRSTRVNUE": "intrstrvnue","DIVIDNDSAMT": "dividndsamt","GRSRENTS": "grsrents","GRSSLSPRAMT": "grsslspramt","COSTSOLD": "costsold","GRSPROFITBUS": "grsprofitbus","OTHERINCAMT": "otherincamt","TOTRCPTPERBKS": "totrcptperbks","COMPOFFICERS": "compofficers","PENSPLEMPLBENF": "pensplemplbenf","LEGALFEESAMT": "legalfeesamt","ACCOUNTINGFEES": "accountingfees","INTERESTAMT": "interestamt","DEPRECIATIONAMT": "depreciationamt","OCCUPANCYAMT": "occupancyamt","TRAVLCONFMTNGS": "travlconfmtngs","PRINTINGPUBL": "printingpubl","TOPRADMNEXPNSA": "topradmnexpnsa","CONTRPDPBKS": "contrpdpbks","TOTEXPNSPBKS": "totexpnspbks","EXCESSRCPTS": "excessrcpts","TOTRCPTNETINC": "totrcptnetinc","TOPRADMNEXPNSB": "topradmnexpnsb","TOTEXPNSNETINC": "totexpnsnetinc","NETINVSTINC": "netinvstinc","TRCPTADJNETINC": "trcptadjnetinc","TOTEXPNSADJNET": "totexpnsadjnet","ADJNETINC": "adjnetinc","TOPRADMNEXPNSD": "topradmnexpnsd","TOTEXPNSEXEMPT": "totexpnsexempt","OTHRCASHAMT": "othrcashamt","INVSTGOVTOBLIG": "invstgovtoblig","INVSTCORPSTK": "invstcorpstk","INVSTCORPBND": "invstcorpbnd","TOTINVSTSEC": "totinvstsec","MRTGLOANS": "mrtgloans","OTHRINVSTEND": "othrinvstend","OTHRASSETSEOY": "othrassetseoy","TOTASSETSEND": "totassetsend","MRTGNOTESPAY": "mrtgnotespay","OTHRLIABLTSEOY": "othrliabltseoy","TOTLIABEND": "totliabend","TFUNDNWORTH": "tfundnworth","FAIRMRKTVALEOY": "fairmrktvaleoy","TOTEXCAPGNLS": "totexcapgnls","TOTEXCAPGN": "totexcapgn","TOTEXCAPLS": "totexcapls","INVSTEXCISETX": "invstexcisetx","SEC4940NOTXCD": "sec4940notxcd","SEC4940REDTXCD": "sec4940redtxcd","SECT511TX": "sect511tx","SUBTITLEATX": "subtitleatx","TOTAXPYR": "totaxpyr","ESTTAXCR": "esttaxcr","TXWITHLDSRC": "txwithldsrc","TXPAIDF2758": "txpaidf2758","ERRONBKUPWTHLD": "erronbkupwthld","ESTPNLTY": "estpnlty","TAXDUE": "taxdue","OVERPAY": "overpay","CRELAMT": "crelamt","INFLEG": "infleg","ACTNOTPR": "actnotpr","CHGNPRVRPTCD": "chgnprvrptcd","FILEDF990TCD": "filedf990tcd","CONTRACTNCD": "contractncd","FURNISHCPYCD": "furnishcpycd","CLAIMSTATCD": "claimstatcd","CNTRBTRSTXYRCD": "cntrbtrstxyrcd","ACQDRINDRINTCD": "acqdrindrintcd","ORGCMPLYPUBCD": "orgcmplypubcd","FILEDLF1041IND": "filedlf1041ind","PROPEXCHCD": "propexchcd","BRWLNDMNYCD": "brwlndmnycd","FURNGOODSCD": "furngoodscd","PAIDCMPNCD": "paidcmpncd","TRANSFERCD": "transfercd","AGREMKPAYCD": "agremkpaycd","EXCEPTACTSIND": "exceptactsind","PRIORACTVCD": "prioractvcd","UNDISTRINCCD": "undistrinccd","APPLYPROVIND": "applyprovind","DIRINDIRINTCD": "dirindirintcd","EXCESSHLDCD": "excesshldcd","INVSTJEXMPTCD": "invstjexmptcd","PREVJEXMPTCD": "prevjexmptcd","PROPGNDACD": "propgndacd","IPUBELECTCD": "ipubelectcd","GRNTINDIVCD": "grntindivcd","NCHRTYGRNTCD": "nchrtygrntcd","NRELIGIOUSCD": "nreligiouscd","EXCPTRANSIND": "excptransind","RFPRSNLBNFTIND": "rfprsnlbnftind","PYPRSNLBNFTIND": "pyprsnlbnftind","TFAIRMRKTUNUSE": "tfairmrktunuse","VALNCHARITASSETS": "valncharitassets","CMPMININVSTRET": "cmpmininvstret","DISTRIBAMT": "distribamt","UNDISTRIBINCYR": "undistribincyr","ADJNETINCCOLA": "adjnetinccola","ADJNETINCCOLB": "adjnetinccolb","ADJNETINCCOLC": "adjnetinccolc","ADJNETINCCOLD": "adjnetinccold","ADJNETINCTOT": "adjnetinctot","QLFYDISTRIBA": "qlfydistriba","QLFYDISTRIBB": "qlfydistribb","QLFYDISTRIBC": "qlfydistribc","QLFYDISTRIBD": "qlfydistribd","QLFYDISTRIBTOT": "qlfydistribtot","VALASSETSCOLA": "valassetscola","VALASSETSCOLB": "valassetscolb","VALASSETSCOLC": "valassetscolc","VALASSETSCOLD": "valassetscold","VALASSETSTOT": "valassetstot","QLFYASSETA": "qlfyasseta","QLFYASSETB": "qlfyassetb","QLFYASSETC": "qlfyassetc","QLFYASSETD": "qlfyassetd","QLFYASSETTOT": "qlfyassettot","ENDWMNTSCOLA": "endwmntscola","ENDWMNTSCOLB": "endwmntscolb","ENDWMNTSCOLC": "endwmntscolc","ENDWMNTSCOLD": "endwmntscold","ENDWMNTSTOT": "endwmntstot","TOTSUPRTCOLA": "totsuprtcola","TOTSUPRTCOLB": "totsuprtcolb","TOTSUPRTCOLC": "totsuprtcolc","TOTSUPRTCOLD": "totsuprtcold","TOTSUPRTTOT": "totsuprttot","PUBSUPRTCOLA": "pubsuprtcola","PUBSUPRTCOLB": "pubsuprtcolb","PUBSUPRTCOLC": "pubsuprtcolc","PUBSUPRTCOLD": "pubsuprtcold","PUBSUPRTTOT": "pubsuprttot","GRSINVSTINCA": "grsinvstinca","GRSINVSTINCB": "grsinvstincb","GRSINVSTINCC": "grsinvstincc","GRSINVSTINCD": "grsinvstincd","GRSINVSTINCTOT": "grsinvstinctot","GRNTAPPRVFUT": "grntapprvfut","PROGSRVCACOLD": "progsrvcacold","PROGSRVCACOLE": "progsrvcacole","PROGSRVCBCOLD": "progsrvcbcold","PROGSRVCBCOLE": "progsrvcbcole","PROGSRVCCCOLD": "progsrvcccold","PROGSRVCCCOLE": "progsrvcccole","PROGSRVCDCOLD": "progsrvcdcold","PROGSRVCDCOLE": "progsrvcdcole","PROGSRVCECOLD": "progsrvcecold","PROGSRVCECOLE": "progsrvcecole","PROGSRVCFCOLD": "progsrvcfcold","PROGSRVCFCOLE": "progsrvcfcole","PROGSRVCGCOLD": "progsrvcgcold","PROGSRVCGCOLE": "progsrvcgcole","MEMBERSHPDUESD": "membershpduesd","MEMBERSHPDUESE": "membershpduese","INTONSVNGSD": "intonsvngsd","INTONSVNGSE": "intonsvngse","DVDNDSINTD": "dvdndsintd","DVDNDSINTE": "dvdndsinte","TRNSFRCASHCD": "trnsfrcashcd","TRNSOTHASSTSCD": "trnsothasstscd","SALESASSTSCD": "salesasstscd","PRCHSASSTSCD": "prchsasstscd","RENTLSFACLTSCD": "rentlsfacltscd","REIMBRSMNTSCD": "reimbrsmntscd","LOANSGUARCD": "loansguarcd","PERFSERVICESCD": "perfservicescd","SHARNGASSTSCD": "sharngasstscd"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" - args: task_id: "load_irs_990_pf_2014_to_bq" - - # The GCS bucket where the CSV file is located in. bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_pf_2014/data_output.csv"] source_format: "CSV" destination_project_dataset_table: "irs_990.irs_990_pf_2014" - - # Use this if your CSV file contains a header row skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - schema_fields: - name: "ein" type: "string" diff --git a/datasets/irs_990/pipelines/irs_990_pf_2015/irs_990_pf_2015_dag.py b/datasets/irs_990/pipelines/irs_990_pf_2015/irs_990_pf_2015_dag.py index 0a1d2198d..b7b0b4e62 100644 --- a/datasets/irs_990/pipelines/irs_990_pf_2015/irs_990_pf_2015_dag.py +++ b/datasets/irs_990/pipelines/irs_990_pf_2015/irs_990_pf_2015_dag.py @@ -38,8 +38,9 @@ task_id="irs_990_pf_2015_transform_csv", startup_timeout_seconds=600, name="irs_990_pf_2015", - service_account_name="datasets", - namespace="composer", + namespace="composer-user-workloads", + service_account_name="default", + config_file="/home/airflow/composer_kube_config", image_pull_policy="Always", image="{{ var.json.irs_990.container_registry.run_csv_transform_kub }}", env_vars={ @@ -52,7 +53,6 @@ "CSV_HEADERS": '["ein","elf","tax_prd","eostatus","tax_yr","operatingcd","subcd","fairmrktvalamt","grscontrgifts","schedbind","intrstrvnue","dividndsamt","grsrents","grsslspramt","costsold","grsprofitbus","otherincamt","totrcptperbks","compofficers","pensplemplbenf","legalfeesamt","accountingfees","interestamt","depreciationamt","occupancyamt","travlconfmtngs","printingpubl","topradmnexpnsa","contrpdpbks","totexpnspbks","excessrcpts","totrcptnetinc","topradmnexpnsb","totexpnsnetinc","netinvstinc","trcptadjnetinc","totexpnsadjnet","adjnetinc","topradmnexpnsd","totexpnsexempt","othrcashamt","invstgovtoblig","invstcorpstk","invstcorpbnd","totinvstsec","mrtgloans","othrinvstend","othrassetseoy","totassetsend","mrtgnotespay","othrliabltseoy","totliabend","tfundnworth","fairmrktvaleoy","totexcapgnls","totexcapgn","totexcapls","invstexcisetx","sec4940notxcd","sec4940redtxcd","sect511tx","subtitleatx","totaxpyr","esttaxcr","txwithldsrc","txpaidf2758","erronbkupwthld","estpnlty","taxdue","overpay","crelamt","infleg","actnotpr","chgnprvrptcd","filedf990tcd","contractncd","furnishcpycd","claimstatcd","cntrbtrstxyrcd","distribdafcd","orgcmplypubcd","filedlf1041ind","propexchcd","brwlndmnycd","furngoodscd","paidcmpncd","transfercd","agremkpaycd","exceptactsind","prioractvcd","undistrinccd","applyprovind","dirindirintcd","excesshldcd","invstjexmptcd","prevjexmptcd","propgndacd","ipubelectcd","grntindivcd","nchrtygrntcd","nreligiouscd","excptransind","rfprsnlbnftind","pyprsnlbnftind","tfairmrktunuse","valncharitassets","cmpmininvstret","distribamt","undistribincyr","adjnetinccola","adjnetinccolb","adjnetinccolc","adjnetinccold","adjnetinctot","qlfydistriba","qlfydistribb","qlfydistribc","qlfydistribd","qlfydistribtot","valassetscola","valassetscolb","valassetscolc","valassetscold","valassetstot","qlfyasseta","qlfyassetb","qlfyassetc","qlfyassetd","qlfyassettot","endwmntscola","endwmntscolb","endwmntscolc","endwmntscold","endwmntstot","totsuprtcola","totsuprtcolb","totsuprtcolc","totsuprtcold","totsuprttot","pubsuprtcola","pubsuprtcolb","pubsuprtcolc","pubsuprtcold","pubsuprttot","grsinvstinca","grsinvstincb","grsinvstincc","grsinvstincd","grsinvstinctot","grntapprvfut","progsrvcacold","progsrvcacole","progsrvcbcold","progsrvcbcole","progsrvcccold","progsrvcccole","progsrvcdcold","progsrvcdcole","progsrvcecold","progsrvcecole","progsrvcfcold","progsrvcfcole","progsrvcgcold","progsrvcgcole","membershpduesd","membershpduese","intonsvngsd","intonsvngse","dvdndsintd","dvdndsinte","trnsfrcashcd","trnsothasstscd","salesasstscd","prchsasstscd","rentlsfacltscd","reimbrsmntscd","loansguarcd","perfservicescd","sharngasstscd"]', "RENAME_MAPPINGS": '{"ELF": "elf","ELFCD": "elf","EIN": "ein","TAX_PRD": "tax_prd","EOSTATUS": "eostatus","TAX_YR": "tax_yr","OPERATINGCD": "operatingcd","SUBCD": "subcd","FAIRMRKTVALAMT": "fairmrktvalamt","GRSCONTRGIFTS": "grscontrgifts","SCHEDBIND": "schedbind","INTRSTRVNUE": "intrstrvnue","DIVIDNDSAMT": "dividndsamt","GRSRENTS": "grsrents","GRSSLSPRAMT": "grsslspramt","COSTSOLD": "costsold","GRSPROFITBUS": "grsprofitbus","OTHERINCAMT": "otherincamt","TOTRCPTPERBKS": "totrcptperbks","COMPOFFICERS": "compofficers","PENSPLEMPLBENF": "pensplemplbenf","LEGALFEESAMT": "legalfeesamt","ACCOUNTINGFEES": "accountingfees","INTERESTAMT": "interestamt","DEPRECIATIONAMT": "depreciationamt","OCCUPANCYAMT": "occupancyamt","TRAVLCONFMTNGS": "travlconfmtngs","PRINTINGPUBL": "printingpubl","TOPRADMNEXPNSA": "topradmnexpnsa","CONTRPDPBKS": "contrpdpbks","TOTEXPNSPBKS": "totexpnspbks","EXCESSRCPTS": "excessrcpts","TOTRCPTNETINC": "totrcptnetinc","TOPRADMNEXPNSB": "topradmnexpnsb","TOTEXPNSNETINC": "totexpnsnetinc","NETINVSTINC": "netinvstinc","TRCPTADJNETINC": "trcptadjnetinc","TOTEXPNSADJNET": "totexpnsadjnet","ADJNETINC": "adjnetinc","TOPRADMNEXPNSD": "topradmnexpnsd","TOTEXPNSEXEMPT": "totexpnsexempt","OTHRCASHAMT": "othrcashamt","INVSTGOVTOBLIG": "invstgovtoblig","INVSTCORPSTK": "invstcorpstk","INVSTCORPBND": "invstcorpbnd","TOTINVSTSEC": "totinvstsec","MRTGLOANS": "mrtgloans","OTHRINVSTEND": "othrinvstend","OTHRASSETSEOY": "othrassetseoy","TOTASSETSEND": "totassetsend","MRTGNOTESPAY": "mrtgnotespay","OTHRLIABLTSEOY": "othrliabltseoy","TOTLIABEND": "totliabend","TFUNDNWORTH": "tfundnworth","FAIRMRKTVALEOY": "fairmrktvaleoy","TOTEXCAPGNLS": "totexcapgnls","TOTEXCAPGN": "totexcapgn","TOTEXCAPLS": "totexcapls","INVSTEXCISETX": "invstexcisetx","SEC4940NOTXCD": "sec4940notxcd","SEC4940REDTXCD": "sec4940redtxcd","SECT511TX": "sect511tx","SUBTITLEATX": "subtitleatx","TOTAXPYR": "totaxpyr","ESTTAXCR": "esttaxcr","TXWITHLDSRC": "txwithldsrc","TXPAIDF2758": "txpaidf2758","ERRONBKUPWTHLD": "erronbkupwthld","ESTPNLTY": "estpnlty","TAXDUE": "taxdue","OVERPAY": "overpay","CRELAMT": "crelamt","INFLEG": "infleg","ACTNOTPR": "actnotpr","CHGNPRVRPTCD": "chgnprvrptcd","FILEDF990TCD": "filedf990tcd","CONTRACTNCD": "contractncd","FURNISHCPYCD": "furnishcpycd","CLAIMSTATCD": "claimstatcd","CNTRBTRSTXYRCD": "cntrbtrstxyrcd","DISTRIBDAFCD": "distribdafcd","ACQDRINDRINTCD": "distribdafcd","ORGCMPLYPUBCD": "orgcmplypubcd","FILEDLF1041IND": "filedlf1041ind","PROPEXCHCD": "propexchcd","BRWLNDMNYCD": "brwlndmnycd","FURNGOODSCD": "furngoodscd","PAIDCMPNCD": "paidcmpncd","TRANSFERCD": "transfercd","AGREMKPAYCD": "agremkpaycd","EXCEPTACTSIND": "exceptactsind","PRIORACTVCD": "prioractvcd","UNDISTRINCCD": "undistrinccd","APPLYPROVIND": "applyprovind","DIRINDIRINTCD": "dirindirintcd","EXCESSHLDCD": "excesshldcd","INVSTJEXMPTCD": "invstjexmptcd","PREVJEXMPTCD": "prevjexmptcd","PROPGNDACD": "propgndacd","IPUBELECTCD": "ipubelectcd","GRNTINDIVCD": "grntindivcd","NCHRTYGRNTCD": "nchrtygrntcd","NRELIGIOUSCD": "nreligiouscd","EXCPTRANSIND": "excptransind","RFPRSNLBNFTIND": "rfprsnlbnftind","PYPRSNLBNFTIND": "pyprsnlbnftind","TFAIRMRKTUNUSE": "tfairmrktunuse","VALNCHARITASSETS": "valncharitassets","CMPMININVSTRET": "cmpmininvstret","DISTRIBAMT": "distribamt","UNDISTRIBINCYR": "undistribincyr","ADJNETINCCOLA": "adjnetinccola","ADJNETINCCOLB": "adjnetinccolb","ADJNETINCCOLC": "adjnetinccolc","ADJNETINCCOLD": "adjnetinccold","ADJNETINCTOT": "adjnetinctot","QLFYDISTRIBA": "qlfydistriba","QLFYDISTRIBB": "qlfydistribb","QLFYDISTRIBC": "qlfydistribc","QLFYDISTRIBD": "qlfydistribd","QLFYDISTRIBTOT": "qlfydistribtot","VALASSETSCOLA": "valassetscola","VALASSETSCOLB": "valassetscolb","VALASSETSCOLC": "valassetscolc","VALASSETSCOLD": "valassetscold","VALASSETSTOT": "valassetstot","QLFYASSETA": "qlfyasseta","QLFYASSETB": "qlfyassetb","QLFYASSETC": "qlfyassetc","QLFYASSETD": "qlfyassetd","QLFYASSETTOT": "qlfyassettot","ENDWMNTSCOLA": "endwmntscola","ENDWMNTSCOLB": "endwmntscolb","ENDWMNTSCOLC": "endwmntscolc","ENDWMNTSCOLD": "endwmntscold","ENDWMNTSTOT": "endwmntstot","TOTSUPRTCOLA": "totsuprtcola","TOTSUPRTCOLB": "totsuprtcolb","TOTSUPRTCOLC": "totsuprtcolc","TOTSUPRTCOLD": "totsuprtcold","TOTSUPRTTOT": "totsuprttot","PUBSUPRTCOLA": "pubsuprtcola","PUBSUPRTCOLB": "pubsuprtcolb","PUBSUPRTCOLC": "pubsuprtcolc","PUBSUPRTCOLD": "pubsuprtcold","PUBSUPRTTOT": "pubsuprttot","GRSINVSTINCA": "grsinvstinca","GRSINVSTINCB": "grsinvstincb","GRSINVSTINCC": "grsinvstincc","GRSINVSTINCD": "grsinvstincd","GRSINVSTINCTOT": "grsinvstinctot","GRNTAPPRVFUT": "grntapprvfut","PROGSRVCACOLD": "progsrvcacold","PROGSRVCACOLE": "progsrvcacole","PROGSRVCBCOLD": "progsrvcbcold","PROGSRVCBCOLE": "progsrvcbcole","PROGSRVCCCOLD": "progsrvcccold","PROGSRVCCCOLE": "progsrvcccole","PROGSRVCDCOLD": "progsrvcdcold","PROGSRVCDCOLE": "progsrvcdcole","PROGSRVCECOLD": "progsrvcecold","PROGSRVCECOLE": "progsrvcecole","PROGSRVCFCOLD": "progsrvcfcold","PROGSRVCFCOLE": "progsrvcfcole","PROGSRVCGCOLD": "progsrvcgcold","PROGSRVCGCOLE": "progsrvcgcole","MEMBERSHPDUESD": "membershpduesd","MEMBERSHPDUESE": "membershpduese","INTONSVNGSD": "intonsvngsd","INTONSVNGSE": "intonsvngse","DVDNDSINTD": "dvdndsintd","DVDNDSINTE": "dvdndsinte","TRNSFRCASHCD": "trnsfrcashcd","TRNSOTHASSTSCD": "trnsothasstscd","SALESASSTSCD": "salesasstscd","PRCHSASSTSCD": "prchsasstscd","RENTLSFACLTSCD": "rentlsfacltscd","REIMBRSMNTSCD": "reimbrsmntscd","LOANSGUARCD": "loansguarcd","PERFSERVICESCD": "perfservicescd","SHARNGASSTSCD": "sharngasstscd"}', }, - resources={"request_memory": "2G", "request_cpu": "1"}, ) # Task to load CSV data to a BigQuery table diff --git a/datasets/irs_990/pipelines/irs_990_pf_2015/pipeline.yaml b/datasets/irs_990/pipelines/irs_990_pf_2015/pipeline.yaml index 4497523dc..e7318bdc7 100644 --- a/datasets/irs_990/pipelines/irs_990_pf_2015/pipeline.yaml +++ b/datasets/irs_990/pipelines/irs_990_pf_2015/pipeline.yaml @@ -14,12 +14,8 @@ --- resources: - - type: bigquery_table - # Required Properties: table_id: irs_990_pf_2015 - - # Description of the table description: "IRS 990 PF 2015 dataset" dag: @@ -28,8 +24,6 @@ dag: dag_id: irs_990_pf_2015 default_args: owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 @@ -39,29 +33,16 @@ dag: tasks: - operator: "KubernetesPodOperator" - - # Task description description: "Run CSV transform within kubernetes pod" - args: - task_id: "irs_990_pf_2015_transform_csv" - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id name: "irs_990_pf_2015" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - service_account_name: "datasets" - namespace: "composer" - + namespace: "composer-user-workloads" + service_account_name: "default" + config_file: "/home/airflow/composer_kube_config" image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. image: "{{ var.json.irs_990.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. env_vars: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/15eofinextract990pf.dat" SOURCE_FILE: "files/data.dat" @@ -74,37 +55,16 @@ dag: RENAME_MAPPINGS: >- {"ELF": "elf","ELFCD": "elf","EIN": "ein","TAX_PRD": "tax_prd","EOSTATUS": "eostatus","TAX_YR": "tax_yr","OPERATINGCD": "operatingcd","SUBCD": "subcd","FAIRMRKTVALAMT": "fairmrktvalamt","GRSCONTRGIFTS": "grscontrgifts","SCHEDBIND": "schedbind","INTRSTRVNUE": "intrstrvnue","DIVIDNDSAMT": "dividndsamt","GRSRENTS": "grsrents","GRSSLSPRAMT": "grsslspramt","COSTSOLD": "costsold","GRSPROFITBUS": "grsprofitbus","OTHERINCAMT": "otherincamt","TOTRCPTPERBKS": "totrcptperbks","COMPOFFICERS": "compofficers","PENSPLEMPLBENF": "pensplemplbenf","LEGALFEESAMT": "legalfeesamt","ACCOUNTINGFEES": "accountingfees","INTERESTAMT": "interestamt","DEPRECIATIONAMT": "depreciationamt","OCCUPANCYAMT": "occupancyamt","TRAVLCONFMTNGS": "travlconfmtngs","PRINTINGPUBL": "printingpubl","TOPRADMNEXPNSA": "topradmnexpnsa","CONTRPDPBKS": "contrpdpbks","TOTEXPNSPBKS": "totexpnspbks","EXCESSRCPTS": "excessrcpts","TOTRCPTNETINC": "totrcptnetinc","TOPRADMNEXPNSB": "topradmnexpnsb","TOTEXPNSNETINC": "totexpnsnetinc","NETINVSTINC": "netinvstinc","TRCPTADJNETINC": "trcptadjnetinc","TOTEXPNSADJNET": "totexpnsadjnet","ADJNETINC": "adjnetinc","TOPRADMNEXPNSD": "topradmnexpnsd","TOTEXPNSEXEMPT": "totexpnsexempt","OTHRCASHAMT": "othrcashamt","INVSTGOVTOBLIG": "invstgovtoblig","INVSTCORPSTK": "invstcorpstk","INVSTCORPBND": "invstcorpbnd","TOTINVSTSEC": "totinvstsec","MRTGLOANS": "mrtgloans","OTHRINVSTEND": "othrinvstend","OTHRASSETSEOY": "othrassetseoy","TOTASSETSEND": "totassetsend","MRTGNOTESPAY": "mrtgnotespay","OTHRLIABLTSEOY": "othrliabltseoy","TOTLIABEND": "totliabend","TFUNDNWORTH": "tfundnworth","FAIRMRKTVALEOY": "fairmrktvaleoy","TOTEXCAPGNLS": "totexcapgnls","TOTEXCAPGN": "totexcapgn","TOTEXCAPLS": "totexcapls","INVSTEXCISETX": "invstexcisetx","SEC4940NOTXCD": "sec4940notxcd","SEC4940REDTXCD": "sec4940redtxcd","SECT511TX": "sect511tx","SUBTITLEATX": "subtitleatx","TOTAXPYR": "totaxpyr","ESTTAXCR": "esttaxcr","TXWITHLDSRC": "txwithldsrc","TXPAIDF2758": "txpaidf2758","ERRONBKUPWTHLD": "erronbkupwthld","ESTPNLTY": "estpnlty","TAXDUE": "taxdue","OVERPAY": "overpay","CRELAMT": "crelamt","INFLEG": "infleg","ACTNOTPR": "actnotpr","CHGNPRVRPTCD": "chgnprvrptcd","FILEDF990TCD": "filedf990tcd","CONTRACTNCD": "contractncd","FURNISHCPYCD": "furnishcpycd","CLAIMSTATCD": "claimstatcd","CNTRBTRSTXYRCD": "cntrbtrstxyrcd","DISTRIBDAFCD": "distribdafcd","ACQDRINDRINTCD": "distribdafcd","ORGCMPLYPUBCD": "orgcmplypubcd","FILEDLF1041IND": "filedlf1041ind","PROPEXCHCD": "propexchcd","BRWLNDMNYCD": "brwlndmnycd","FURNGOODSCD": "furngoodscd","PAIDCMPNCD": "paidcmpncd","TRANSFERCD": "transfercd","AGREMKPAYCD": "agremkpaycd","EXCEPTACTSIND": "exceptactsind","PRIORACTVCD": "prioractvcd","UNDISTRINCCD": "undistrinccd","APPLYPROVIND": "applyprovind","DIRINDIRINTCD": "dirindirintcd","EXCESSHLDCD": "excesshldcd","INVSTJEXMPTCD": "invstjexmptcd","PREVJEXMPTCD": "prevjexmptcd","PROPGNDACD": "propgndacd","IPUBELECTCD": "ipubelectcd","GRNTINDIVCD": "grntindivcd","NCHRTYGRNTCD": "nchrtygrntcd","NRELIGIOUSCD": "nreligiouscd","EXCPTRANSIND": "excptransind","RFPRSNLBNFTIND": "rfprsnlbnftind","PYPRSNLBNFTIND": "pyprsnlbnftind","TFAIRMRKTUNUSE": "tfairmrktunuse","VALNCHARITASSETS": "valncharitassets","CMPMININVSTRET": "cmpmininvstret","DISTRIBAMT": "distribamt","UNDISTRIBINCYR": "undistribincyr","ADJNETINCCOLA": "adjnetinccola","ADJNETINCCOLB": "adjnetinccolb","ADJNETINCCOLC": "adjnetinccolc","ADJNETINCCOLD": "adjnetinccold","ADJNETINCTOT": "adjnetinctot","QLFYDISTRIBA": "qlfydistriba","QLFYDISTRIBB": "qlfydistribb","QLFYDISTRIBC": "qlfydistribc","QLFYDISTRIBD": "qlfydistribd","QLFYDISTRIBTOT": "qlfydistribtot","VALASSETSCOLA": "valassetscola","VALASSETSCOLB": "valassetscolb","VALASSETSCOLC": "valassetscolc","VALASSETSCOLD": "valassetscold","VALASSETSTOT": "valassetstot","QLFYASSETA": "qlfyasseta","QLFYASSETB": "qlfyassetb","QLFYASSETC": "qlfyassetc","QLFYASSETD": "qlfyassetd","QLFYASSETTOT": "qlfyassettot","ENDWMNTSCOLA": "endwmntscola","ENDWMNTSCOLB": "endwmntscolb","ENDWMNTSCOLC": "endwmntscolc","ENDWMNTSCOLD": "endwmntscold","ENDWMNTSTOT": "endwmntstot","TOTSUPRTCOLA": "totsuprtcola","TOTSUPRTCOLB": "totsuprtcolb","TOTSUPRTCOLC": "totsuprtcolc","TOTSUPRTCOLD": "totsuprtcold","TOTSUPRTTOT": "totsuprttot","PUBSUPRTCOLA": "pubsuprtcola","PUBSUPRTCOLB": "pubsuprtcolb","PUBSUPRTCOLC": "pubsuprtcolc","PUBSUPRTCOLD": "pubsuprtcold","PUBSUPRTTOT": "pubsuprttot","GRSINVSTINCA": "grsinvstinca","GRSINVSTINCB": "grsinvstincb","GRSINVSTINCC": "grsinvstincc","GRSINVSTINCD": "grsinvstincd","GRSINVSTINCTOT": "grsinvstinctot","GRNTAPPRVFUT": "grntapprvfut","PROGSRVCACOLD": "progsrvcacold","PROGSRVCACOLE": "progsrvcacole","PROGSRVCBCOLD": "progsrvcbcold","PROGSRVCBCOLE": "progsrvcbcole","PROGSRVCCCOLD": "progsrvcccold","PROGSRVCCCOLE": "progsrvcccole","PROGSRVCDCOLD": "progsrvcdcold","PROGSRVCDCOLE": "progsrvcdcole","PROGSRVCECOLD": "progsrvcecold","PROGSRVCECOLE": "progsrvcecole","PROGSRVCFCOLD": "progsrvcfcold","PROGSRVCFCOLE": "progsrvcfcole","PROGSRVCGCOLD": "progsrvcgcold","PROGSRVCGCOLE": "progsrvcgcole","MEMBERSHPDUESD": "membershpduesd","MEMBERSHPDUESE": "membershpduese","INTONSVNGSD": "intonsvngsd","INTONSVNGSE": "intonsvngse","DVDNDSINTD": "dvdndsintd","DVDNDSINTE": "dvdndsinte","TRNSFRCASHCD": "trnsfrcashcd","TRNSOTHASSTSCD": "trnsothasstscd","SALESASSTSCD": "salesasstscd","PRCHSASSTSCD": "prchsasstscd","RENTLSFACLTSCD": "rentlsfacltscd","REIMBRSMNTSCD": "reimbrsmntscd","LOANSGUARCD": "loansguarcd","PERFSERVICESCD": "perfservicescd","SHARNGASSTSCD": "sharngasstscd"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" - args: task_id: "load_irs_990_pf_2015_to_bq" - - # The GCS bucket where the CSV file is located in. bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_pf_2015/data_output.csv"] source_format: "CSV" destination_project_dataset_table: "irs_990.irs_990_pf_2015" - - # Use this if your CSV file contains a header row skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - schema_fields: - name: "ein" type: "string" diff --git a/datasets/irs_990/pipelines/irs_990_pf_2016/irs_990_pf_2016_dag.py b/datasets/irs_990/pipelines/irs_990_pf_2016/irs_990_pf_2016_dag.py index 075635980..93f1054a7 100644 --- a/datasets/irs_990/pipelines/irs_990_pf_2016/irs_990_pf_2016_dag.py +++ b/datasets/irs_990/pipelines/irs_990_pf_2016/irs_990_pf_2016_dag.py @@ -38,8 +38,9 @@ task_id="irs_990_pf_2016_transform_csv", startup_timeout_seconds=600, name="irs_990_pf_2016", - service_account_name="datasets", - namespace="composer", + namespace="composer-user-workloads", + service_account_name="default", + config_file="/home/airflow/composer_kube_config", image_pull_policy="Always", image="{{ var.json.irs_990.container_registry.run_csv_transform_kub }}", env_vars={ @@ -52,7 +53,6 @@ "CSV_HEADERS": '["ein","elf","tax_prd","eostatus","tax_yr","operatingcd","subcd","fairmrktvalamt","grscontrgifts","schedbind","intrstrvnue","dividndsamt","grsrents","grsslspramt","costsold","grsprofitbus","otherincamt","totrcptperbks","compofficers","pensplemplbenf","legalfeesamt","accountingfees","interestamt","depreciationamt","occupancyamt","travlconfmtngs","printingpubl","topradmnexpnsa","contrpdpbks","totexpnspbks","excessrcpts","totrcptnetinc","topradmnexpnsb","totexpnsnetinc","netinvstinc","trcptadjnetinc","totexpnsadjnet","adjnetinc","topradmnexpnsd","totexpnsexempt","othrcashamt","invstgovtoblig","invstcorpstk","invstcorpbnd","totinvstsec","mrtgloans","othrinvstend","othrassetseoy","totassetsend","mrtgnotespay","othrliabltseoy","totliabend","tfundnworth","fairmrktvaleoy","totexcapgnls","totexcapgn","totexcapls","invstexcisetx","sec4940notxcd","sec4940redtxcd","sect511tx","subtitleatx","totaxpyr","esttaxcr","txwithldsrc","txpaidf2758","erronbkupwthld","estpnlty","taxdue","overpay","crelamt","infleg","actnotpr","chgnprvrptcd","filedf990tcd","contractncd","furnishcpycd","claimstatcd","cntrbtrstxyrcd","distribdafcd","orgcmplypubcd","filedlf1041ind","propexchcd","brwlndmnycd","furngoodscd","paidcmpncd","transfercd","agremkpaycd","exceptactsind","prioractvcd","undistrinccd","applyprovind","dirindirintcd","excesshldcd","invstjexmptcd","prevjexmptcd","propgndacd","ipubelectcd","grntindivcd","nchrtygrntcd","nreligiouscd","excptransind","rfprsnlbnftind","pyprsnlbnftind","tfairmrktunuse","valncharitassets","cmpmininvstret","distribamt","undistribincyr","adjnetinccola","adjnetinccolb","adjnetinccolc","adjnetinccold","adjnetinctot","qlfydistriba","qlfydistribb","qlfydistribc","qlfydistribd","qlfydistribtot","valassetscola","valassetscolb","valassetscolc","valassetscold","valassetstot","qlfyasseta","qlfyassetb","qlfyassetc","qlfyassetd","qlfyassettot","endwmntscola","endwmntscolb","endwmntscolc","endwmntscold","endwmntstot","totsuprtcola","totsuprtcolb","totsuprtcolc","totsuprtcold","totsuprttot","pubsuprtcola","pubsuprtcolb","pubsuprtcolc","pubsuprtcold","pubsuprttot","grsinvstinca","grsinvstincb","grsinvstincc","grsinvstincd","grsinvstinctot","grntapprvfut","progsrvcacold","progsrvcacole","progsrvcbcold","progsrvcbcole","progsrvcccold","progsrvcccole","progsrvcdcold","progsrvcdcole","progsrvcecold","progsrvcecole","progsrvcfcold","progsrvcfcole","progsrvcgcold","progsrvcgcole","membershpduesd","membershpduese","intonsvngsd","intonsvngse","dvdndsintd","dvdndsinte","trnsfrcashcd","trnsothasstscd","salesasstscd","prchsasstscd","rentlsfacltscd","reimbrsmntscd","loansguarcd","perfservicescd","sharngasstscd"]', "RENAME_MAPPINGS": '{"ELF": "elf","ELFCD": "elf","EIN": "ein","TAX_PRD": "tax_prd","EOSTATUS": "eostatus","TAX_YR": "tax_yr","OPERATINGCD": "operatingcd","SUBCD": "subcd","FAIRMRKTVALAMT": "fairmrktvalamt","GRSCONTRGIFTS": "grscontrgifts","SCHEDBIND": "schedbind","INTRSTRVNUE": "intrstrvnue","DIVIDNDSAMT": "dividndsamt","GRSRENTS": "grsrents","GRSSLSPRAMT": "grsslspramt","COSTSOLD": "costsold","GRSPROFITBUS": "grsprofitbus","OTHERINCAMT": "otherincamt","TOTRCPTPERBKS": "totrcptperbks","COMPOFFICERS": "compofficers","PENSPLEMPLBENF": "pensplemplbenf","LEGALFEESAMT": "legalfeesamt","ACCOUNTINGFEES": "accountingfees","INTERESTAMT": "interestamt","DEPRECIATIONAMT": "depreciationamt","OCCUPANCYAMT": "occupancyamt","TRAVLCONFMTNGS": "travlconfmtngs","PRINTINGPUBL": "printingpubl","TOPRADMNEXPNSA": "topradmnexpnsa","CONTRPDPBKS": "contrpdpbks","TOTEXPNSPBKS": "totexpnspbks","EXCESSRCPTS": "excessrcpts","TOTRCPTNETINC": "totrcptnetinc","TOPRADMNEXPNSB": "topradmnexpnsb","TOTEXPNSNETINC": "totexpnsnetinc","NETINVSTINC": "netinvstinc","TRCPTADJNETINC": "trcptadjnetinc","TOTEXPNSADJNET": "totexpnsadjnet","ADJNETINC": "adjnetinc","TOPRADMNEXPNSD": "topradmnexpnsd","TOTEXPNSEXEMPT": "totexpnsexempt","OTHRCASHAMT": "othrcashamt","INVSTGOVTOBLIG": "invstgovtoblig","INVSTCORPSTK": "invstcorpstk","INVSTCORPBND": "invstcorpbnd","TOTINVSTSEC": "totinvstsec","MRTGLOANS": "mrtgloans","OTHRINVSTEND": "othrinvstend","OTHRASSETSEOY": "othrassetseoy","TOTASSETSEND": "totassetsend","MRTGNOTESPAY": "mrtgnotespay","OTHRLIABLTSEOY": "othrliabltseoy","TOTLIABEND": "totliabend","TFUNDNWORTH": "tfundnworth","FAIRMRKTVALEOY": "fairmrktvaleoy","TOTEXCAPGNLS": "totexcapgnls","TOTEXCAPGN": "totexcapgn","TOTEXCAPLS": "totexcapls","INVSTEXCISETX": "invstexcisetx","SEC4940NOTXCD": "sec4940notxcd","SEC4940REDTXCD": "sec4940redtxcd","SECT511TX": "sect511tx","SUBTITLEATX": "subtitleatx","TOTAXPYR": "totaxpyr","ESTTAXCR": "esttaxcr","TXWITHLDSRC": "txwithldsrc","TXPAIDF2758": "txpaidf2758","ERRONBKUPWTHLD": "erronbkupwthld","ESTPNLTY": "estpnlty","TAXDUE": "taxdue","OVERPAY": "overpay","CRELAMT": "crelamt","INFLEG": "infleg","ACTNOTPR": "actnotpr","CHGNPRVRPTCD": "chgnprvrptcd","FILEDF990TCD": "filedf990tcd","CONTRACTNCD": "contractncd","FURNISHCPYCD": "furnishcpycd","CLAIMSTATCD": "claimstatcd","CNTRBTRSTXYRCD": "cntrbtrstxyrcd","DISTRIBDAFCD": "distribdafcd","ACQDRINDRINTCD": "distribdafcd","ORGCMPLYPUBCD": "orgcmplypubcd","FILEDLF1041IND": "filedlf1041ind","PROPEXCHCD": "propexchcd","BRWLNDMNYCD": "brwlndmnycd","FURNGOODSCD": "furngoodscd","PAIDCMPNCD": "paidcmpncd","TRANSFERCD": "transfercd","AGREMKPAYCD": "agremkpaycd","EXCEPTACTSIND": "exceptactsind","PRIORACTVCD": "prioractvcd","UNDISTRINCCD": "undistrinccd","APPLYPROVIND": "applyprovind","DIRINDIRINTCD": "dirindirintcd","EXCESSHLDCD": "excesshldcd","INVSTJEXMPTCD": "invstjexmptcd","PREVJEXMPTCD": "prevjexmptcd","PROPGNDACD": "propgndacd","IPUBELECTCD": "ipubelectcd","GRNTINDIVCD": "grntindivcd","NCHRTYGRNTCD": "nchrtygrntcd","NRELIGIOUSCD": "nreligiouscd","EXCPTRANSIND": "excptransind","RFPRSNLBNFTIND": "rfprsnlbnftind","PYPRSNLBNFTIND": "pyprsnlbnftind","TFAIRMRKTUNUSE": "tfairmrktunuse","VALNCHARITASSETS": "valncharitassets","CMPMININVSTRET": "cmpmininvstret","DISTRIBAMT": "distribamt","UNDISTRIBINCYR": "undistribincyr","ADJNETINCCOLA": "adjnetinccola","ADJNETINCCOLB": "adjnetinccolb","ADJNETINCCOLC": "adjnetinccolc","ADJNETINCCOLD": "adjnetinccold","ADJNETINCTOT": "adjnetinctot","QLFYDISTRIBA": "qlfydistriba","QLFYDISTRIBB": "qlfydistribb","QLFYDISTRIBC": "qlfydistribc","QLFYDISTRIBD": "qlfydistribd","QLFYDISTRIBTOT": "qlfydistribtot","VALASSETSCOLA": "valassetscola","VALASSETSCOLB": "valassetscolb","VALASSETSCOLC": "valassetscolc","VALASSETSCOLD": "valassetscold","VALASSETSTOT": "valassetstot","QLFYASSETA": "qlfyasseta","QLFYASSETB": "qlfyassetb","QLFYASSETC": "qlfyassetc","QLFYASSETD": "qlfyassetd","QLFYASSETTOT": "qlfyassettot","ENDWMNTSCOLA": "endwmntscola","ENDWMNTSCOLB": "endwmntscolb","ENDWMNTSCOLC": "endwmntscolc","ENDWMNTSCOLD": "endwmntscold","ENDWMNTSTOT": "endwmntstot","TOTSUPRTCOLA": "totsuprtcola","TOTSUPRTCOLB": "totsuprtcolb","TOTSUPRTCOLC": "totsuprtcolc","TOTSUPRTCOLD": "totsuprtcold","TOTSUPRTTOT": "totsuprttot","PUBSUPRTCOLA": "pubsuprtcola","PUBSUPRTCOLB": "pubsuprtcolb","PUBSUPRTCOLC": "pubsuprtcolc","PUBSUPRTCOLD": "pubsuprtcold","PUBSUPRTTOT": "pubsuprttot","GRSINVSTINCA": "grsinvstinca","GRSINVSTINCB": "grsinvstincb","GRSINVSTINCC": "grsinvstincc","GRSINVSTINCD": "grsinvstincd","GRSINVSTINCTOT": "grsinvstinctot","GRNTAPPRVFUT": "grntapprvfut","PROGSRVCACOLD": "progsrvcacold","PROGSRVCACOLE": "progsrvcacole","PROGSRVCBCOLD": "progsrvcbcold","PROGSRVCBCOLE": "progsrvcbcole","PROGSRVCCCOLD": "progsrvcccold","PROGSRVCCCOLE": "progsrvcccole","PROGSRVCDCOLD": "progsrvcdcold","PROGSRVCDCOLE": "progsrvcdcole","PROGSRVCECOLD": "progsrvcecold","PROGSRVCECOLE": "progsrvcecole","PROGSRVCFCOLD": "progsrvcfcold","PROGSRVCFCOLE": "progsrvcfcole","PROGSRVCGCOLD": "progsrvcgcold","PROGSRVCGCOLE": "progsrvcgcole","MEMBERSHPDUESD": "membershpduesd","MEMBERSHPDUESE": "membershpduese","INTONSVNGSD": "intonsvngsd","INTONSVNGSE": "intonsvngse","DVDNDSINTD": "dvdndsintd","DVDNDSINTE": "dvdndsinte","TRNSFRCASHCD": "trnsfrcashcd","TRNSOTHASSTSCD": "trnsothasstscd","SALESASSTSCD": "salesasstscd","PRCHSASSTSCD": "prchsasstscd","RENTLSFACLTSCD": "rentlsfacltscd","REIMBRSMNTSCD": "reimbrsmntscd","LOANSGUARCD": "loansguarcd","PERFSERVICESCD": "perfservicescd","SHARNGASSTSCD": "sharngasstscd"}', }, - resources={"request_memory": "2G", "request_cpu": "1"}, ) # Task to load CSV data to a BigQuery table diff --git a/datasets/irs_990/pipelines/irs_990_pf_2016/pipeline.yaml b/datasets/irs_990/pipelines/irs_990_pf_2016/pipeline.yaml index 57fac6d8b..fa36a20fe 100644 --- a/datasets/irs_990/pipelines/irs_990_pf_2016/pipeline.yaml +++ b/datasets/irs_990/pipelines/irs_990_pf_2016/pipeline.yaml @@ -14,12 +14,8 @@ --- resources: - - type: bigquery_table - # Required Properties: table_id: irs_990_pf_2016 - - # Description of the table description: "IRS 990 PF 2016 dataset" dag: @@ -28,8 +24,6 @@ dag: dag_id: irs_990_pf_2016 default_args: owner: "Google" - - # When set to True, keeps a task from getting triggered if the previous schedule for the task hasn’t succeeded depends_on_past: False start_date: '2021-03-01' max_active_runs: 1 @@ -39,29 +33,16 @@ dag: tasks: - operator: "KubernetesPodOperator" - - # Task description description: "Run CSV transform within kubernetes pod" - args: - task_id: "irs_990_pf_2016_transform_csv" - startup_timeout_seconds: 600 - - # The name of the pod in which the task will run. This will be used (plus a random suffix) to generate a pod id name: "irs_990_pf_2016" - - # The namespace to run within Kubernetes. Always set its value to "default" because we follow the guideline that KubernetesPodOperator will only be used for very light workloads, i.e. use the Cloud Composer environment's resources without starving other pipelines. - service_account_name: "datasets" - namespace: "composer" - + namespace: "composer-user-workloads" + service_account_name: "default" + config_file: "/home/airflow/composer_kube_config" image_pull_policy: "Always" - - # Docker images will be built and pushed to GCR by default whenever the `scripts/generate_dag.py` is run. To skip building and pushing images, use the optional `--skip-builds` flag. image: "{{ var.json.irs_990.container_registry.run_csv_transform_kub }}" - - # Set the environment variables you need initialized in the container. Use these as input variables for the script your container is expected to perform. env_vars: SOURCE_URL: "https://www.irs.gov/pub/irs-soi/16eofinextract990pf.dat" SOURCE_FILE: "files/data.dat" @@ -73,37 +54,17 @@ dag: ["ein","elf","tax_prd","eostatus","tax_yr","operatingcd","subcd","fairmrktvalamt","grscontrgifts","schedbind","intrstrvnue","dividndsamt","grsrents","grsslspramt","costsold","grsprofitbus","otherincamt","totrcptperbks","compofficers","pensplemplbenf","legalfeesamt","accountingfees","interestamt","depreciationamt","occupancyamt","travlconfmtngs","printingpubl","topradmnexpnsa","contrpdpbks","totexpnspbks","excessrcpts","totrcptnetinc","topradmnexpnsb","totexpnsnetinc","netinvstinc","trcptadjnetinc","totexpnsadjnet","adjnetinc","topradmnexpnsd","totexpnsexempt","othrcashamt","invstgovtoblig","invstcorpstk","invstcorpbnd","totinvstsec","mrtgloans","othrinvstend","othrassetseoy","totassetsend","mrtgnotespay","othrliabltseoy","totliabend","tfundnworth","fairmrktvaleoy","totexcapgnls","totexcapgn","totexcapls","invstexcisetx","sec4940notxcd","sec4940redtxcd","sect511tx","subtitleatx","totaxpyr","esttaxcr","txwithldsrc","txpaidf2758","erronbkupwthld","estpnlty","taxdue","overpay","crelamt","infleg","actnotpr","chgnprvrptcd","filedf990tcd","contractncd","furnishcpycd","claimstatcd","cntrbtrstxyrcd","distribdafcd","orgcmplypubcd","filedlf1041ind","propexchcd","brwlndmnycd","furngoodscd","paidcmpncd","transfercd","agremkpaycd","exceptactsind","prioractvcd","undistrinccd","applyprovind","dirindirintcd","excesshldcd","invstjexmptcd","prevjexmptcd","propgndacd","ipubelectcd","grntindivcd","nchrtygrntcd","nreligiouscd","excptransind","rfprsnlbnftind","pyprsnlbnftind","tfairmrktunuse","valncharitassets","cmpmininvstret","distribamt","undistribincyr","adjnetinccola","adjnetinccolb","adjnetinccolc","adjnetinccold","adjnetinctot","qlfydistriba","qlfydistribb","qlfydistribc","qlfydistribd","qlfydistribtot","valassetscola","valassetscolb","valassetscolc","valassetscold","valassetstot","qlfyasseta","qlfyassetb","qlfyassetc","qlfyassetd","qlfyassettot","endwmntscola","endwmntscolb","endwmntscolc","endwmntscold","endwmntstot","totsuprtcola","totsuprtcolb","totsuprtcolc","totsuprtcold","totsuprttot","pubsuprtcola","pubsuprtcolb","pubsuprtcolc","pubsuprtcold","pubsuprttot","grsinvstinca","grsinvstincb","grsinvstincc","grsinvstincd","grsinvstinctot","grntapprvfut","progsrvcacold","progsrvcacole","progsrvcbcold","progsrvcbcole","progsrvcccold","progsrvcccole","progsrvcdcold","progsrvcdcole","progsrvcecold","progsrvcecole","progsrvcfcold","progsrvcfcole","progsrvcgcold","progsrvcgcole","membershpduesd","membershpduese","intonsvngsd","intonsvngse","dvdndsintd","dvdndsinte","trnsfrcashcd","trnsothasstscd","salesasstscd","prchsasstscd","rentlsfacltscd","reimbrsmntscd","loansguarcd","perfservicescd","sharngasstscd"] RENAME_MAPPINGS: >- {"ELF": "elf","ELFCD": "elf","EIN": "ein","TAX_PRD": "tax_prd","EOSTATUS": "eostatus","TAX_YR": "tax_yr","OPERATINGCD": "operatingcd","SUBCD": "subcd","FAIRMRKTVALAMT": "fairmrktvalamt","GRSCONTRGIFTS": "grscontrgifts","SCHEDBIND": "schedbind","INTRSTRVNUE": "intrstrvnue","DIVIDNDSAMT": "dividndsamt","GRSRENTS": "grsrents","GRSSLSPRAMT": "grsslspramt","COSTSOLD": "costsold","GRSPROFITBUS": "grsprofitbus","OTHERINCAMT": "otherincamt","TOTRCPTPERBKS": "totrcptperbks","COMPOFFICERS": "compofficers","PENSPLEMPLBENF": "pensplemplbenf","LEGALFEESAMT": "legalfeesamt","ACCOUNTINGFEES": "accountingfees","INTERESTAMT": "interestamt","DEPRECIATIONAMT": "depreciationamt","OCCUPANCYAMT": "occupancyamt","TRAVLCONFMTNGS": "travlconfmtngs","PRINTINGPUBL": "printingpubl","TOPRADMNEXPNSA": "topradmnexpnsa","CONTRPDPBKS": "contrpdpbks","TOTEXPNSPBKS": "totexpnspbks","EXCESSRCPTS": "excessrcpts","TOTRCPTNETINC": "totrcptnetinc","TOPRADMNEXPNSB": "topradmnexpnsb","TOTEXPNSNETINC": "totexpnsnetinc","NETINVSTINC": "netinvstinc","TRCPTADJNETINC": "trcptadjnetinc","TOTEXPNSADJNET": "totexpnsadjnet","ADJNETINC": "adjnetinc","TOPRADMNEXPNSD": "topradmnexpnsd","TOTEXPNSEXEMPT": "totexpnsexempt","OTHRCASHAMT": "othrcashamt","INVSTGOVTOBLIG": "invstgovtoblig","INVSTCORPSTK": "invstcorpstk","INVSTCORPBND": "invstcorpbnd","TOTINVSTSEC": "totinvstsec","MRTGLOANS": "mrtgloans","OTHRINVSTEND": "othrinvstend","OTHRASSETSEOY": "othrassetseoy","TOTASSETSEND": "totassetsend","MRTGNOTESPAY": "mrtgnotespay","OTHRLIABLTSEOY": "othrliabltseoy","TOTLIABEND": "totliabend","TFUNDNWORTH": "tfundnworth","FAIRMRKTVALEOY": "fairmrktvaleoy","TOTEXCAPGNLS": "totexcapgnls","TOTEXCAPGN": "totexcapgn","TOTEXCAPLS": "totexcapls","INVSTEXCISETX": "invstexcisetx","SEC4940NOTXCD": "sec4940notxcd","SEC4940REDTXCD": "sec4940redtxcd","SECT511TX": "sect511tx","SUBTITLEATX": "subtitleatx","TOTAXPYR": "totaxpyr","ESTTAXCR": "esttaxcr","TXWITHLDSRC": "txwithldsrc","TXPAIDF2758": "txpaidf2758","ERRONBKUPWTHLD": "erronbkupwthld","ESTPNLTY": "estpnlty","TAXDUE": "taxdue","OVERPAY": "overpay","CRELAMT": "crelamt","INFLEG": "infleg","ACTNOTPR": "actnotpr","CHGNPRVRPTCD": "chgnprvrptcd","FILEDF990TCD": "filedf990tcd","CONTRACTNCD": "contractncd","FURNISHCPYCD": "furnishcpycd","CLAIMSTATCD": "claimstatcd","CNTRBTRSTXYRCD": "cntrbtrstxyrcd","DISTRIBDAFCD": "distribdafcd","ACQDRINDRINTCD": "distribdafcd","ORGCMPLYPUBCD": "orgcmplypubcd","FILEDLF1041IND": "filedlf1041ind","PROPEXCHCD": "propexchcd","BRWLNDMNYCD": "brwlndmnycd","FURNGOODSCD": "furngoodscd","PAIDCMPNCD": "paidcmpncd","TRANSFERCD": "transfercd","AGREMKPAYCD": "agremkpaycd","EXCEPTACTSIND": "exceptactsind","PRIORACTVCD": "prioractvcd","UNDISTRINCCD": "undistrinccd","APPLYPROVIND": "applyprovind","DIRINDIRINTCD": "dirindirintcd","EXCESSHLDCD": "excesshldcd","INVSTJEXMPTCD": "invstjexmptcd","PREVJEXMPTCD": "prevjexmptcd","PROPGNDACD": "propgndacd","IPUBELECTCD": "ipubelectcd","GRNTINDIVCD": "grntindivcd","NCHRTYGRNTCD": "nchrtygrntcd","NRELIGIOUSCD": "nreligiouscd","EXCPTRANSIND": "excptransind","RFPRSNLBNFTIND": "rfprsnlbnftind","PYPRSNLBNFTIND": "pyprsnlbnftind","TFAIRMRKTUNUSE": "tfairmrktunuse","VALNCHARITASSETS": "valncharitassets","CMPMININVSTRET": "cmpmininvstret","DISTRIBAMT": "distribamt","UNDISTRIBINCYR": "undistribincyr","ADJNETINCCOLA": "adjnetinccola","ADJNETINCCOLB": "adjnetinccolb","ADJNETINCCOLC": "adjnetinccolc","ADJNETINCCOLD": "adjnetinccold","ADJNETINCTOT": "adjnetinctot","QLFYDISTRIBA": "qlfydistriba","QLFYDISTRIBB": "qlfydistribb","QLFYDISTRIBC": "qlfydistribc","QLFYDISTRIBD": "qlfydistribd","QLFYDISTRIBTOT": "qlfydistribtot","VALASSETSCOLA": "valassetscola","VALASSETSCOLB": "valassetscolb","VALASSETSCOLC": "valassetscolc","VALASSETSCOLD": "valassetscold","VALASSETSTOT": "valassetstot","QLFYASSETA": "qlfyasseta","QLFYASSETB": "qlfyassetb","QLFYASSETC": "qlfyassetc","QLFYASSETD": "qlfyassetd","QLFYASSETTOT": "qlfyassettot","ENDWMNTSCOLA": "endwmntscola","ENDWMNTSCOLB": "endwmntscolb","ENDWMNTSCOLC": "endwmntscolc","ENDWMNTSCOLD": "endwmntscold","ENDWMNTSTOT": "endwmntstot","TOTSUPRTCOLA": "totsuprtcola","TOTSUPRTCOLB": "totsuprtcolb","TOTSUPRTCOLC": "totsuprtcolc","TOTSUPRTCOLD": "totsuprtcold","TOTSUPRTTOT": "totsuprttot","PUBSUPRTCOLA": "pubsuprtcola","PUBSUPRTCOLB": "pubsuprtcolb","PUBSUPRTCOLC": "pubsuprtcolc","PUBSUPRTCOLD": "pubsuprtcold","PUBSUPRTTOT": "pubsuprttot","GRSINVSTINCA": "grsinvstinca","GRSINVSTINCB": "grsinvstincb","GRSINVSTINCC": "grsinvstincc","GRSINVSTINCD": "grsinvstincd","GRSINVSTINCTOT": "grsinvstinctot","GRNTAPPRVFUT": "grntapprvfut","PROGSRVCACOLD": "progsrvcacold","PROGSRVCACOLE": "progsrvcacole","PROGSRVCBCOLD": "progsrvcbcold","PROGSRVCBCOLE": "progsrvcbcole","PROGSRVCCCOLD": "progsrvcccold","PROGSRVCCCOLE": "progsrvcccole","PROGSRVCDCOLD": "progsrvcdcold","PROGSRVCDCOLE": "progsrvcdcole","PROGSRVCECOLD": "progsrvcecold","PROGSRVCECOLE": "progsrvcecole","PROGSRVCFCOLD": "progsrvcfcold","PROGSRVCFCOLE": "progsrvcfcole","PROGSRVCGCOLD": "progsrvcgcold","PROGSRVCGCOLE": "progsrvcgcole","MEMBERSHPDUESD": "membershpduesd","MEMBERSHPDUESE": "membershpduese","INTONSVNGSD": "intonsvngsd","INTONSVNGSE": "intonsvngse","DVDNDSINTD": "dvdndsintd","DVDNDSINTE": "dvdndsinte","TRNSFRCASHCD": "trnsfrcashcd","TRNSOTHASSTSCD": "trnsothasstscd","SALESASSTSCD": "salesasstscd","PRCHSASSTSCD": "prchsasstscd","RENTLSFACLTSCD": "rentlsfacltscd","REIMBRSMNTSCD": "reimbrsmntscd","LOANSGUARCD": "loansguarcd","PERFSERVICESCD": "perfservicescd","SHARNGASSTSCD": "sharngasstscd"} - # Set resource limits for the pod here. For resource units in Kubernetes, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-units-in-kubernetes - resources: - request_memory: "2G" - request_cpu: "1" - operator: "GoogleCloudStorageToBigQueryOperator" description: "Task to load CSV data to a BigQuery table" - args: task_id: "load_irs_990_pf_2016_to_bq" - - # The GCS bucket where the CSV file is located in. bucket: "{{ var.value.composer_bucket }}" - - # The GCS object path for the CSV file source_objects: ["data/irs_990/irs_990_pf_2016/data_output.csv"] source_format: "CSV" destination_project_dataset_table: "irs_990.irs_990_pf_2016" - - # Use this if your CSV file contains a header row skip_leading_rows: 1 - - # How to write data to the table: overwrite, append, or write if empty - # See https://cloud.google.com/bigquery/docs/reference/auditlogs/rest/Shared.Types/WriteDisposition write_disposition: "WRITE_TRUNCATE" - - # The BigQuery table schema based on the CSV file. For more info, see - # https://cloud.google.com/bigquery/docs/schemas. - # Always use snake_case and lowercase for column names, and be explicit, - # i.e. specify modes for all columns. - schema_fields: - name: "ein" type: "string"