From 566f425ef258c281e2f8cb2c876902c69324f1ff Mon Sep 17 00:00:00 2001 From: Sourabh Niyogi Date: Fri, 16 Jun 2023 02:00:08 +0000 Subject: [PATCH] ABI-less Decoding of Calls + Event This PR contains new BigQuery UDFs to decode EVM calls and events **without knowing an ABI** using 2 key functions: * `abi_functions.PARSE_CALL(input)` * `abi_functions.PARSE_EVENT(data, topics)` To support ABI-less decoding, each of the above functions uses a large ~1MB library containing 2 compressed maps, one covering known call methods and one covering known event topics. Using the call 4byte methodID or the event's topic[0], the most likely ABI is synthesized from the map and decoding is attempted. For events, given the number of topics observed and the ABI guess, any mismatch results in combinatorial search on all possible indexed combinations, with the first success considered valid. How It Works: * Big Picture: approach taken in `PARSE_CALL` and `PARSE_EVENT` is to build a 1MB library of 2 _compressed_ maps: `call_map` and `event_map` * A `signatures` table is compiled by aggregating ABI signatures with 2.8MM+ records from multiple open-source repos. However, most of these hex signatures have never been observed on chain. * So, we tally _actual_ observations (`numObservations` from `logs` and `transactions`) and presence in ABI contracts (`numContracts` from `contracts`) in `crypto_ethereum.{` EVM Chains to build a reduced dataset --- README.md | 274 ++++++++++++++++++++++++++++++++------------ sql/PARSE_CALL.sql | 73 +++++++----- sql/PARSE_EVENT.sql | 170 +++++++++++++++++++-------- 3 files changed, 367 insertions(+), 150 deletions(-) diff --git a/README.md b/README.md index 3ea5345..adf94d4 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,207 @@ # abi-functions -Utility functions for using an ABI to decode EVM calls and events. 
+This repo contains BigQuery UDFs to decode EVM calls and events **without knowing an ABI** using 2 key functions: -## Examples +* `abi_functions.PARSE_CALL(input)` +* `abi_functions.PARSE_EVENT(data, topics)` -### Example 1 +To support ABI-less decoding, each of the above functions uses a large +1MB library containing 2 compressed maps, one covering known call +methods and one covering known event topics. Using the call 4byte methodID or the +event's topic[0], the most likely ABI is synthesized from the map and +decoding is attempted. For events, given the number of topics +observed and the ABI guess, any mismatch results in combinatorial +search on all possible indexed combinations, with the first success +considered valid. + + +### How It Works: +* Big Picture: approach taken in `PARSE_CALL` and `PARSE_EVENT` is to build a 1MB library of 2 _compressed_ maps: `call_map` and `event_map` +* A `signatures` table is compiled by aggregating ABI signatures with 2.8MM+ records from multiple open-source repos. However, most of these hex signatures have never been observed on chain. +* So, we tally _actual_ observations (`numObservations` from `logs` and `transactions`) and presence in ABI contracts (`numContracts` from `contracts`) +in `crypto_ethereum.{` EVM Chains to build a reduced dataset +* The current size of this reduced dataset with known ABIs is: + +``` ++-----------------------+--------------+----------+ +| length(hex_signature) | abiAvailable | count(*) | ++-----------------------+--------------+----------+ +| 10 | 20147 | 63600 | +| 66 | 11008 | 11008 | ++-----------------------+--------------+----------+ +``` + +* To maximize decoding rates, we actually only include records with 2 or more observations in 2023 to fit in a single 1MB map. Potentially the maps could be split into 2 files to increase coverage a bit further. + +### Plan: + +This repo is a work in progress. + +* Measure decoding rates and time to decode a day. 
+ +* Rebuild underlying map with GitHub Actions on a weekly basis. We will update the `abi_functions` dataset weekly to support new call methods and new topics. + +* Support EVM chain-specific libraries and EVM ecosystem wide libraries. + +## Key Decoding User-defined Functions, with Examples + +### [PARSE_CALL](sql/PARSE_CALL.sql) + +This function parses `transaction.input` *without* an ABI to produce key/value pairs decoding the input: + +Usage: (single record) +```sql +select `substrate-etl.abi_functions.PARSE_CALL`("0xe343fe12000000000000000000000000bd402a0cf18148389c6f10b6e67aea915c3960ec000000000000000000000000c02aaa39b223fe8d0a0e5c4f27ead9083c756cc20000000000000000000000000258f474786ddfd37abce6df6bbb1dd5dfc4434a0000000000000000000000000000000000000000000000000ed8db3e1827446c00000000000000000000000000000000000000000000000000000032bfdfa8d0") as call_args +``` +returns + +``` +[{ + "call_args": [{ + "name": "fromToken", + "value": "0xbd402A0cf18148389c6f10b6e67aea915C3960ec" + }, { + "name": "toToken", + "value": "0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2" + }, { + "name": "recipient", + "value": "0x0258F474786DdFd37ABCE6df6BBb1Dd5dfC4434a" + }, { + "name": "shareToMin", + "value": "1069845971240174700" + }, { + "name": "shareFrom", + "value": "217967470800" + }] +}] +``` + +This can be used on _arbitrary_ inputs in bulk processing with _varying_ calls / ABIs: + +Usage: (bulk processing) +``` +SELECT + `hash`, + LEFT(input, 10) AS methodID, + input, + `substrate-etl.abi_functions.PARSE_CALL`(input) +FROM + `bigquery-public-data.crypto_ethereum.transactions` AS transactions +WHERE + DATE(block_timestamp) = "2023-06-01" and length(input) >= 10 + limit 2000; +``` + +This can be used on _arbitrary_ inputs in bulk call processing with the same ABI as well: + +Usage: (bulk processing) +```sql +SELECT `substrate-etl.abi_functions`.PARSE_CALL(input) +FROM + `bigquery-public-data.crypto_ethereum.transactions` AS tx +WHERE tx.to_address = 
'0x084b1c3c81545d370f3634392de611caabff8148' + --filter to method `claimWithResolver(address,address)` + AND tx.input LIKE '0x0f5a5466%' + AND tx.receipt_status = 1 +``` + +### [PARSE_EVENT](sql/PARSE_EVENT.sql) + +This function parses `logs.topics` and `logs.data` **without an ABI** to produce key/value pairs. + +* `PARSE_EVENT(data, topics)` + +Usage: (single record) +``` +select `substrate-etl.abi_functions.PARSE_EVENT`("0x00000000000000000000000000000000000000000000000000036cc84b90729200000000000000000000000000000000000000000000000000000000000000400000000000000000000000000000000000000000000000000000000000000002000000000000000000000000c02aaa39b223fe8d0a0e5c4f27ead9083c756cc2000000000000000000000000323012d0a49fbd86dd1c4d363c7ea63244852e21", ["0x6fd378a9d8b7345c2e5b18229aaf1e39d32b177b501d0a0d26a0a858a23a9624"]) as events +``` + +returns: +```json +[{ + "events": [{ + "name": "from", + "value": "0x40CB4DA705a044016e66dB2E30AdE93EbFe4abD4" + }, { + "name": "to", + "value": "0x9A44630bD49001645291f2A08F4F07eB04bC184a" + }, { + "name": "value", + "value": "50000000000000000000" + }] +}] +``` + +This can be used for bulk processing events with _varying_ ABIs: + +Usage: (bulk processing of 500 records of a day) +``` +SELECT + transaction_hash, + DATA, + topics, + `substrate-etl.abi_functions.PARSE_EVENT`(data, topics) AS events +FROM + `bigquery-public-data.crypto_ethereum.logs` AS logs +WHERE + DATE(block_timestamp) = "2023-06-01" +LIMIT + 500; +``` + +Usage: (bulk processing of `PunkBidEntered` events from calls to the `CryptoPunksMarket` contract: + +```sql +SELECT CryptoPunksMarket_evt_PunkBidEntered(`etherscan`.PARSE_EVENT(lg.data,lg.topics)).* +FROM + `bigquery-public-data.crypto_ethereum.transactions` AS tx + ,`bigquery-public-data.crypto_ethereum.logs` AS lg +WHERE TRUE + AND tx.to_address = '0xb47e3cd837ddf8e4c57f05d70ab865de6e193bbb' + AND lg.address = tx.to_address + AND ARRAY_LENGTH(lg.topics) > 0 + --this is the keccak256 hash of 
`PunkBidEntered(uint256,uint256,address)` from the ABI + AND lg.topics[OFFSET(0)] = '0x5b859394fabae0c1ba88baffe67e751ab5248d2e879028b8c8d6897b0519f56a' + AND tx.hash = lg.transaction_hash + AND tx.receipt_status = 1 +``` + +### [GENERATE_ABI_FUNCTIONS](sql/GENERATE_ABI_FUNCTIONS.sql) + +Calling this function with a name prefix and an ABI will return two arrays of `CREATE FUNCTION` statements that can be used with a contract that implements the ABI. +- an array for processing method calls (parses `transactions.input`) +- an array for processing log events (parses `logs.topics`) + +Consider the [ABI](abi/CryptoPunksMarket.abi) for the `CryptoPunksMarket` contract at [`0xb47e3cd837ddf8e4c57f05d70ab865de6e193bbb`](https://etherscan.io/address/0xb47e3cd837ddf8e4c57f05d70ab865de6e193bbb). + +We can invoke the code generator as: + +```sql +SELECT `abi_functions`.GENERATE_ABI_FUNCTIONS("CryptoPunksMarket",ABI) +``` + +This produces a number of statements, for example this one for processing calls to the contract's `name()` function: + +```sql +CREATE TEMP FUNCTION CryptoPunksMarket_call_name(kv ARRAY>) RETURNS STRUCT<> LANGUAGE js AS """ +want=new Set([]);res=[];kv.forEach(function(el){if(want.has(el.arg)){res[el.arg]=el.val}});return res; +""" +OPTIONS(library="gs://blockchain-etl-bigquery/ethers.js"); +``` + +and this one for processing `Assign` event logs: + +```sql +CREATE TEMP FUNCTION CryptoPunksMarket_evt_Assign(kv ARRAY>) RETURNS STRUCT LANGUAGE js AS """ +want=new Set(['to','punkIndex']);res=[];kv.forEach(function(el){if(want.has(el.arg)){res[el.arg]=el.val}});return res; +""" +OPTIONS(library="gs://blockchain-etl-bigquery/ethers.js"); +``` + +Note that each of these requires an array of key/value pairs, `kv ARRAY>`. 
+ + +#### Example 1 Assuming you have the [ABI](https://github.com/blockchain-etl/abi-functions/blob/main/abi/CryptoPunksMarket.abi) in a table mapping contract to address, this will return a table of all bids to buy CryptoPunks: @@ -42,7 +239,7 @@ Generating a result like: | 728 | 5000000000000 | 0x664e23e4a17a4c7da26c706be5a861c0f7ff569d | | 2207 | 48500000000000000000 | 0x1919db36ca2fa2e15f9000fd9cdc2edcf863e685 | -### Example 2 +#### Example 2 ```sql CREATE TEMP FUNCTION parse_ReverseRegistrar_call_claimWithResolver(kv ARRAY>) RETURNS STRUCT LANGUAGE js AS """want=new Set(['owner','resolver']);res=[];kv.forEach(function(el){if(want.has(el.arg)){res[el.arg]=el.val}});return res;""" OPTIONS(library="gs://blockchain-etl-bigquery/ethers.js"); @@ -70,72 +267,3 @@ Generating a result like: | owner | `0xe11d762cc7b0448ed8c565734b742ce39bbb38a6` | | resolver | `0x0465719485db64e24d73d1619e03950830e4a5b3` | -## Functions in this repo - -### [GENERATE_ABI_FUNCTIONS](sql/GENERATE_ABI_FUNCTIONS.sql) - -Calling this function with a name prefix and an ABI will return two arrays of `CREATE FUNCTION` statements that can be used with a contract that implements the ABI. -- an array for processing method calls (parses `transactions.input`) -- an array for processing log events (parses `logs.topics`) - -Consider the [ABI](abi/CryptoPunksMarket.abi) for the `CryptoPunksMarket` contract at [`0xb47e3cd837ddf8e4c57f05d70ab865de6e193bbb`](https://etherscan.io/address/0xb47e3cd837ddf8e4c57f05d70ab865de6e193bbb). 
- -We can invoke the code generator as: - -```sql -SELECT `abi_functions`.GENERATE_ABI_FUNCTIONS("CryptoPunksMarket",ABI) -``` - -This produces a number of statments, for example this one for processing calls to the contract's `name()` function: - -```sql -CREATE TEMP FUNCTION CryptoPunksMarket_call_name(kv ARRAY>) RETURNS STRUCT<> LANGUAGE js AS """ -want=new Set([]);res=[];kv.forEach(function(el){if(want.has(el.arg)){res[el.arg]=el.val}});return res; -""" -OPTIONS(library="gs://blockchain-etl-bigquery/ethers.js"); -``` - -and this one for processing `Assign` event logs - -```sql -CREATE TEMP FUNCTION CryptoPunksMarket_evt_Assign(kv ARRAY>) RETURNS STRUCT LANGUAGE js AS """ -want=new Set(['to','punkIndex']);res=[];kv.forEach(function(el){if(want.has(el.arg)){res[el.arg]=el.val}});return res; -""" -OPTIONS(library="gs://blockchain-etl-bigquery/ethers.js"); -``` - -Note that each of these requires an array of key/value pairs, `kv ARRAY>`. - -### [PARSE_CALL](sql/PARSE_CALL.sql) - -This function parses `transaction.input` in the context of an ABI to produce key/value pairs. Here's an example using the ENS ReverseRegistrar contract: - -```sql -SELECT `abi_functions`.PARSE_CALL(ABI,input) -FROM - `bigquery-public-data.crypto_ethereum.transactions` AS tx -WHERE tx.to_address = '0x084b1c3c81545d370f3634392de611caabff8148' - --filter to method `claimWithResolver(address,address)` - AND tx.input LIKE '0x0f5a5466%' - AND tx.receipt_status = 1 -``` - -### [PARSE_EVENT](sql/PARSE_EVENT.sql) - -This function parses `logs.topics` and `logs.data` in the context of an ABI to produce key/value pairs. 
Here's an example extracting `PunkBidEntered` events from calls to the CryptoPunksMarket contract: - -```sql -SELECT CryptoPunksMarket_evt_PunkBidEntered(`etherscan`.PARSE_EVENT(ABI,'PunkBidEntered',lg.topics,lg.data)).* -FROM - `bigquery-public-data.crypto_ethereum.transactions` AS tx - ,`bigquery-public-data.crypto_ethereum.logs` AS lg -WHERE TRUE - AND tx.to_address = '0xb47e3cd837ddf8e4c57f05d70ab865de6e193bbb' - AND lg.address = tx.to_address - AND ARRAY_LENGTH(lg.topics) > 0 - --this is the keccak256 hash of `PunkBidEntered(uint256,uint256,address)` from the ABI - AND lg.topics[OFFSET(0)] = '0x5b859394fabae0c1ba88baffe67e751ab5248d2e879028b8c8d6897b0519f56a' - AND tx.hash = lg.transaction_hash - AND tx.receipt_status = 1 -``` - diff --git a/sql/PARSE_CALL.sql b/sql/PARSE_CALL.sql index 9f0629e..aa9361c 100644 --- a/sql/PARSE_CALL.sql +++ b/sql/PARSE_CALL.sql @@ -1,35 +1,46 @@ -CREATE OR REPLACE FUNCTION `abi_functions`.PARSE_CALL(abi STRING, dat STRING) RETURNS ARRAY> LANGUAGE js AS """ - abi = JSON.parse(abi); - var interface_instance = new ethers.utils.Interface(abi); - txargs = []; - txtypes = []; - res = []; - - // try-catch - input may be malformed. 
- try { - var pt = interface_instance.parseTransaction({data: dat}); - - frag = ""; - abi.forEach(function(x){ - if (x.name == pt.name) { - frag = x; - } - }); - txvals = pt.args; - //txtypes[pt.name] - frag.inputs.forEach(function(x) { - txargs.push(x.name); - txtypes.push(x.type); - }); - for (i = 0; i < txvals.length; i++) { - val = txvals[i]; - if (txtypes[i] == 'address') { - val = val.toLowerCase(); +CREATE OR REPLACE FUNCTION `substrate-etl.abi_functions.PARSE_CALL`(data STRING) +RETURNS ARRAY> +LANGUAGE js AS +""" +function convertBN(val) { + if (Array.isArray(val)) { + for (let i = 0; i < val.length; i++) { + val[i] = convertBN(val[i]) + } + } else if ( val._isBigNumber ) { + return val.toString(); + } + return val; +} +try { + let abiencoded = cn_calls[data.substring(0,10)]; + if ( abiencoded ) { + let pieces = abiencoded.split("|"); + if ( pieces.length >= 3 ) { + let functionName = pieces[0]; + // "stateMutability":"nonpayable" + let abi = {"name":functionName, "type":"function","inputs":[], "outputs":[], "stateMutability":"nonpayable"}; + let flds = pieces[1].length > 0 ? pieces[1].split(",") : []; + let types_unindexed = pieces[2].length > 0 ? 
pieces[2].split(",") : []; + for (let idx = 0; idx < flds.length; idx++) { + abi.inputs.push({name: flds[idx], type: types_unindexed[idx] }); } - res.push({arg:txargs[i],val:val}); + const iface = new ethers.utils.Interface([abi]); + const decodedData = iface.parseTransaction({ data }); + return(abi.inputs.map(function(y, idx) { + let v = decodedData.args[idx]; + return({name: y.name, value: convertBN(v) }); + })); } - } catch(e) {} - return res; + } +} catch (err) { + return [{name: "err", value: `${err.toString()}`}]; +} +return null; """ -OPTIONS(library="gs://blockchain-etl-bigquery/ethers.js"); +OPTIONS( + library = ["gs://cdn.polkaholic.io/pako.min.js", "gs://cdn.polkaholic.io/colorfulnotion-decoderData.js", "gs://blockchain-etl-bigquery/ethers-v5.js"] +); + + diff --git a/sql/PARSE_EVENT.sql b/sql/PARSE_EVENT.sql index 96e0505..296a823 100644 --- a/sql/PARSE_EVENT.sql +++ b/sql/PARSE_EVENT.sql @@ -1,50 +1,128 @@ -CREATE OR REPLACE FUNCTION `abi_functions`.PARSE_EVENT(abi STRING, functionname STRING, logtopics ARRAY, logdata STRING) RETURNS ARRAY> LANGUAGE js AS """ -abi = JSON.parse(abi); - var interface_instance = new ethers.utils.Interface(abi); - // PunkBidEntered (index_topic_1 uint256 punkIndex, uint256 value, index_topic_2 address fromAddress) - // PunkBidEntered(uint256,uint256,address) - // JSON.stringify(pt) - // var topicID = ethers.utils.keccak256(ethers.utils.toUtf8Bytes('PunkBidEntered(uint256,uint256,address)')); - // {"anonymous":false,"inputs":[{"indexed":true,"name":"punkIndex","type":"uint256"},{"indexed":false,"name":"value","type":"uint256"},{"indexed":true,"name":"fromAddress","type":"address"}],"name":"PunkBidEntered","type":"event"} - - var stripZeros = /^0x0+/; - var stripAddress = /^0x0{24}/; - - txargs = []; - txtypes = []; - res = []; - - // try-catch - input may be malformed. 
- try { -// var pt = interface_instance.parseTransaction({data: dat}); - - frag = ""; - abi.forEach(function(x){ - if (x.name == functionname && x.type == "event") { - frag = x; - } - }); +CREATE OR REPLACE FUNCTION `substrate-etl.abi_functions.PARSE_EVENT`(data STRING, topics ARRAY) +RETURNS ARRAY> +LANGUAGE js AS +""" +function generateBinaryNumbers(N, k) { + const result = []; + function generateCombinations(n, k, prefix = '') { + if (k === 0) { + result.push(prefix.padStart(N, '0')); + return; + } + if (n === 0) return; + generateCombinations(n - 1, k - 1, '1' + prefix); + generateCombinations(n - 1, k, '0'+ prefix); + } + generateCombinations(N, k); + return result; +} +function convertBN(val) { + if (Array.isArray(val)) { + for (let i = 0; i < val.length; i++) { + val[i] = convertBN(val[i]) + } + } else if ( val._isBigNumber ) { + return val.toString(); + } + return val; +} - j=1; - for (i=0; i < frag.inputs.length; i++) { - typ = frag.inputs[i].type; - if (frag.inputs[i].indexed) { - val = logtopics[j]; - j++; +let abiCandidates = []; +try { + let abi_maybe = null; + let text_signature = null; + let abiencoded = cn_events[topics[0].substring(0,10)]; + if ( abiencoded ) { + let pieces = abiencoded.split("|"); + if ( pieces.length == 4 ) { + let eventName = pieces[0]; + let abi = {"name":eventName,"type":"event","inputs":[], "outputs":[], "stateMutability":"nonpayable"}; + let flds = pieces[1].length > 0 ? pieces[1].split(",") : []; + let types_unindexed = pieces[2].length > 0 ? pieces[2].split(",") : []; + let types_indexed = pieces[3].length > 0 ? 
pieces[3].split(",") : []; + let types = []; + let u = 0; + let i = 0; + for (let idx = 0; idx < flds.length; idx++) { + let f = flds[idx]; + let lastchar = f.substring(f.length-1, f.length); + if ( lastchar == "*" ) { + abi.inputs.push({"indexed": true, name: f.substring(0, f.length-1), type: types_indexed[i] }); + types.push(types_indexed[i]); + i++; + } else { + abi.inputs.push({"indexed": false, name: f, type: types_unindexed[u] }); + types.push(types_unindexed[u]); + u++; + } } - else { - val = logdata - } - if (typ == "uint256") { - val = Number(val.replace(stripZeros, '0x')); - } - else if (typ == "address") { - val = val.toLowerCase(); - val = val.replace(stripAddress, '0x'); - } - res.push({arg:frag.inputs[i].name,val:val}); + abi_maybe = JSON.stringify(abi); + text_signature = `${eventName}(${types.join(",")})` } - } catch(e) {} - return res; + } else { + return null; + } + let abiMaybe = JSON.parse(abi_maybe); + let N = abiMaybe.inputs.length; + let topicLen = 0; + abiMaybe.inputs.forEach(function(x) { + if ( x.indexed ) topicLen++; + }); + if ( topics.length != topicLen + 1 ) { + if ( topics.length == 1 || topics.length == N + 1) { + let v = topics.length == 1 ? false : true; + let cand = JSON.parse(abi_maybe); + for (let i = 0; i < N; i++) { + cand.inputs[i].indexed = v; + } + abiCandidates.push(cand); + } else if ( (topics.length == 2 || topics.length == N ) ){ + let v = ( topics.length == 2 ) ? true : false; + for ( let i = 0; i < N; i++) { + let cand = JSON.parse(abi_maybe); + for (let j = 0; j < N; j++) { + cand.inputs[j].indexed = ( i == j ) ? v : ! v; + } + abiCandidates.push(cand); + } + } else { + let binaryStrings = generateBinaryNumbers(N, topics.length - 1); + for ( const b of binaryStrings ) { + let cand = JSON.parse(abi_maybe); + for (let j = 0; j < b.length; j++) { + cand.inputs[j].indexed = ( b[j] == "1" ) ? 
true : false; + } + abiCandidates.push(cand); + } + } + + } else { + abiCandidates.push(abiMaybe); + } + let tries = 0; + for (const abiCandidate of abiCandidates) { + try { + const iface = new ethers.utils.Interface([abiCandidate]); + const decodedData = iface.decodeEventLog(ethers.utils.id(text_signature), data, topics); + let res = abiCandidate.inputs.map(function(y, idx){ + let v = decodedData[idx]; + return({name: y.name, value: convertBN(v) }); + }); + if ( tries > 0 ) { + res.push({"name": "tries", "value": tries}); + } + return res; + } catch ( err ) { + tries++; + } + } +} catch (err) { + return [{name: "err", value: `${err.toString()}`}]; +} + +return [{name: "err", value: `decode failure`}]; """ -OPTIONS(library="gs://blockchain-etl-bigquery/ethers.js"); +OPTIONS( + library = ["gs://cdn.polkaholic.io/pako.min.js", "gs://cdn.polkaholic.io/colorfulnotion-decoderData.js", "gs://blockchain-etl-bigquery/ethers-v5.js"] +); +