diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/404.html b/404.html new file mode 100644 index 00000000..2092cfa2 --- /dev/null +++ b/404.html @@ -0,0 +1,2128 @@ + + + + + + + + + + + + + + + + + + + + + + + + + The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ +
+
+ +
+ A robot eating a stack of pancakes (in various styles) by DALL·E.
+ Some prompts generated by GPT-3. +
+
+
+ +

404 - Page not found.

+ +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/CNAME b/CNAME new file mode 100644 index 00000000..d97d6a69 --- /dev/null +++ b/CNAME @@ -0,0 +1 @@ +fullstackdeeplearning.com diff --git a/assets/images/favicon.png b/assets/images/favicon.png new file mode 100644 index 00000000..1cf13b9f Binary files /dev/null and b/assets/images/favicon.png differ diff --git a/assets/javascripts/bundle.6c14ae12.min.js b/assets/javascripts/bundle.6c14ae12.min.js new file mode 100644 index 00000000..bbff7768 --- /dev/null +++ b/assets/javascripts/bundle.6c14ae12.min.js @@ -0,0 +1,29 @@ +"use strict";(()=>{var wi=Object.create;var ur=Object.defineProperty;var Si=Object.getOwnPropertyDescriptor;var Ti=Object.getOwnPropertyNames,kt=Object.getOwnPropertySymbols,Oi=Object.getPrototypeOf,dr=Object.prototype.hasOwnProperty,Zr=Object.prototype.propertyIsEnumerable;var Xr=(e,t,r)=>t in e?ur(e,t,{enumerable:!0,configurable:!0,writable:!0,value:r}):e[t]=r,R=(e,t)=>{for(var r in t||(t={}))dr.call(t,r)&&Xr(e,r,t[r]);if(kt)for(var r of kt(t))Zr.call(t,r)&&Xr(e,r,t[r]);return e};var eo=(e,t)=>{var r={};for(var o in e)dr.call(e,o)&&t.indexOf(o)<0&&(r[o]=e[o]);if(e!=null&&kt)for(var o of kt(e))t.indexOf(o)<0&&Zr.call(e,o)&&(r[o]=e[o]);return r};var hr=(e,t)=>()=>(t||e((t={exports:{}}).exports,t),t.exports);var Mi=(e,t,r,o)=>{if(t&&typeof t=="object"||typeof t=="function")for(let n of Ti(t))!dr.call(e,n)&&n!==r&&ur(e,n,{get:()=>t[n],enumerable:!(o=Si(t,n))||o.enumerable});return e};var Ht=(e,t,r)=>(r=e!=null?wi(Oi(e)):{},Mi(t||!e||!e.__esModule?ur(r,"default",{value:e,enumerable:!0}):r,e));var ro=hr((br,to)=>{(function(e,t){typeof br=="object"&&typeof to!="undefined"?t():typeof define=="function"&&define.amd?define(t):t()})(br,function(){"use strict";function e(r){var o=!0,n=!1,i=null,s={text:!0,search:!0,url:!0,tel:!0,email:!0,password:!0,number:!0,date:!0,month:!0,week:!0,time:!0,datetime:!0,"datetime-local":!0};function a(C){return!!(C&&C!==document&&C.nodeName!=="HTML"&&C.nodeName!=="BODY"&&"classList"in C&&"contains"in C.classList)}function c(C){var it=C.type,Ue=C.tagName;return!!(Ue==="INPUT"&&s[it]&&!C.readOnly||Ue==="TEXTAREA"&&!C.readOnly||C.isContentEditable)}function p(C){C.classList.contains("focus-visible")||(C.classList.add("focus-visible"),C.setAttribute("data-focus-visible-added",""))}function l(C){C.hasAttribute("data-focus-visible-added")&&(C.classList.remove("focus-visible"),C.removeAttribute("data-focus-visible-added"))}function f(C){C.metaKey||C.altKey||C.ctrlKey||(a(r.activeElement)&&p(r.activeElement),o=!0)}function u(C){o=!1}function d(C){a(C.target)&&(o||c(C.target))&&p(C.target)}function v(C){a(C.target)&&(C.target.classList.contains("focus-visible")||C.target.hasAttribute("data-focus-visible-added"))&&(n=!0,window.clearTimeout(i),i=window.setTimeout(function(){n=!1},100),l(C.target))}function b(C){document.visibilityState==="hidden"&&(n&&(o=!0),z())}function z(){document.addEventListener("mousemove",G),document.addEventListener("mousedown",G),document.addEventListener("mouseup",G),document.addEventListener("pointermove",G),document.addEventListener("pointerdown",G),document.addEventListener("pointerup",G),document.addEventListener("touchmove",G),document.addEventListener("touchstart",G),document.addEventListener("touchend",G)}function K(){document.removeEventListener("mousemove",G),document.removeEventListener("mousedown",G),document.removeEventListener("mouseup",G),document.removeEventListener("pointermove",G),document.removeEventListener("pointerdown",G),document.removeEventListener("pointerup",G),document.removeEventListener("touchmove",G),document.removeEventListener("touchstart",G),document.removeEventListener("touchend",G)}function G(C){C.target.nodeName&&C.target.nodeName.toLowerCase()==="html"||(o=!1,K())}document.addEventListener("keydown",f,!0),document.addEventListener("mousedown",u,!0),document.addEventListener("pointerdown",u,!0),document.addEventListener("touchstart",u,!0),document.addEventListener("visibilitychange",b,!0),z(),r.addEventListener("focus",d,!0),r.addEventListener("blur",v,!0),r.nodeType===Node.DOCUMENT_FRAGMENT_NODE&&r.host?r.host.setAttribute("data-js-focus-visible",""):r.nodeType===Node.DOCUMENT_NODE&&(document.documentElement.classList.add("js-focus-visible"),document.documentElement.setAttribute("data-js-focus-visible",""))}if(typeof window!="undefined"&&typeof document!="undefined"){window.applyFocusVisiblePolyfill=e;var t;try{t=new CustomEvent("focus-visible-polyfill-ready")}catch(r){t=document.createEvent("CustomEvent"),t.initCustomEvent("focus-visible-polyfill-ready",!1,!1,{})}window.dispatchEvent(t)}typeof document!="undefined"&&e(document)})});var Vr=hr((Ot,Dr)=>{/*! + * clipboard.js v2.0.11 + * https://clipboardjs.com/ + * + * Licensed MIT © Zeno Rocha + */(function(t,r){typeof Ot=="object"&&typeof Dr=="object"?Dr.exports=r():typeof define=="function"&&define.amd?define([],r):typeof Ot=="object"?Ot.ClipboardJS=r():t.ClipboardJS=r()})(Ot,function(){return function(){var e={686:function(o,n,i){"use strict";i.d(n,{default:function(){return Ei}});var s=i(279),a=i.n(s),c=i(370),p=i.n(c),l=i(817),f=i.n(l);function u(W){try{return document.execCommand(W)}catch(O){return!1}}var d=function(O){var S=f()(O);return u("cut"),S},v=d;function b(W){var O=document.documentElement.getAttribute("dir")==="rtl",S=document.createElement("textarea");S.style.fontSize="12pt",S.style.border="0",S.style.padding="0",S.style.margin="0",S.style.position="absolute",S.style[O?"right":"left"]="-9999px";var $=window.pageYOffset||document.documentElement.scrollTop;return S.style.top="".concat($,"px"),S.setAttribute("readonly",""),S.value=W,S}var z=function(O,S){var $=b(O);S.container.appendChild($);var F=f()($);return u("copy"),$.remove(),F},K=function(O){var S=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body},$="";return typeof O=="string"?$=z(O,S):O instanceof HTMLInputElement&&!["text","search","url","tel","password"].includes(O==null?void 0:O.type)?$=z(O.value,S):($=f()(O),u("copy")),$},G=K;function C(W){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?C=function(S){return typeof S}:C=function(S){return S&&typeof Symbol=="function"&&S.constructor===Symbol&&S!==Symbol.prototype?"symbol":typeof S},C(W)}var it=function(){var O=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{},S=O.action,$=S===void 0?"copy":S,F=O.container,Q=O.target,_e=O.text;if($!=="copy"&&$!=="cut")throw new Error('Invalid "action" value, use either "copy" or "cut"');if(Q!==void 0)if(Q&&C(Q)==="object"&&Q.nodeType===1){if($==="copy"&&Q.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. Please use "readonly" instead of "disabled" attribute');if($==="cut"&&(Q.hasAttribute("readonly")||Q.hasAttribute("disabled")))throw new Error(`Invalid "target" attribute. You can't cut text from elements with "readonly" or "disabled" attributes`)}else throw new Error('Invalid "target" value, use a valid Element');if(_e)return G(_e,{container:F});if(Q)return $==="cut"?v(Q):G(Q,{container:F})},Ue=it;function Pe(W){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?Pe=function(S){return typeof S}:Pe=function(S){return S&&typeof Symbol=="function"&&S.constructor===Symbol&&S!==Symbol.prototype?"symbol":typeof S},Pe(W)}function ui(W,O){if(!(W instanceof O))throw new TypeError("Cannot call a class as a function")}function Jr(W,O){for(var S=0;S0&&arguments[0]!==void 0?arguments[0]:{};this.action=typeof F.action=="function"?F.action:this.defaultAction,this.target=typeof F.target=="function"?F.target:this.defaultTarget,this.text=typeof F.text=="function"?F.text:this.defaultText,this.container=Pe(F.container)==="object"?F.container:document.body}},{key:"listenClick",value:function(F){var Q=this;this.listener=p()(F,"click",function(_e){return Q.onClick(_e)})}},{key:"onClick",value:function(F){var Q=F.delegateTarget||F.currentTarget,_e=this.action(Q)||"copy",Ct=Ue({action:_e,container:this.container,target:this.target(Q),text:this.text(Q)});this.emit(Ct?"success":"error",{action:_e,text:Ct,trigger:Q,clearSelection:function(){Q&&Q.focus(),window.getSelection().removeAllRanges()}})}},{key:"defaultAction",value:function(F){return fr("action",F)}},{key:"defaultTarget",value:function(F){var Q=fr("target",F);if(Q)return document.querySelector(Q)}},{key:"defaultText",value:function(F){return fr("text",F)}},{key:"destroy",value:function(){this.listener.destroy()}}],[{key:"copy",value:function(F){var Q=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body};return G(F,Q)}},{key:"cut",value:function(F){return v(F)}},{key:"isSupported",value:function(){var F=arguments.length>0&&arguments[0]!==void 0?arguments[0]:["copy","cut"],Q=typeof F=="string"?[F]:F,_e=!!document.queryCommandSupported;return Q.forEach(function(Ct){_e=_e&&!!document.queryCommandSupported(Ct)}),_e}}]),S}(a()),Ei=yi},828:function(o){var n=9;if(typeof Element!="undefined"&&!Element.prototype.matches){var i=Element.prototype;i.matches=i.matchesSelector||i.mozMatchesSelector||i.msMatchesSelector||i.oMatchesSelector||i.webkitMatchesSelector}function s(a,c){for(;a&&a.nodeType!==n;){if(typeof a.matches=="function"&&a.matches(c))return a;a=a.parentNode}}o.exports=s},438:function(o,n,i){var s=i(828);function a(l,f,u,d,v){var b=p.apply(this,arguments);return l.addEventListener(u,b,v),{destroy:function(){l.removeEventListener(u,b,v)}}}function c(l,f,u,d,v){return typeof l.addEventListener=="function"?a.apply(null,arguments):typeof u=="function"?a.bind(null,document).apply(null,arguments):(typeof l=="string"&&(l=document.querySelectorAll(l)),Array.prototype.map.call(l,function(b){return a(b,f,u,d,v)}))}function p(l,f,u,d){return function(v){v.delegateTarget=s(v.target,f),v.delegateTarget&&d.call(l,v)}}o.exports=c},879:function(o,n){n.node=function(i){return i!==void 0&&i instanceof HTMLElement&&i.nodeType===1},n.nodeList=function(i){var s=Object.prototype.toString.call(i);return i!==void 0&&(s==="[object NodeList]"||s==="[object HTMLCollection]")&&"length"in i&&(i.length===0||n.node(i[0]))},n.string=function(i){return typeof i=="string"||i instanceof String},n.fn=function(i){var s=Object.prototype.toString.call(i);return s==="[object Function]"}},370:function(o,n,i){var s=i(879),a=i(438);function c(u,d,v){if(!u&&!d&&!v)throw new Error("Missing required arguments");if(!s.string(d))throw new TypeError("Second argument must be a String");if(!s.fn(v))throw new TypeError("Third argument must be a Function");if(s.node(u))return p(u,d,v);if(s.nodeList(u))return l(u,d,v);if(s.string(u))return f(u,d,v);throw new TypeError("First argument must be a String, HTMLElement, HTMLCollection, or NodeList")}function p(u,d,v){return u.addEventListener(d,v),{destroy:function(){u.removeEventListener(d,v)}}}function l(u,d,v){return Array.prototype.forEach.call(u,function(b){b.addEventListener(d,v)}),{destroy:function(){Array.prototype.forEach.call(u,function(b){b.removeEventListener(d,v)})}}}function f(u,d,v){return a(document.body,u,d,v)}o.exports=c},817:function(o){function n(i){var s;if(i.nodeName==="SELECT")i.focus(),s=i.value;else if(i.nodeName==="INPUT"||i.nodeName==="TEXTAREA"){var a=i.hasAttribute("readonly");a||i.setAttribute("readonly",""),i.select(),i.setSelectionRange(0,i.value.length),a||i.removeAttribute("readonly"),s=i.value}else{i.hasAttribute("contenteditable")&&i.focus();var c=window.getSelection(),p=document.createRange();p.selectNodeContents(i),c.removeAllRanges(),c.addRange(p),s=c.toString()}return s}o.exports=n},279:function(o){function n(){}n.prototype={on:function(i,s,a){var c=this.e||(this.e={});return(c[i]||(c[i]=[])).push({fn:s,ctx:a}),this},once:function(i,s,a){var c=this;function p(){c.off(i,p),s.apply(a,arguments)}return p._=s,this.on(i,p,a)},emit:function(i){var s=[].slice.call(arguments,1),a=((this.e||(this.e={}))[i]||[]).slice(),c=0,p=a.length;for(c;c{"use strict";/*! + * escape-html + * Copyright(c) 2012-2013 TJ Holowaychuk + * Copyright(c) 2015 Andreas Lubbe + * Copyright(c) 2015 Tiancheng "Timothy" Gu + * MIT Licensed + */var Ha=/["'&<>]/;Nn.exports=$a;function $a(e){var t=""+e,r=Ha.exec(t);if(!r)return t;var o,n="",i=0,s=0;for(i=r.index;i0&&i[i.length-1])&&(p[0]===6||p[0]===2)){r=0;continue}if(p[0]===3&&(!i||p[1]>i[0]&&p[1]=e.length&&(e=void 0),{value:e&&e[o++],done:!e}}};throw new TypeError(t?"Object is not iterable.":"Symbol.iterator is not defined.")}function N(e,t){var r=typeof Symbol=="function"&&e[Symbol.iterator];if(!r)return e;var o=r.call(e),n,i=[],s;try{for(;(t===void 0||t-- >0)&&!(n=o.next()).done;)i.push(n.value)}catch(a){s={error:a}}finally{try{n&&!n.done&&(r=o.return)&&r.call(o)}finally{if(s)throw s.error}}return i}function D(e,t,r){if(r||arguments.length===2)for(var o=0,n=t.length,i;o1||a(u,d)})})}function a(u,d){try{c(o[u](d))}catch(v){f(i[0][3],v)}}function c(u){u.value instanceof Ze?Promise.resolve(u.value.v).then(p,l):f(i[0][2],u)}function p(u){a("next",u)}function l(u){a("throw",u)}function f(u,d){u(d),i.shift(),i.length&&a(i[0][0],i[0][1])}}function io(e){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var t=e[Symbol.asyncIterator],r;return t?t.call(e):(e=typeof we=="function"?we(e):e[Symbol.iterator](),r={},o("next"),o("throw"),o("return"),r[Symbol.asyncIterator]=function(){return this},r);function o(i){r[i]=e[i]&&function(s){return new Promise(function(a,c){s=e[i](s),n(a,c,s.done,s.value)})}}function n(i,s,a,c){Promise.resolve(c).then(function(p){i({value:p,done:a})},s)}}function k(e){return typeof e=="function"}function at(e){var t=function(o){Error.call(o),o.stack=new Error().stack},r=e(t);return r.prototype=Object.create(Error.prototype),r.prototype.constructor=r,r}var Rt=at(function(e){return function(r){e(this),this.message=r?r.length+` errors occurred during unsubscription: +`+r.map(function(o,n){return n+1+") "+o.toString()}).join(` + `):"",this.name="UnsubscriptionError",this.errors=r}});function De(e,t){if(e){var r=e.indexOf(t);0<=r&&e.splice(r,1)}}var Ie=function(){function e(t){this.initialTeardown=t,this.closed=!1,this._parentage=null,this._finalizers=null}return e.prototype.unsubscribe=function(){var t,r,o,n,i;if(!this.closed){this.closed=!0;var s=this._parentage;if(s)if(this._parentage=null,Array.isArray(s))try{for(var a=we(s),c=a.next();!c.done;c=a.next()){var p=c.value;p.remove(this)}}catch(b){t={error:b}}finally{try{c&&!c.done&&(r=a.return)&&r.call(a)}finally{if(t)throw t.error}}else s.remove(this);var l=this.initialTeardown;if(k(l))try{l()}catch(b){i=b instanceof Rt?b.errors:[b]}var f=this._finalizers;if(f){this._finalizers=null;try{for(var u=we(f),d=u.next();!d.done;d=u.next()){var v=d.value;try{ao(v)}catch(b){i=i!=null?i:[],b instanceof Rt?i=D(D([],N(i)),N(b.errors)):i.push(b)}}}catch(b){o={error:b}}finally{try{d&&!d.done&&(n=u.return)&&n.call(u)}finally{if(o)throw o.error}}}if(i)throw new Rt(i)}},e.prototype.add=function(t){var r;if(t&&t!==this)if(this.closed)ao(t);else{if(t instanceof e){if(t.closed||t._hasParent(this))return;t._addParent(this)}(this._finalizers=(r=this._finalizers)!==null&&r!==void 0?r:[]).push(t)}},e.prototype._hasParent=function(t){var r=this._parentage;return r===t||Array.isArray(r)&&r.includes(t)},e.prototype._addParent=function(t){var r=this._parentage;this._parentage=Array.isArray(r)?(r.push(t),r):r?[r,t]:t},e.prototype._removeParent=function(t){var r=this._parentage;r===t?this._parentage=null:Array.isArray(r)&&De(r,t)},e.prototype.remove=function(t){var r=this._finalizers;r&&De(r,t),t instanceof e&&t._removeParent(this)},e.EMPTY=function(){var t=new e;return t.closed=!0,t}(),e}();var gr=Ie.EMPTY;function Pt(e){return e instanceof Ie||e&&"closed"in e&&k(e.remove)&&k(e.add)&&k(e.unsubscribe)}function ao(e){k(e)?e():e.unsubscribe()}var Ae={onUnhandledError:null,onStoppedNotification:null,Promise:void 0,useDeprecatedSynchronousErrorHandling:!1,useDeprecatedNextContext:!1};var st={setTimeout:function(e,t){for(var r=[],o=2;o0},enumerable:!1,configurable:!0}),t.prototype._trySubscribe=function(r){return this._throwIfClosed(),e.prototype._trySubscribe.call(this,r)},t.prototype._subscribe=function(r){return this._throwIfClosed(),this._checkFinalizedStatuses(r),this._innerSubscribe(r)},t.prototype._innerSubscribe=function(r){var o=this,n=this,i=n.hasError,s=n.isStopped,a=n.observers;return i||s?gr:(this.currentObservers=null,a.push(r),new Ie(function(){o.currentObservers=null,De(a,r)}))},t.prototype._checkFinalizedStatuses=function(r){var o=this,n=o.hasError,i=o.thrownError,s=o.isStopped;n?r.error(i):s&&r.complete()},t.prototype.asObservable=function(){var r=new P;return r.source=this,r},t.create=function(r,o){return new ho(r,o)},t}(P);var ho=function(e){ie(t,e);function t(r,o){var n=e.call(this)||this;return n.destination=r,n.source=o,n}return t.prototype.next=function(r){var o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.next)===null||n===void 0||n.call(o,r)},t.prototype.error=function(r){var o,n;(n=(o=this.destination)===null||o===void 0?void 0:o.error)===null||n===void 0||n.call(o,r)},t.prototype.complete=function(){var r,o;(o=(r=this.destination)===null||r===void 0?void 0:r.complete)===null||o===void 0||o.call(r)},t.prototype._subscribe=function(r){var o,n;return(n=(o=this.source)===null||o===void 0?void 0:o.subscribe(r))!==null&&n!==void 0?n:gr},t}(x);var yt={now:function(){return(yt.delegate||Date).now()},delegate:void 0};var Et=function(e){ie(t,e);function t(r,o,n){r===void 0&&(r=1/0),o===void 0&&(o=1/0),n===void 0&&(n=yt);var i=e.call(this)||this;return i._bufferSize=r,i._windowTime=o,i._timestampProvider=n,i._buffer=[],i._infiniteTimeWindow=!0,i._infiniteTimeWindow=o===1/0,i._bufferSize=Math.max(1,r),i._windowTime=Math.max(1,o),i}return t.prototype.next=function(r){var o=this,n=o.isStopped,i=o._buffer,s=o._infiniteTimeWindow,a=o._timestampProvider,c=o._windowTime;n||(i.push(r),!s&&i.push(a.now()+c)),this._trimBuffer(),e.prototype.next.call(this,r)},t.prototype._subscribe=function(r){this._throwIfClosed(),this._trimBuffer();for(var o=this._innerSubscribe(r),n=this,i=n._infiniteTimeWindow,s=n._buffer,a=s.slice(),c=0;c0?e.prototype.requestAsyncId.call(this,r,o,n):(r.actions.push(this),r._scheduled||(r._scheduled=lt.requestAnimationFrame(function(){return r.flush(void 0)})))},t.prototype.recycleAsyncId=function(r,o,n){var i;if(n===void 0&&(n=0),n!=null?n>0:this.delay>0)return e.prototype.recycleAsyncId.call(this,r,o,n);var s=r.actions;o!=null&&((i=s[s.length-1])===null||i===void 0?void 0:i.id)!==o&&(lt.cancelAnimationFrame(o),r._scheduled=void 0)},t}(jt);var go=function(e){ie(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t.prototype.flush=function(r){this._active=!0;var o=this._scheduled;this._scheduled=void 0;var n=this.actions,i;r=r||n.shift();do if(i=r.execute(r.state,r.delay))break;while((r=n[0])&&r.id===o&&n.shift());if(this._active=!1,i){for(;(r=n[0])&&r.id===o&&n.shift();)r.unsubscribe();throw i}},t}(Wt);var Oe=new go(vo);var L=new P(function(e){return e.complete()});function Nt(e){return e&&k(e.schedule)}function Or(e){return e[e.length-1]}function Qe(e){return k(Or(e))?e.pop():void 0}function Me(e){return Nt(Or(e))?e.pop():void 0}function Ut(e,t){return typeof Or(e)=="number"?e.pop():t}var mt=function(e){return e&&typeof e.length=="number"&&typeof e!="function"};function Dt(e){return k(e==null?void 0:e.then)}function Vt(e){return k(e[pt])}function zt(e){return Symbol.asyncIterator&&k(e==null?void 0:e[Symbol.asyncIterator])}function qt(e){return new TypeError("You provided "+(e!==null&&typeof e=="object"?"an invalid object":"'"+e+"'")+" where a stream was expected. You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.")}function Pi(){return typeof Symbol!="function"||!Symbol.iterator?"@@iterator":Symbol.iterator}var Kt=Pi();function Qt(e){return k(e==null?void 0:e[Kt])}function Yt(e){return no(this,arguments,function(){var r,o,n,i;return $t(this,function(s){switch(s.label){case 0:r=e.getReader(),s.label=1;case 1:s.trys.push([1,,9,10]),s.label=2;case 2:return[4,Ze(r.read())];case 3:return o=s.sent(),n=o.value,i=o.done,i?[4,Ze(void 0)]:[3,5];case 4:return[2,s.sent()];case 5:return[4,Ze(n)];case 6:return[4,s.sent()];case 7:return s.sent(),[3,2];case 8:return[3,10];case 9:return r.releaseLock(),[7];case 10:return[2]}})})}function Bt(e){return k(e==null?void 0:e.getReader)}function I(e){if(e instanceof P)return e;if(e!=null){if(Vt(e))return Ii(e);if(mt(e))return Fi(e);if(Dt(e))return ji(e);if(zt(e))return xo(e);if(Qt(e))return Wi(e);if(Bt(e))return Ni(e)}throw qt(e)}function Ii(e){return new P(function(t){var r=e[pt]();if(k(r.subscribe))return r.subscribe(t);throw new TypeError("Provided object does not correctly implement Symbol.observable")})}function Fi(e){return new P(function(t){for(var r=0;r=2;return function(o){return o.pipe(e?M(function(n,i){return e(n,i,o)}):ue,xe(1),r?He(t):Io(function(){return new Jt}))}}function Fo(){for(var e=[],t=0;t=2,!0))}function le(e){e===void 0&&(e={});var t=e.connector,r=t===void 0?function(){return new x}:t,o=e.resetOnError,n=o===void 0?!0:o,i=e.resetOnComplete,s=i===void 0?!0:i,a=e.resetOnRefCountZero,c=a===void 0?!0:a;return function(p){var l,f,u,d=0,v=!1,b=!1,z=function(){f==null||f.unsubscribe(),f=void 0},K=function(){z(),l=u=void 0,v=b=!1},G=function(){var C=l;K(),C==null||C.unsubscribe()};return g(function(C,it){d++,!b&&!v&&z();var Ue=u=u!=null?u:r();it.add(function(){d--,d===0&&!b&&!v&&(f=Hr(G,c))}),Ue.subscribe(it),!l&&d>0&&(l=new tt({next:function(Pe){return Ue.next(Pe)},error:function(Pe){b=!0,z(),f=Hr(K,n,Pe),Ue.error(Pe)},complete:function(){v=!0,z(),f=Hr(K,s),Ue.complete()}}),I(C).subscribe(l))})(p)}}function Hr(e,t){for(var r=[],o=2;oe.next(document)),e}function q(e,t=document){return Array.from(t.querySelectorAll(e))}function U(e,t=document){let r=se(e,t);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${e}" to be present`);return r}function se(e,t=document){return t.querySelector(e)||void 0}function Re(){return document.activeElement instanceof HTMLElement&&document.activeElement||void 0}var na=_(h(document.body,"focusin"),h(document.body,"focusout")).pipe(ke(1),V(void 0),m(()=>Re()||document.body),J(1));function Zt(e){return na.pipe(m(t=>e.contains(t)),X())}function Je(e){return{x:e.offsetLeft,y:e.offsetTop}}function Uo(e){return _(h(window,"load"),h(window,"resize")).pipe(Ce(0,Oe),m(()=>Je(e)),V(Je(e)))}function er(e){return{x:e.scrollLeft,y:e.scrollTop}}function dt(e){return _(h(e,"scroll"),h(window,"resize")).pipe(Ce(0,Oe),m(()=>er(e)),V(er(e)))}function Do(e,t){if(typeof t=="string"||typeof t=="number")e.innerHTML+=t.toString();else if(t instanceof Node)e.appendChild(t);else if(Array.isArray(t))for(let r of t)Do(e,r)}function T(e,t,...r){let o=document.createElement(e);if(t)for(let n of Object.keys(t))typeof t[n]!="undefined"&&(typeof t[n]!="boolean"?o.setAttribute(n,t[n]):o.setAttribute(n,""));for(let n of r)Do(o,n);return o}function tr(e){if(e>999){let t=+((e-950)%1e3>99);return`${((e+1e-6)/1e3).toFixed(t)}k`}else return e.toString()}function ht(e){let t=T("script",{src:e});return H(()=>(document.head.appendChild(t),_(h(t,"load"),h(t,"error").pipe(E(()=>Mr(()=>new ReferenceError(`Invalid script: ${e}`))))).pipe(m(()=>{}),A(()=>document.head.removeChild(t)),xe(1))))}var Vo=new x,ia=H(()=>typeof ResizeObserver=="undefined"?ht("https://unpkg.com/resize-observer-polyfill"):j(void 0)).pipe(m(()=>new ResizeObserver(e=>{for(let t of e)Vo.next(t)})),E(e=>_(Ve,j(e)).pipe(A(()=>e.disconnect()))),J(1));function he(e){return{width:e.offsetWidth,height:e.offsetHeight}}function ye(e){return ia.pipe(w(t=>t.observe(e)),E(t=>Vo.pipe(M(({target:r})=>r===e),A(()=>t.unobserve(e)),m(()=>he(e)))),V(he(e)))}function bt(e){return{width:e.scrollWidth,height:e.scrollHeight}}function zo(e){let t=e.parentElement;for(;t&&(e.scrollWidth<=t.scrollWidth&&e.scrollHeight<=t.scrollHeight);)t=(e=t).parentElement;return t?e:void 0}var qo=new x,aa=H(()=>j(new IntersectionObserver(e=>{for(let t of e)qo.next(t)},{threshold:0}))).pipe(E(e=>_(Ve,j(e)).pipe(A(()=>e.disconnect()))),J(1));function rr(e){return aa.pipe(w(t=>t.observe(e)),E(t=>qo.pipe(M(({target:r})=>r===e),A(()=>t.unobserve(e)),m(({isIntersecting:r})=>r))))}function Ko(e,t=16){return dt(e).pipe(m(({y:r})=>{let o=he(e),n=bt(e);return r>=n.height-o.height-t}),X())}var or={drawer:U("[data-md-toggle=drawer]"),search:U("[data-md-toggle=search]")};function Qo(e){return or[e].checked}function Ke(e,t){or[e].checked!==t&&or[e].click()}function We(e){let t=or[e];return h(t,"change").pipe(m(()=>t.checked),V(t.checked))}function sa(e,t){switch(e.constructor){case HTMLInputElement:return e.type==="radio"?/^Arrow/.test(t):!0;case HTMLSelectElement:case HTMLTextAreaElement:return!0;default:return e.isContentEditable}}function ca(){return _(h(window,"compositionstart").pipe(m(()=>!0)),h(window,"compositionend").pipe(m(()=>!1))).pipe(V(!1))}function Yo(){let e=h(window,"keydown").pipe(M(t=>!(t.metaKey||t.ctrlKey)),m(t=>({mode:Qo("search")?"search":"global",type:t.key,claim(){t.preventDefault(),t.stopPropagation()}})),M(({mode:t,type:r})=>{if(t==="global"){let o=Re();if(typeof o!="undefined")return!sa(o,r)}return!0}),le());return ca().pipe(E(t=>t?L:e))}function pe(){return new URL(location.href)}function ot(e,t=!1){if(te("navigation.instant")&&!t){let r=T("a",{href:e.href});document.body.appendChild(r),r.click(),r.remove()}else location.href=e.href}function Bo(){return new x}function Go(){return location.hash.slice(1)}function nr(e){let t=T("a",{href:e});t.addEventListener("click",r=>r.stopPropagation()),t.click()}function pa(e){return _(h(window,"hashchange"),e).pipe(m(Go),V(Go()),M(t=>t.length>0),J(1))}function Jo(e){return pa(e).pipe(m(t=>se(`[id="${t}"]`)),M(t=>typeof t!="undefined"))}function Fr(e){let t=matchMedia(e);return Xt(r=>t.addListener(()=>r(t.matches))).pipe(V(t.matches))}function Xo(){let e=matchMedia("print");return _(h(window,"beforeprint").pipe(m(()=>!0)),h(window,"afterprint").pipe(m(()=>!1))).pipe(V(e.matches))}function jr(e,t){return e.pipe(E(r=>r?t():L))}function ir(e,t){return new P(r=>{let o=new XMLHttpRequest;o.open("GET",`${e}`),o.responseType="blob",o.addEventListener("load",()=>{o.status>=200&&o.status<300?(r.next(o.response),r.complete()):r.error(new Error(o.statusText))}),o.addEventListener("error",()=>{r.error(new Error("Network Error"))}),o.addEventListener("abort",()=>{r.error(new Error("Request aborted"))}),typeof(t==null?void 0:t.progress$)!="undefined"&&(o.addEventListener("progress",n=>{if(n.lengthComputable)t.progress$.next(n.loaded/n.total*100);else{let i=Number(o.getResponseHeader("Content-Length"))||0;t.progress$.next(n.loaded/i*100)}}),t.progress$.next(5)),o.send()})}function Ne(e,t){return ir(e,t).pipe(E(r=>r.text()),m(r=>JSON.parse(r)),J(1))}function Zo(e,t){let r=new DOMParser;return ir(e,t).pipe(E(o=>o.text()),m(o=>r.parseFromString(o,"text/xml")),J(1))}function en(){return{x:Math.max(0,scrollX),y:Math.max(0,scrollY)}}function tn(){return _(h(window,"scroll",{passive:!0}),h(window,"resize",{passive:!0})).pipe(m(en),V(en()))}function rn(){return{width:innerWidth,height:innerHeight}}function on(){return h(window,"resize",{passive:!0}).pipe(m(rn),V(rn()))}function nn(){return B([tn(),on()]).pipe(m(([e,t])=>({offset:e,size:t})),J(1))}function ar(e,{viewport$:t,header$:r}){let o=t.pipe(ee("size")),n=B([o,r]).pipe(m(()=>Je(e)));return B([r,t,n]).pipe(m(([{height:i},{offset:s,size:a},{x:c,y:p}])=>({offset:{x:s.x-c,y:s.y-p+i},size:a})))}function la(e){return h(e,"message",t=>t.data)}function ma(e){let t=new x;return t.subscribe(r=>e.postMessage(r)),t}function an(e,t=new Worker(e)){let r=la(t),o=ma(t),n=new x;n.subscribe(o);let i=o.pipe(Z(),re(!0));return n.pipe(Z(),qe(r.pipe(Y(i))),le())}var fa=U("#__config"),vt=JSON.parse(fa.textContent);vt.base=`${new URL(vt.base,pe())}`;function me(){return vt}function te(e){return vt.features.includes(e)}function be(e,t){return typeof t!="undefined"?vt.translations[e].replace("#",t.toString()):vt.translations[e]}function Ee(e,t=document){return U(`[data-md-component=${e}]`,t)}function oe(e,t=document){return q(`[data-md-component=${e}]`,t)}function ua(e){let t=U(".md-typeset > :first-child",e);return h(t,"click",{once:!0}).pipe(m(()=>U(".md-typeset",e)),m(r=>({hash:__md_hash(r.innerHTML)})))}function sn(e){if(!te("announce.dismiss")||!e.childElementCount)return L;if(!e.hidden){let t=U(".md-typeset",e);__md_hash(t.innerHTML)===__md_get("__announce")&&(e.hidden=!0)}return H(()=>{let t=new x;return t.subscribe(({hash:r})=>{e.hidden=!0,__md_set("__announce",r)}),ua(e).pipe(w(r=>t.next(r)),A(()=>t.complete()),m(r=>R({ref:e},r)))})}function da(e,{target$:t}){return t.pipe(m(r=>({hidden:r!==e})))}function cn(e,t){let r=new x;return r.subscribe(({hidden:o})=>{e.hidden=o}),da(e,t).pipe(w(o=>r.next(o)),A(()=>r.complete()),m(o=>R({ref:e},o)))}function ha(e,t){let r=H(()=>B([Uo(e),dt(t)])).pipe(m(([{x:o,y:n},i])=>{let{width:s,height:a}=he(e);return{x:o-i.x+s/2,y:n-i.y+a/2}}));return Zt(e).pipe(E(o=>r.pipe(m(n=>({active:o,offset:n})),xe(+!o||1/0))))}function pn(e,t,{target$:r}){let[o,n]=Array.from(e.children);return H(()=>{let i=new x,s=i.pipe(Z(),re(!0));return i.subscribe({next({offset:a}){e.style.setProperty("--md-tooltip-x",`${a.x}px`),e.style.setProperty("--md-tooltip-y",`${a.y}px`)},complete(){e.style.removeProperty("--md-tooltip-x"),e.style.removeProperty("--md-tooltip-y")}}),rr(e).pipe(Y(s)).subscribe(a=>{e.toggleAttribute("data-md-visible",a)}),_(i.pipe(M(({active:a})=>a)),i.pipe(ke(250),M(({active:a})=>!a))).subscribe({next({active:a}){a?e.prepend(o):o.remove()},complete(){e.prepend(o)}}),i.pipe(Ce(16,Oe)).subscribe(({active:a})=>{o.classList.toggle("md-tooltip--active",a)}),i.pipe(Pr(125,Oe),M(()=>!!e.offsetParent),m(()=>e.offsetParent.getBoundingClientRect()),m(({x:a})=>a)).subscribe({next(a){a?e.style.setProperty("--md-tooltip-0",`${-a}px`):e.style.removeProperty("--md-tooltip-0")},complete(){e.style.removeProperty("--md-tooltip-0")}}),h(n,"click").pipe(Y(s),M(a=>!(a.metaKey||a.ctrlKey))).subscribe(a=>{a.stopPropagation(),a.preventDefault()}),h(n,"mousedown").pipe(Y(s),ne(i)).subscribe(([a,{active:c}])=>{var p;if(a.button!==0||a.metaKey||a.ctrlKey)a.preventDefault();else if(c){a.preventDefault();let l=e.parentElement.closest(".md-annotation");l instanceof HTMLElement?l.focus():(p=Re())==null||p.blur()}}),r.pipe(Y(s),M(a=>a===o),ze(125)).subscribe(()=>e.focus()),ha(e,t).pipe(w(a=>i.next(a)),A(()=>i.complete()),m(a=>R({ref:e},a)))})}function Wr(e){return T("div",{class:"md-tooltip",id:e},T("div",{class:"md-tooltip__inner md-typeset"}))}function ln(e,t){if(t=t?`${t}_annotation_${e}`:void 0,t){let r=t?`#${t}`:void 0;return T("aside",{class:"md-annotation",tabIndex:0},Wr(t),T("a",{href:r,class:"md-annotation__index",tabIndex:-1},T("span",{"data-md-annotation-id":e})))}else return T("aside",{class:"md-annotation",tabIndex:0},Wr(t),T("span",{class:"md-annotation__index",tabIndex:-1},T("span",{"data-md-annotation-id":e})))}function mn(e){return T("button",{class:"md-clipboard md-icon",title:be("clipboard.copy"),"data-clipboard-target":`#${e} > code`})}function Nr(e,t){let r=t&2,o=t&1,n=Object.keys(e.terms).filter(c=>!e.terms[c]).reduce((c,p)=>[...c,T("del",null,p)," "],[]).slice(0,-1),i=me(),s=new URL(e.location,i.base);te("search.highlight")&&s.searchParams.set("h",Object.entries(e.terms).filter(([,c])=>c).reduce((c,[p])=>`${c} ${p}`.trim(),""));let{tags:a}=me();return T("a",{href:`${s}`,class:"md-search-result__link",tabIndex:-1},T("article",{class:"md-search-result__article md-typeset","data-md-score":e.score.toFixed(2)},r>0&&T("div",{class:"md-search-result__icon md-icon"}),r>0&&T("h1",null,e.title),r<=0&&T("h2",null,e.title),o>0&&e.text.length>0&&e.text,e.tags&&e.tags.map(c=>{let p=a?c in a?`md-tag-icon md-tag--${a[c]}`:"md-tag-icon":"";return T("span",{class:`md-tag ${p}`},c)}),o>0&&n.length>0&&T("p",{class:"md-search-result__terms"},be("search.result.term.missing"),": ",...n)))}function fn(e){let t=e[0].score,r=[...e],o=me(),n=r.findIndex(l=>!`${new URL(l.location,o.base)}`.includes("#")),[i]=r.splice(n,1),s=r.findIndex(l=>l.scoreNr(l,1)),...c.length?[T("details",{class:"md-search-result__more"},T("summary",{tabIndex:-1},T("div",null,c.length>0&&c.length===1?be("search.result.more.one"):be("search.result.more.other",c.length))),...c.map(l=>Nr(l,1)))]:[]];return T("li",{class:"md-search-result__item"},p)}function un(e){return T("ul",{class:"md-source__facts"},Object.entries(e).map(([t,r])=>T("li",{class:`md-source__fact md-source__fact--${t}`},typeof r=="number"?tr(r):r)))}function Ur(e){let t=`tabbed-control tabbed-control--${e}`;return T("div",{class:t,hidden:!0},T("button",{class:"tabbed-button",tabIndex:-1,"aria-hidden":"true"}))}function dn(e){return T("div",{class:"md-typeset__scrollwrap"},T("div",{class:"md-typeset__table"},e))}function ba(e){let t=me(),r=new URL(`../${e.version}/`,t.base);return T("li",{class:"md-version__item"},T("a",{href:`${r}`,class:"md-version__link"},e.title))}function hn(e,t){return T("div",{class:"md-version"},T("button",{class:"md-version__current","aria-label":be("select.version")},t.title),T("ul",{class:"md-version__list"},e.map(ba)))}function va(e){return e.tagName==="CODE"?q(".c, .c1, .cm",e):[e]}function ga(e){let t=[];for(let r of va(e)){let o=[],n=document.createNodeIterator(r,NodeFilter.SHOW_TEXT);for(let i=n.nextNode();i;i=n.nextNode())o.push(i);for(let i of o){let s;for(;s=/(\(\d+\))(!)?/.exec(i.textContent);){let[,a,c]=s;if(typeof c=="undefined"){let p=i.splitText(s.index);i=p.splitText(a.length),t.push(p)}else{i.textContent=a,t.push(i);break}}}}return t}function bn(e,t){t.append(...Array.from(e.childNodes))}function sr(e,t,{target$:r,print$:o}){let n=t.closest("[id]"),i=n==null?void 0:n.id,s=new Map;for(let a of ga(t)){let[,c]=a.textContent.match(/\((\d+)\)/);se(`:scope > li:nth-child(${c})`,e)&&(s.set(c,ln(c,i)),a.replaceWith(s.get(c)))}return s.size===0?L:H(()=>{let a=new x,c=a.pipe(Z(),re(!0)),p=[];for(let[l,f]of s)p.push([U(".md-typeset",f),U(`:scope > li:nth-child(${l})`,e)]);return o.pipe(Y(c)).subscribe(l=>{e.hidden=!l,e.classList.toggle("md-annotation-list",l);for(let[f,u]of p)l?bn(f,u):bn(u,f)}),_(...[...s].map(([,l])=>pn(l,t,{target$:r}))).pipe(A(()=>a.complete()),le())})}function vn(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return vn(t)}}function gn(e,t){return H(()=>{let r=vn(e);return typeof r!="undefined"?sr(r,e,t):L})}var yn=Ht(Vr());var xa=0;function En(e){if(e.nextElementSibling){let t=e.nextElementSibling;if(t.tagName==="OL")return t;if(t.tagName==="P"&&!t.children.length)return En(t)}}function xn(e){return ye(e).pipe(m(({width:t})=>({scrollable:bt(e).width>t})),ee("scrollable"))}function wn(e,t){let{matches:r}=matchMedia("(hover)"),o=H(()=>{let n=new x;if(n.subscribe(({scrollable:s})=>{s&&r?e.setAttribute("tabindex","0"):e.removeAttribute("tabindex")}),yn.default.isSupported()&&(e.closest(".copy")||te("content.code.copy")&&!e.closest(".no-copy"))){let s=e.closest("pre");s.id=`__code_${xa++}`,s.insertBefore(mn(s.id),e)}let i=e.closest(".highlight");if(i instanceof HTMLElement){let s=En(i);if(typeof s!="undefined"&&(i.classList.contains("annotate")||te("content.code.annotate"))){let a=sr(s,e,t);return xn(e).pipe(w(c=>n.next(c)),A(()=>n.complete()),m(c=>R({ref:e},c)),qe(ye(i).pipe(m(({width:c,height:p})=>c&&p),X(),E(c=>c?a:L))))}}return xn(e).pipe(w(s=>n.next(s)),A(()=>n.complete()),m(s=>R({ref:e},s)))});return te("content.lazy")?rr(e).pipe(M(n=>n),xe(1),E(()=>o)):o}function ya(e,{target$:t,print$:r}){let o=!0;return _(t.pipe(m(n=>n.closest("details:not([open])")),M(n=>e===n),m(()=>({action:"open",reveal:!0}))),r.pipe(M(n=>n||!o),w(()=>o=e.open),m(n=>({action:n?"open":"close"}))))}function Sn(e,t){return H(()=>{let r=new x;return r.subscribe(({action:o,reveal:n})=>{e.toggleAttribute("open",o==="open"),n&&e.scrollIntoView()}),ya(e,t).pipe(w(o=>r.next(o)),A(()=>r.complete()),m(o=>R({ref:e},o)))})}var Tn=".node circle,.node ellipse,.node path,.node polygon,.node rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}marker{fill:var(--md-mermaid-edge-color)!important}.edgeLabel .label rect{fill:#0000}.label{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.label foreignObject{line-height:normal;overflow:visible}.label div .edgeLabel{color:var(--md-mermaid-label-fg-color)}.edgeLabel,.edgeLabel rect,.label div .edgeLabel{background-color:var(--md-mermaid-label-bg-color)}.edgeLabel,.edgeLabel rect{fill:var(--md-mermaid-label-bg-color);color:var(--md-mermaid-edge-color)}.edgePath .path,.flowchart-link{stroke:var(--md-mermaid-edge-color);stroke-width:.05rem}.edgePath .arrowheadPath{fill:var(--md-mermaid-edge-color);stroke:none}.cluster rect{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}.cluster span{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}g #flowchart-circleEnd,g #flowchart-circleStart,g #flowchart-crossEnd,g #flowchart-crossStart,g #flowchart-pointEnd,g #flowchart-pointStart{stroke:none}g.classGroup line,g.classGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.classGroup text{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.classLabel .box{fill:var(--md-mermaid-label-bg-color);background-color:var(--md-mermaid-label-bg-color);opacity:1}.classLabel .label{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.node .divider{stroke:var(--md-mermaid-node-fg-color)}.relation{stroke:var(--md-mermaid-edge-color)}.cardinality{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.cardinality text{fill:inherit!important}defs #classDiagram-compositionEnd,defs #classDiagram-compositionStart,defs #classDiagram-dependencyEnd,defs #classDiagram-dependencyStart,defs #classDiagram-extensionEnd,defs #classDiagram-extensionStart{fill:var(--md-mermaid-edge-color)!important;stroke:var(--md-mermaid-edge-color)!important}defs #classDiagram-aggregationEnd,defs #classDiagram-aggregationStart{fill:var(--md-mermaid-label-bg-color)!important;stroke:var(--md-mermaid-edge-color)!important}g.stateGroup rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}g.stateGroup .state-title{fill:var(--md-mermaid-label-fg-color)!important;font-family:var(--md-mermaid-font-family)}g.stateGroup .composit{fill:var(--md-mermaid-label-bg-color)}.nodeLabel{color:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.node circle.state-end,.node circle.state-start,.start-state{fill:var(--md-mermaid-edge-color);stroke:none}.end-state-inner,.end-state-outer{fill:var(--md-mermaid-edge-color)}.end-state-inner,.node circle.state-end{stroke:var(--md-mermaid-label-bg-color)}.transition{stroke:var(--md-mermaid-edge-color)}[id^=state-fork] rect,[id^=state-join] rect{fill:var(--md-mermaid-edge-color)!important;stroke:none!important}.statediagram-cluster.statediagram-cluster .inner{fill:var(--md-default-bg-color)}.statediagram-cluster rect{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.statediagram-state rect.divider{fill:var(--md-default-fg-color--lightest);stroke:var(--md-default-fg-color--lighter)}defs #statediagram-barbEnd{stroke:var(--md-mermaid-edge-color)}.attributeBoxEven,.attributeBoxOdd{fill:var(--md-mermaid-node-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityBox{fill:var(--md-mermaid-label-bg-color);stroke:var(--md-mermaid-node-fg-color)}.entityLabel{fill:var(--md-mermaid-label-fg-color);font-family:var(--md-mermaid-font-family)}.relationshipLabelBox{fill:var(--md-mermaid-label-bg-color);fill-opacity:1;background-color:var(--md-mermaid-label-bg-color);opacity:1}.relationshipLabel{fill:var(--md-mermaid-label-fg-color)}.relationshipLine{stroke:var(--md-mermaid-edge-color)}defs #ONE_OR_MORE_END *,defs #ONE_OR_MORE_START *,defs #ONLY_ONE_END *,defs #ONLY_ONE_START *,defs #ZERO_OR_MORE_END *,defs #ZERO_OR_MORE_START *,defs #ZERO_OR_ONE_END *,defs #ZERO_OR_ONE_START *{stroke:var(--md-mermaid-edge-color)!important}defs #ZERO_OR_MORE_END circle,defs #ZERO_OR_MORE_START circle{fill:var(--md-mermaid-label-bg-color)}.actor{fill:var(--md-mermaid-sequence-actor-bg-color);stroke:var(--md-mermaid-sequence-actor-border-color)}text.actor>tspan{fill:var(--md-mermaid-sequence-actor-fg-color);font-family:var(--md-mermaid-font-family)}line{stroke:var(--md-mermaid-sequence-actor-line-color)}.actor-man circle,.actor-man line{fill:var(--md-mermaid-sequence-actorman-bg-color);stroke:var(--md-mermaid-sequence-actorman-line-color)}.messageLine0,.messageLine1{stroke:var(--md-mermaid-sequence-message-line-color)}.note{fill:var(--md-mermaid-sequence-note-bg-color);stroke:var(--md-mermaid-sequence-note-border-color)}.loopText,.loopText>tspan,.messageText,.noteText>tspan{stroke:none;font-family:var(--md-mermaid-font-family)!important}.messageText{fill:var(--md-mermaid-sequence-message-fg-color)}.loopText,.loopText>tspan{fill:var(--md-mermaid-sequence-loop-fg-color)}.noteText>tspan{fill:var(--md-mermaid-sequence-note-fg-color)}#arrowhead path{fill:var(--md-mermaid-sequence-message-line-color);stroke:none}.loopLine{fill:var(--md-mermaid-sequence-loop-bg-color);stroke:var(--md-mermaid-sequence-loop-border-color)}.labelBox{fill:var(--md-mermaid-sequence-label-bg-color);stroke:none}.labelText,.labelText>span{fill:var(--md-mermaid-sequence-label-fg-color);font-family:var(--md-mermaid-font-family)}.sequenceNumber{fill:var(--md-mermaid-sequence-number-fg-color)}rect.rect{fill:var(--md-mermaid-sequence-box-bg-color);stroke:none}rect.rect+text.text{fill:var(--md-mermaid-sequence-box-fg-color)}defs #sequencenumber{fill:var(--md-mermaid-sequence-number-bg-color)!important}";var zr,wa=0;function Sa(){return typeof mermaid=="undefined"||mermaid instanceof Element?ht("https://unpkg.com/mermaid@9.4.3/dist/mermaid.min.js"):j(void 0)}function On(e){return e.classList.remove("mermaid"),zr||(zr=Sa().pipe(w(()=>mermaid.initialize({startOnLoad:!1,themeCSS:Tn,sequence:{actorFontSize:"16px",messageFontSize:"16px",noteFontSize:"16px"}})),m(()=>{}),J(1))),zr.subscribe(()=>{e.classList.add("mermaid");let t=`__mermaid_${wa++}`,r=T("div",{class:"mermaid"}),o=e.textContent;mermaid.mermaidAPI.render(t,o,(n,i)=>{let s=r.attachShadow({mode:"closed"});s.innerHTML=n,e.replaceWith(r),i==null||i(s)})}),zr.pipe(m(()=>({ref:e})))}var Mn=T("table");function Ln(e){return e.replaceWith(Mn),Mn.replaceWith(dn(e)),j({ref:e})}function Ta(e){let t=q(":scope > input",e),r=t.find(o=>o.checked)||t[0];return _(...t.map(o=>h(o,"change").pipe(m(()=>U(`label[for="${o.id}"]`))))).pipe(V(U(`label[for="${r.id}"]`)),m(o=>({active:o})))}function _n(e,{viewport$:t}){let r=Ur("prev");e.append(r);let o=Ur("next");e.append(o);let n=U(".tabbed-labels",e);return H(()=>{let i=new x,s=i.pipe(Z(),re(!0));return B([i,ye(e)]).pipe(Ce(1,Oe),Y(s)).subscribe({next([{active:a},c]){let p=Je(a),{width:l}=he(a);e.style.setProperty("--md-indicator-x",`${p.x}px`),e.style.setProperty("--md-indicator-width",`${l}px`);let f=er(n);(p.xf.x+c.width)&&n.scrollTo({left:Math.max(0,p.x-16),behavior:"smooth"})},complete(){e.style.removeProperty("--md-indicator-x"),e.style.removeProperty("--md-indicator-width")}}),B([dt(n),ye(n)]).pipe(Y(s)).subscribe(([a,c])=>{let p=bt(n);r.hidden=a.x<16,o.hidden=a.x>p.width-c.width-16}),_(h(r,"click").pipe(m(()=>-1)),h(o,"click").pipe(m(()=>1))).pipe(Y(s)).subscribe(a=>{let{width:c}=he(n);n.scrollBy({left:c*a,behavior:"smooth"})}),te("content.tabs.link")&&i.pipe(je(1),ne(t)).subscribe(([{active:a},{offset:c}])=>{let p=a.innerText.trim();if(a.hasAttribute("data-md-switching"))a.removeAttribute("data-md-switching");else{let l=e.offsetTop-c.y;for(let u of q("[data-tabs]"))for(let d of q(":scope > input",u)){let v=U(`label[for="${d.id}"]`);if(v!==a&&v.innerText.trim()===p){v.setAttribute("data-md-switching",""),d.click();break}}window.scrollTo({top:e.offsetTop-l});let f=__md_get("__tabs")||[];__md_set("__tabs",[...new Set([p,...f])])}}),i.pipe(Y(s)).subscribe(()=>{for(let a of q("audio, video",e))a.pause()}),Ta(e).pipe(w(a=>i.next(a)),A(()=>i.complete()),m(a=>R({ref:e},a)))}).pipe(rt(ae))}function An(e,{viewport$:t,target$:r,print$:o}){return _(...q(".annotate:not(.highlight)",e).map(n=>gn(n,{target$:r,print$:o})),...q("pre:not(.mermaid) > code",e).map(n=>wn(n,{target$:r,print$:o})),...q("pre.mermaid",e).map(n=>On(n)),...q("table:not([class])",e).map(n=>Ln(n)),...q("details",e).map(n=>Sn(n,{target$:r,print$:o})),...q("[data-tabs]",e).map(n=>_n(n,{viewport$:t})))}function Oa(e,{alert$:t}){return t.pipe(E(r=>_(j(!0),j(!1).pipe(ze(2e3))).pipe(m(o=>({message:r,active:o})))))}function Cn(e,t){let r=U(".md-typeset",e);return H(()=>{let o=new x;return o.subscribe(({message:n,active:i})=>{e.classList.toggle("md-dialog--active",i),r.textContent=n}),Oa(e,t).pipe(w(n=>o.next(n)),A(()=>o.complete()),m(n=>R({ref:e},n)))})}function Ma({viewport$:e}){if(!te("header.autohide"))return j(!1);let t=e.pipe(m(({offset:{y:n}})=>n),Le(2,1),m(([n,i])=>[nMath.abs(i-n.y)>100),m(([,[n]])=>n),X()),o=We("search");return B([e,o]).pipe(m(([{offset:n},i])=>n.y>400&&!i),X(),E(n=>n?r:j(!1)),V(!1))}function kn(e,t){return H(()=>B([ye(e),Ma(t)])).pipe(m(([{height:r},o])=>({height:r,hidden:o})),X((r,o)=>r.height===o.height&&r.hidden===o.hidden),J(1))}function Hn(e,{header$:t,main$:r}){return H(()=>{let o=new x,n=o.pipe(Z(),re(!0));return o.pipe(ee("active"),Ge(t)).subscribe(([{active:i},{hidden:s}])=>{e.classList.toggle("md-header--shadow",i&&!s),e.hidden=s}),r.subscribe(o),t.pipe(Y(n),m(i=>R({ref:e},i)))})}function La(e,{viewport$:t,header$:r}){return ar(e,{viewport$:t,header$:r}).pipe(m(({offset:{y:o}})=>{let{height:n}=he(e);return{active:o>=n}}),ee("active"))}function $n(e,t){return H(()=>{let r=new x;r.subscribe({next({active:n}){e.classList.toggle("md-header__title--active",n)},complete(){e.classList.remove("md-header__title--active")}});let o=se(".md-content h1");return typeof o=="undefined"?L:La(o,t).pipe(w(n=>r.next(n)),A(()=>r.complete()),m(n=>R({ref:e},n)))})}function Rn(e,{viewport$:t,header$:r}){let o=r.pipe(m(({height:i})=>i),X()),n=o.pipe(E(()=>ye(e).pipe(m(({height:i})=>({top:e.offsetTop,bottom:e.offsetTop+i})),ee("bottom"))));return B([o,n,t]).pipe(m(([i,{top:s,bottom:a},{offset:{y:c},size:{height:p}}])=>(p=Math.max(0,p-Math.max(0,s-c,i)-Math.max(0,p+c-a)),{offset:s-i,height:p,active:s-i<=c})),X((i,s)=>i.offset===s.offset&&i.height===s.height&&i.active===s.active))}function _a(e){let t=__md_get("__palette")||{index:e.findIndex(r=>matchMedia(r.getAttribute("data-md-color-media")).matches)};return j(...e).pipe(ce(r=>h(r,"change").pipe(m(()=>r))),V(e[Math.max(0,t.index)]),m(r=>({index:e.indexOf(r),color:{scheme:r.getAttribute("data-md-color-scheme"),primary:r.getAttribute("data-md-color-primary"),accent:r.getAttribute("data-md-color-accent")}})),J(1))}function Pn(e){let t=T("meta",{name:"theme-color"});document.head.appendChild(t);let r=T("meta",{name:"color-scheme"});return document.head.appendChild(r),H(()=>{let o=new x;o.subscribe(i=>{document.body.setAttribute("data-md-color-switching","");for(let[s,a]of Object.entries(i.color))document.body.setAttribute(`data-md-color-${s}`,a);for(let s=0;s{let i=Ee("header"),s=window.getComputedStyle(i);return r.content=s.colorScheme,s.backgroundColor.match(/\d+/g).map(a=>(+a).toString(16).padStart(2,"0")).join("")})).subscribe(i=>t.content=`#${i}`),o.pipe(Se(ae)).subscribe(()=>{document.body.removeAttribute("data-md-color-switching")});let n=q("input",e);return _a(n).pipe(w(i=>o.next(i)),A(()=>o.complete()),m(i=>R({ref:e},i)))})}function In(e,{progress$:t}){return H(()=>{let r=new x;return r.subscribe(({value:o})=>{e.style.setProperty("--md-progress-value",`${o}`)}),t.pipe(w(o=>r.next({value:o})),A(()=>r.complete()),m(o=>({ref:e,value:o})))})}var qr=Ht(Vr());function Aa(e){e.setAttribute("data-md-copying","");let t=e.closest("[data-copy]"),r=t?t.getAttribute("data-copy"):e.innerText;return e.removeAttribute("data-md-copying"),r.trimEnd()}function Fn({alert$:e}){qr.default.isSupported()&&new P(t=>{new qr.default("[data-clipboard-target], [data-clipboard-text]",{text:r=>r.getAttribute("data-clipboard-text")||Aa(U(r.getAttribute("data-clipboard-target")))}).on("success",r=>t.next(r))}).pipe(w(t=>{t.trigger.focus()}),m(()=>be("clipboard.copied"))).subscribe(e)}function Ca(e){if(e.length<2)return[""];let[t,r]=[...e].sort((n,i)=>n.length-i.length).map(n=>n.replace(/[^/]+$/,"")),o=0;if(t===r)o=t.length;else for(;t.charCodeAt(o)===r.charCodeAt(o);)o++;return e.map(n=>n.replace(t.slice(0,o),""))}function cr(e){let t=__md_get("__sitemap",sessionStorage,e);if(t)return j(t);{let r=me();return Zo(new URL("sitemap.xml",e||r.base)).pipe(m(o=>Ca(q("loc",o).map(n=>n.textContent))),de(()=>L),He([]),w(o=>__md_set("__sitemap",o,sessionStorage,e)))}}function jn(e){let t=se("[rel=canonical]",e);typeof t!="undefined"&&(t.href=t.href.replace("//localhost:","//127.0.0.1:"));let r=new Map;for(let o of q(":scope > *",e)){let n=o.outerHTML;for(let i of["href","src"]){let s=o.getAttribute(i);if(s===null)continue;let a=new URL(s,t==null?void 0:t.href),c=o.cloneNode();c.setAttribute(i,`${a}`),n=c.outerHTML;break}r.set(n,o)}return r}function Wn({location$:e,viewport$:t,progress$:r}){let o=me();if(location.protocol==="file:")return L;let n=cr().pipe(m(l=>l.map(f=>`${new URL(f,o.base)}`))),i=h(document.body,"click").pipe(ne(n),E(([l,f])=>{if(!(l.target instanceof Element))return L;let u=l.target.closest("a");if(u===null)return L;if(u.target||l.metaKey||l.ctrlKey)return L;let d=new URL(u.href);return d.search=d.hash="",f.includes(`${d}`)?(l.preventDefault(),j(new URL(u.href))):L}),le());i.pipe(xe(1)).subscribe(()=>{let l=se("link[rel=icon]");typeof l!="undefined"&&(l.href=l.href)}),h(window,"beforeunload").subscribe(()=>{history.scrollRestoration="auto"}),i.pipe(ne(t)).subscribe(([l,{offset:f}])=>{history.scrollRestoration="manual",history.replaceState(f,""),history.pushState(null,"",l)}),i.subscribe(e);let s=e.pipe(V(pe()),ee("pathname"),je(1),E(l=>ir(l,{progress$:r}).pipe(de(()=>(ot(l,!0),L))))),a=new DOMParser,c=s.pipe(E(l=>l.text()),E(l=>{let f=a.parseFromString(l,"text/html");for(let b of["[data-md-component=announce]","[data-md-component=container]","[data-md-component=header-topic]","[data-md-component=outdated]","[data-md-component=logo]","[data-md-component=skip]",...te("navigation.tabs.sticky")?["[data-md-component=tabs]"]:[]]){let z=se(b),K=se(b,f);typeof z!="undefined"&&typeof K!="undefined"&&z.replaceWith(K)}let u=jn(document.head),d=jn(f.head);for(let[b,z]of d)z.getAttribute("rel")==="stylesheet"||z.hasAttribute("src")||(u.has(b)?u.delete(b):document.head.appendChild(z));for(let b of u.values())b.getAttribute("rel")==="stylesheet"||b.hasAttribute("src")||b.remove();let v=Ee("container");return Fe(q("script",v)).pipe(E(b=>{let z=f.createElement("script");if(b.src){for(let K of b.getAttributeNames())z.setAttribute(K,b.getAttribute(K));return b.replaceWith(z),new P(K=>{z.onload=()=>K.complete()})}else return z.textContent=b.textContent,b.replaceWith(z),L}),Z(),re(f))}),le());return h(window,"popstate").pipe(m(pe)).subscribe(e),e.pipe(V(pe()),Le(2,1),M(([l,f])=>l.pathname===f.pathname&&l.hash!==f.hash),m(([,l])=>l)).subscribe(l=>{var f,u;history.state!==null||!l.hash?window.scrollTo(0,(u=(f=history.state)==null?void 0:f.y)!=null?u:0):(history.scrollRestoration="auto",nr(l.hash),history.scrollRestoration="manual")}),e.pipe(Cr(i),V(pe()),Le(2,1),M(([l,f])=>l.pathname===f.pathname&&l.hash===f.hash),m(([,l])=>l)).subscribe(l=>{history.scrollRestoration="auto",nr(l.hash),history.scrollRestoration="manual",history.back()}),c.pipe(ne(e)).subscribe(([,l])=>{var f,u;history.state!==null||!l.hash?window.scrollTo(0,(u=(f=history.state)==null?void 0:f.y)!=null?u:0):nr(l.hash)}),t.pipe(ee("offset"),ke(100)).subscribe(({offset:l})=>{history.replaceState(l,"")}),c}var Dn=Ht(Un());function Vn(e){let t=e.separator.split("|").map(n=>n.replace(/(\(\?[!=<][^)]+\))/g,"").length===0?"\uFFFD":n).join("|"),r=new RegExp(t,"img"),o=(n,i,s)=>`${i}${s}`;return n=>{n=n.replace(/[\s*+\-:~^]+/g," ").trim();let i=new RegExp(`(^|${e.separator}|)(${n.replace(/[|\\{}()[\]^$+*?.-]/g,"\\$&").replace(r,"|")})`,"img");return s=>(0,Dn.default)(s).replace(i,o).replace(/<\/mark>(\s+)]*>/img,"$1")}}function Mt(e){return e.type===1}function pr(e){return e.type===3}function zn(e,t){let r=an(e);return _(j(location.protocol!=="file:"),We("search")).pipe($e(o=>o),E(()=>t)).subscribe(({config:o,docs:n})=>r.next({type:0,data:{config:o,docs:n,options:{suggest:te("search.suggest")}}})),r}function qn({document$:e}){let t=me(),r=Ne(new URL("../versions.json",t.base)).pipe(de(()=>L)),o=r.pipe(m(n=>{let[,i]=t.base.match(/([^/]+)\/?$/);return n.find(({version:s,aliases:a})=>s===i||a.includes(i))||n[0]}));r.pipe(m(n=>new Map(n.map(i=>[`${new URL(`../${i.version}/`,t.base)}`,i]))),E(n=>h(document.body,"click").pipe(M(i=>!i.metaKey&&!i.ctrlKey),ne(o),E(([i,s])=>{if(i.target instanceof Element){let a=i.target.closest("a");if(a&&!a.target&&n.has(a.href)){let c=a.href;return!i.target.closest(".md-version")&&n.get(c)===s?L:(i.preventDefault(),j(c))}}return L}),E(i=>{let{version:s}=n.get(i);return cr(new URL(i)).pipe(m(a=>{let p=pe().href.replace(t.base,"");return a.includes(p.split("#")[0])?new URL(`../${s}/${p}`,t.base):new URL(i)}))})))).subscribe(n=>ot(n,!0)),B([r,o]).subscribe(([n,i])=>{U(".md-header__topic").appendChild(hn(n,i))}),e.pipe(E(()=>o)).subscribe(n=>{var s;let i=__md_get("__outdated",sessionStorage);if(i===null){i=!0;let a=((s=t.version)==null?void 0:s.default)||"latest";Array.isArray(a)||(a=[a]);e:for(let c of a)for(let p of n.aliases.concat(n.version))if(new RegExp(c,"i").test(p)){i=!1;break e}__md_set("__outdated",i,sessionStorage)}if(i)for(let a of oe("outdated"))a.hidden=!1})}function Pa(e,{worker$:t}){let{searchParams:r}=pe();r.has("q")&&(Ke("search",!0),e.value=r.get("q"),e.focus(),We("search").pipe($e(i=>!i)).subscribe(()=>{let i=pe();i.searchParams.delete("q"),history.replaceState({},"",`${i}`)}));let o=Zt(e),n=_(t.pipe($e(Mt)),h(e,"keyup"),o).pipe(m(()=>e.value),X());return B([n,o]).pipe(m(([i,s])=>({value:i,focus:s})),J(1))}function Kn(e,{worker$:t}){let r=new x,o=r.pipe(Z(),re(!0));B([t.pipe($e(Mt)),r],(i,s)=>s).pipe(ee("value")).subscribe(({value:i})=>t.next({type:2,data:i})),r.pipe(ee("focus")).subscribe(({focus:i})=>{i&&Ke("search",i)}),h(e.form,"reset").pipe(Y(o)).subscribe(()=>e.focus());let n=U("header [for=__search]");return h(n,"click").subscribe(()=>e.focus()),Pa(e,{worker$:t}).pipe(w(i=>r.next(i)),A(()=>r.complete()),m(i=>R({ref:e},i)),J(1))}function Qn(e,{worker$:t,query$:r}){let o=new x,n=Ko(e.parentElement).pipe(M(Boolean)),i=e.parentElement,s=U(":scope > :first-child",e),a=U(":scope > :last-child",e);We("search").subscribe(l=>a.setAttribute("role",l?"list":"presentation")),o.pipe(ne(r),$r(t.pipe($e(Mt)))).subscribe(([{items:l},{value:f}])=>{switch(l.length){case 0:s.textContent=f.length?be("search.result.none"):be("search.result.placeholder");break;case 1:s.textContent=be("search.result.one");break;default:let u=tr(l.length);s.textContent=be("search.result.other",u)}});let c=o.pipe(w(()=>a.innerHTML=""),E(({items:l})=>_(j(...l.slice(0,10)),j(...l.slice(10)).pipe(Le(4),Ir(n),E(([f])=>f)))),m(fn),le());return c.subscribe(l=>a.appendChild(l)),c.pipe(ce(l=>{let f=se("details",l);return typeof f=="undefined"?L:h(f,"toggle").pipe(Y(o),m(()=>f))})).subscribe(l=>{l.open===!1&&l.offsetTop<=i.scrollTop&&i.scrollTo({top:l.offsetTop})}),t.pipe(M(pr),m(({data:l})=>l)).pipe(w(l=>o.next(l)),A(()=>o.complete()),m(l=>R({ref:e},l)))}function Ia(e,{query$:t}){return t.pipe(m(({value:r})=>{let o=pe();return o.hash="",r=r.replace(/\s+/g,"+").replace(/&/g,"%26").replace(/=/g,"%3D"),o.search=`q=${r}`,{url:o}}))}function Yn(e,t){let r=new x,o=r.pipe(Z(),re(!0));return r.subscribe(({url:n})=>{e.setAttribute("data-clipboard-text",e.href),e.href=`${n}`}),h(e,"click").pipe(Y(o)).subscribe(n=>n.preventDefault()),Ia(e,t).pipe(w(n=>r.next(n)),A(()=>r.complete()),m(n=>R({ref:e},n)))}function Bn(e,{worker$:t,keyboard$:r}){let o=new x,n=Ee("search-query"),i=_(h(n,"keydown"),h(n,"focus")).pipe(Se(ae),m(()=>n.value),X());return o.pipe(Ge(i),m(([{suggest:a},c])=>{let p=c.split(/([\s-]+)/);if(a!=null&&a.length&&p[p.length-1]){let l=a[a.length-1];l.startsWith(p[p.length-1])&&(p[p.length-1]=l)}else p.length=0;return p})).subscribe(a=>e.innerHTML=a.join("").replace(/\s/g," ")),r.pipe(M(({mode:a})=>a==="search")).subscribe(a=>{switch(a.type){case"ArrowRight":e.innerText.length&&n.selectionStart===n.value.length&&(n.value=e.innerText);break}}),t.pipe(M(pr),m(({data:a})=>a)).pipe(w(a=>o.next(a)),A(()=>o.complete()),m(()=>({ref:e})))}function Gn(e,{index$:t,keyboard$:r}){let o=me();try{let n=zn(o.search,t),i=Ee("search-query",e),s=Ee("search-result",e);h(e,"click").pipe(M(({target:c})=>c instanceof Element&&!!c.closest("a"))).subscribe(()=>Ke("search",!1)),r.pipe(M(({mode:c})=>c==="search")).subscribe(c=>{let p=Re();switch(c.type){case"Enter":if(p===i){let l=new Map;for(let f of q(":first-child [href]",s)){let u=f.firstElementChild;l.set(f,parseFloat(u.getAttribute("data-md-score")))}if(l.size){let[[f]]=[...l].sort(([,u],[,d])=>d-u);f.click()}c.claim()}break;case"Escape":case"Tab":Ke("search",!1),i.blur();break;case"ArrowUp":case"ArrowDown":if(typeof p=="undefined")i.focus();else{let l=[i,...q(":not(details) > [href], summary, details[open] [href]",s)],f=Math.max(0,(Math.max(0,l.indexOf(p))+l.length+(c.type==="ArrowUp"?-1:1))%l.length);l[f].focus()}c.claim();break;default:i!==Re()&&i.focus()}}),r.pipe(M(({mode:c})=>c==="global")).subscribe(c=>{switch(c.type){case"f":case"s":case"/":i.focus(),i.select(),c.claim();break}});let a=Kn(i,{worker$:n});return _(a,Qn(s,{worker$:n,query$:a})).pipe(qe(...oe("search-share",e).map(c=>Yn(c,{query$:a})),...oe("search-suggest",e).map(c=>Bn(c,{worker$:n,keyboard$:r}))))}catch(n){return e.hidden=!0,Ve}}function Jn(e,{index$:t,location$:r}){return B([t,r.pipe(V(pe()),M(o=>!!o.searchParams.get("h")))]).pipe(m(([o,n])=>Vn(o.config)(n.searchParams.get("h"))),m(o=>{var s;let n=new Map,i=document.createNodeIterator(e,NodeFilter.SHOW_TEXT);for(let a=i.nextNode();a;a=i.nextNode())if((s=a.parentElement)!=null&&s.offsetHeight){let c=a.textContent,p=o(c);p.length>c.length&&n.set(a,p)}for(let[a,c]of n){let{childNodes:p}=T("span",null,c);a.replaceWith(...Array.from(p))}return{ref:e,nodes:n}}))}function Fa(e,{viewport$:t,main$:r}){let o=e.closest(".md-grid"),n=o.offsetTop-o.parentElement.offsetTop;return B([r,t]).pipe(m(([{offset:i,height:s},{offset:{y:a}}])=>(s=s+Math.min(n,Math.max(0,a-i))-n,{height:s,locked:a>=i+n})),X((i,s)=>i.height===s.height&&i.locked===s.locked))}function Kr(e,o){var n=o,{header$:t}=n,r=eo(n,["header$"]);let i=U(".md-sidebar__scrollwrap",e),{y:s}=Je(i);return H(()=>{let a=new x,c=a.pipe(Z(),re(!0)),p=a.pipe(Ce(0,Oe));return p.pipe(ne(t)).subscribe({next([{height:l},{height:f}]){i.style.height=`${l-2*s}px`,e.style.top=`${f}px`},complete(){i.style.height="",e.style.top=""}}),p.pipe($e()).subscribe(()=>{for(let l of q(".md-nav__link--active[href]",e)){if(!l.clientHeight)continue;let f=l.closest(".md-sidebar__scrollwrap");if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:d}=he(f);f.scrollTo({top:u-d/2})}}}),ge(q("label[tabindex]",e)).pipe(ce(l=>h(l,"click").pipe(Se(ae),m(()=>l),Y(c)))).subscribe(l=>{let f=U(`[id="${l.htmlFor}"]`);U(`[aria-labelledby="${l.id}"]`).setAttribute("aria-expanded",`${f.checked}`)}),Fa(e,r).pipe(w(l=>a.next(l)),A(()=>a.complete()),m(l=>R({ref:e},l)))})}function Xn(e,t){if(typeof t!="undefined"){let r=`https://api.github.com/repos/${e}/${t}`;return St(Ne(`${r}/releases/latest`).pipe(de(()=>L),m(o=>({version:o.tag_name})),He({})),Ne(r).pipe(de(()=>L),m(o=>({stars:o.stargazers_count,forks:o.forks_count})),He({}))).pipe(m(([o,n])=>R(R({},o),n)))}else{let r=`https://api.github.com/users/${e}`;return Ne(r).pipe(m(o=>({repositories:o.public_repos})),He({}))}}function Zn(e,t){let r=`https://${e}/api/v4/projects/${encodeURIComponent(t)}`;return Ne(r).pipe(de(()=>L),m(({star_count:o,forks_count:n})=>({stars:o,forks:n})),He({}))}function ei(e){let t=e.match(/^.+github\.com\/([^/]+)\/?([^/]+)?/i);if(t){let[,r,o]=t;return Xn(r,o)}if(t=e.match(/^.+?([^/]*gitlab[^/]+)\/(.+?)\/?$/i),t){let[,r,o]=t;return Zn(r,o)}return L}var ja;function Wa(e){return ja||(ja=H(()=>{let t=__md_get("__source",sessionStorage);if(t)return j(t);if(oe("consent").length){let o=__md_get("__consent");if(!(o&&o.github))return L}return ei(e.href).pipe(w(o=>__md_set("__source",o,sessionStorage)))}).pipe(de(()=>L),M(t=>Object.keys(t).length>0),m(t=>({facts:t})),J(1)))}function ti(e){let t=U(":scope > :last-child",e);return H(()=>{let r=new x;return r.subscribe(({facts:o})=>{t.appendChild(un(o)),t.classList.add("md-source__repository--active")}),Wa(e).pipe(w(o=>r.next(o)),A(()=>r.complete()),m(o=>R({ref:e},o)))})}function Na(e,{viewport$:t,header$:r}){return ye(document.body).pipe(E(()=>ar(e,{header$:r,viewport$:t})),m(({offset:{y:o}})=>({hidden:o>=10})),ee("hidden"))}function ri(e,t){return H(()=>{let r=new x;return r.subscribe({next({hidden:o}){e.hidden=o},complete(){e.hidden=!1}}),(te("navigation.tabs.sticky")?j({hidden:!1}):Na(e,t)).pipe(w(o=>r.next(o)),A(()=>r.complete()),m(o=>R({ref:e},o)))})}function Ua(e,{viewport$:t,header$:r}){let o=new Map,n=q("[href^=\\#]",e);for(let a of n){let c=decodeURIComponent(a.hash.substring(1)),p=se(`[id="${c}"]`);typeof p!="undefined"&&o.set(a,p)}let i=r.pipe(ee("height"),m(({height:a})=>{let c=Ee("main"),p=U(":scope > :first-child",c);return a+.8*(p.offsetTop-c.offsetTop)}),le());return ye(document.body).pipe(ee("height"),E(a=>H(()=>{let c=[];return j([...o].reduce((p,[l,f])=>{for(;c.length&&o.get(c[c.length-1]).tagName>=f.tagName;)c.pop();let u=f.offsetTop;for(;!u&&f.parentElement;)f=f.parentElement,u=f.offsetTop;let d=f.offsetParent;for(;d;d=d.offsetParent)u+=d.offsetTop;return p.set([...c=[...c,l]].reverse(),u)},new Map))}).pipe(m(c=>new Map([...c].sort(([,p],[,l])=>p-l))),Ge(i),E(([c,p])=>t.pipe(kr(([l,f],{offset:{y:u},size:d})=>{let v=u+d.height>=Math.floor(a.height);for(;f.length;){let[,b]=f[0];if(b-p=u&&!v)f=[l.pop(),...f];else break}return[l,f]},[[],[...c]]),X((l,f)=>l[0]===f[0]&&l[1]===f[1])))))).pipe(m(([a,c])=>({prev:a.map(([p])=>p),next:c.map(([p])=>p)})),V({prev:[],next:[]}),Le(2,1),m(([a,c])=>a.prev.length{let i=new x,s=i.pipe(Z(),re(!0));if(i.subscribe(({prev:a,next:c})=>{for(let[p]of c)p.classList.remove("md-nav__link--passed"),p.classList.remove("md-nav__link--active");for(let[p,[l]]of a.entries())l.classList.add("md-nav__link--passed"),l.classList.toggle("md-nav__link--active",p===a.length-1)}),te("toc.follow")){let a=_(t.pipe(ke(1),m(()=>{})),t.pipe(ke(250),m(()=>"smooth")));i.pipe(M(({prev:c})=>c.length>0),Ge(o.pipe(Se(ae))),ne(a)).subscribe(([[{prev:c}],p])=>{let[l]=c[c.length-1];if(l.offsetHeight){let f=zo(l);if(typeof f!="undefined"){let u=l.offsetTop-f.offsetTop,{height:d}=he(f);f.scrollTo({top:u-d/2,behavior:p})}}})}return te("navigation.tracking")&&t.pipe(Y(s),ee("offset"),ke(250),je(1),Y(n.pipe(je(1))),Tt({delay:250}),ne(i)).subscribe(([,{prev:a}])=>{let c=pe(),p=a[a.length-1];if(p&&p.length){let[l]=p,{hash:f}=new URL(l.href);c.hash!==f&&(c.hash=f,history.replaceState({},"",`${c}`))}else c.hash="",history.replaceState({},"",`${c}`)}),Ua(e,{viewport$:t,header$:r}).pipe(w(a=>i.next(a)),A(()=>i.complete()),m(a=>R({ref:e},a)))})}function Da(e,{viewport$:t,main$:r,target$:o}){let n=t.pipe(m(({offset:{y:s}})=>s),Le(2,1),m(([s,a])=>s>a&&a>0),X()),i=r.pipe(m(({active:s})=>s));return B([i,n]).pipe(m(([s,a])=>!(s&&a)),X(),Y(o.pipe(je(1))),re(!0),Tt({delay:250}),m(s=>({hidden:s})))}function ni(e,{viewport$:t,header$:r,main$:o,target$:n}){let i=new x,s=i.pipe(Z(),re(!0));return i.subscribe({next({hidden:a}){e.hidden=a,a?(e.setAttribute("tabindex","-1"),e.blur()):e.removeAttribute("tabindex")},complete(){e.style.top="",e.hidden=!0,e.removeAttribute("tabindex")}}),r.pipe(Y(s),ee("height")).subscribe(({height:a})=>{e.style.top=`${a+16}px`}),h(e,"click").subscribe(a=>{a.preventDefault(),window.scrollTo({top:0})}),Da(e,{viewport$:t,main$:o,target$:n}).pipe(w(a=>i.next(a)),A(()=>i.complete()),m(a=>R({ref:e},a)))}function ii({document$:e,tablet$:t}){e.pipe(E(()=>q(".md-toggle--indeterminate")),w(r=>{r.indeterminate=!0,r.checked=!1}),ce(r=>h(r,"change").pipe(Rr(()=>r.classList.contains("md-toggle--indeterminate")),m(()=>r))),ne(t)).subscribe(([r,o])=>{r.classList.remove("md-toggle--indeterminate"),o&&(r.checked=!1)})}function Va(){return/(iPad|iPhone|iPod)/.test(navigator.userAgent)}function ai({document$:e}){e.pipe(E(()=>q("[data-md-scrollfix]")),w(t=>t.removeAttribute("data-md-scrollfix")),M(Va),ce(t=>h(t,"touchstart").pipe(m(()=>t)))).subscribe(t=>{let r=t.scrollTop;r===0?t.scrollTop=1:r+t.offsetHeight===t.scrollHeight&&(t.scrollTop=r-1)})}function si({viewport$:e,tablet$:t}){B([We("search"),t]).pipe(m(([r,o])=>r&&!o),E(r=>j(r).pipe(ze(r?400:100))),ne(e)).subscribe(([r,{offset:{y:o}}])=>{if(r)document.body.setAttribute("data-md-scrolllock",""),document.body.style.top=`-${o}px`;else{let n=-1*parseInt(document.body.style.top,10);document.body.removeAttribute("data-md-scrolllock"),document.body.style.top="",n&&window.scrollTo(0,n)}})}Object.entries||(Object.entries=function(e){let t=[];for(let r of Object.keys(e))t.push([r,e[r]]);return t});Object.values||(Object.values=function(e){let t=[];for(let r of Object.keys(e))t.push(e[r]);return t});typeof Element!="undefined"&&(Element.prototype.scrollTo||(Element.prototype.scrollTo=function(e,t){typeof e=="object"?(this.scrollLeft=e.left,this.scrollTop=e.top):(this.scrollLeft=e,this.scrollTop=t)}),Element.prototype.replaceWith||(Element.prototype.replaceWith=function(...e){let t=this.parentNode;if(t){e.length===0&&t.removeChild(this);for(let r=e.length-1;r>=0;r--){let o=e[r];typeof o=="string"?o=document.createTextNode(o):o.parentNode&&o.parentNode.removeChild(o),r?t.insertBefore(this.previousSibling,o):t.replaceChild(o,this)}}}));function za(){return location.protocol==="file:"?ht(`${new URL("search/search_index.js",Qr.base)}`).pipe(m(()=>__index),J(1)):Ne(new URL("search/search_index.json",Qr.base))}document.documentElement.classList.remove("no-js");document.documentElement.classList.add("js");var nt=No(),_t=Bo(),gt=Jo(_t),Yr=Yo(),Te=nn(),lr=Fr("(min-width: 960px)"),pi=Fr("(min-width: 1220px)"),li=Xo(),Qr=me(),mi=document.forms.namedItem("search")?za():Ve,Br=new x;Fn({alert$:Br});var Gr=new x;te("navigation.instant")&&Wn({location$:_t,viewport$:Te,progress$:Gr}).subscribe(nt);var ci;((ci=Qr.version)==null?void 0:ci.provider)==="mike"&&qn({document$:nt});_(_t,gt).pipe(ze(125)).subscribe(()=>{Ke("drawer",!1),Ke("search",!1)});Yr.pipe(M(({mode:e})=>e==="global")).subscribe(e=>{switch(e.type){case"p":case",":let t=se("link[rel=prev]");typeof t!="undefined"&&ot(t);break;case"n":case".":let r=se("link[rel=next]");typeof r!="undefined"&&ot(r);break;case"Enter":let o=Re();o instanceof HTMLLabelElement&&o.click()}});ii({document$:nt,tablet$:lr});ai({document$:nt});si({viewport$:Te,tablet$:lr});var Xe=kn(Ee("header"),{viewport$:Te}),Lt=nt.pipe(m(()=>Ee("main")),E(e=>Rn(e,{viewport$:Te,header$:Xe})),J(1)),qa=_(...oe("consent").map(e=>cn(e,{target$:gt})),...oe("dialog").map(e=>Cn(e,{alert$:Br})),...oe("header").map(e=>Hn(e,{viewport$:Te,header$:Xe,main$:Lt})),...oe("palette").map(e=>Pn(e)),...oe("progress").map(e=>In(e,{progress$:Gr})),...oe("search").map(e=>Gn(e,{index$:mi,keyboard$:Yr})),...oe("source").map(e=>ti(e))),Ka=H(()=>_(...oe("announce").map(e=>sn(e)),...oe("content").map(e=>An(e,{viewport$:Te,target$:gt,print$:li})),...oe("content").map(e=>te("search.highlight")?Jn(e,{index$:mi,location$:_t}):L),...oe("header-title").map(e=>$n(e,{viewport$:Te,header$:Xe})),...oe("sidebar").map(e=>e.getAttribute("data-md-type")==="navigation"?jr(pi,()=>Kr(e,{viewport$:Te,header$:Xe,main$:Lt})):jr(lr,()=>Kr(e,{viewport$:Te,header$:Xe,main$:Lt}))),...oe("tabs").map(e=>ri(e,{viewport$:Te,header$:Xe})),...oe("toc").map(e=>oi(e,{viewport$:Te,header$:Xe,main$:Lt,target$:gt})),...oe("top").map(e=>ni(e,{viewport$:Te,header$:Xe,main$:Lt,target$:gt})))),fi=nt.pipe(E(()=>Ka),qe(qa),J(1));fi.subscribe();window.document$=nt;window.location$=_t;window.target$=gt;window.keyboard$=Yr;window.viewport$=Te;window.tablet$=lr;window.screen$=pi;window.print$=li;window.alert$=Br;window.progress$=Gr;window.component$=fi;})(); +//# sourceMappingURL=bundle.6c14ae12.min.js.map + diff --git a/assets/javascripts/bundle.6c14ae12.min.js.map b/assets/javascripts/bundle.6c14ae12.min.js.map new file mode 100644 index 00000000..49396ad0 --- /dev/null +++ b/assets/javascripts/bundle.6c14ae12.min.js.map @@ -0,0 +1,7 @@ +{ + "version": 3, + "sources": ["node_modules/focus-visible/dist/focus-visible.js", "node_modules/clipboard/dist/clipboard.js", "node_modules/escape-html/index.js", "src/templates/assets/javascripts/bundle.ts", "node_modules/rxjs/node_modules/tslib/tslib.es6.js", "node_modules/rxjs/src/internal/util/isFunction.ts", "node_modules/rxjs/src/internal/util/createErrorClass.ts", "node_modules/rxjs/src/internal/util/UnsubscriptionError.ts", "node_modules/rxjs/src/internal/util/arrRemove.ts", "node_modules/rxjs/src/internal/Subscription.ts", "node_modules/rxjs/src/internal/config.ts", "node_modules/rxjs/src/internal/scheduler/timeoutProvider.ts", "node_modules/rxjs/src/internal/util/reportUnhandledError.ts", "node_modules/rxjs/src/internal/util/noop.ts", "node_modules/rxjs/src/internal/NotificationFactories.ts", "node_modules/rxjs/src/internal/util/errorContext.ts", "node_modules/rxjs/src/internal/Subscriber.ts", "node_modules/rxjs/src/internal/symbol/observable.ts", "node_modules/rxjs/src/internal/util/identity.ts", "node_modules/rxjs/src/internal/util/pipe.ts", "node_modules/rxjs/src/internal/Observable.ts", "node_modules/rxjs/src/internal/util/lift.ts", "node_modules/rxjs/src/internal/operators/OperatorSubscriber.ts", "node_modules/rxjs/src/internal/scheduler/animationFrameProvider.ts", "node_modules/rxjs/src/internal/util/ObjectUnsubscribedError.ts", "node_modules/rxjs/src/internal/Subject.ts", "node_modules/rxjs/src/internal/scheduler/dateTimestampProvider.ts", "node_modules/rxjs/src/internal/ReplaySubject.ts", "node_modules/rxjs/src/internal/scheduler/Action.ts", "node_modules/rxjs/src/internal/scheduler/intervalProvider.ts", "node_modules/rxjs/src/internal/scheduler/AsyncAction.ts", "node_modules/rxjs/src/internal/Scheduler.ts", "node_modules/rxjs/src/internal/scheduler/AsyncScheduler.ts", "node_modules/rxjs/src/internal/scheduler/async.ts", "node_modules/rxjs/src/internal/scheduler/AnimationFrameAction.ts", "node_modules/rxjs/src/internal/scheduler/AnimationFrameScheduler.ts", "node_modules/rxjs/src/internal/scheduler/animationFrame.ts", "node_modules/rxjs/src/internal/observable/empty.ts", "node_modules/rxjs/src/internal/util/isScheduler.ts", "node_modules/rxjs/src/internal/util/args.ts", "node_modules/rxjs/src/internal/util/isArrayLike.ts", "node_modules/rxjs/src/internal/util/isPromise.ts", "node_modules/rxjs/src/internal/util/isInteropObservable.ts", "node_modules/rxjs/src/internal/util/isAsyncIterable.ts", "node_modules/rxjs/src/internal/util/throwUnobservableError.ts", "node_modules/rxjs/src/internal/symbol/iterator.ts", "node_modules/rxjs/src/internal/util/isIterable.ts", "node_modules/rxjs/src/internal/util/isReadableStreamLike.ts", "node_modules/rxjs/src/internal/observable/innerFrom.ts", "node_modules/rxjs/src/internal/util/executeSchedule.ts", "node_modules/rxjs/src/internal/operators/observeOn.ts", "node_modules/rxjs/src/internal/operators/subscribeOn.ts", "node_modules/rxjs/src/internal/scheduled/scheduleObservable.ts", "node_modules/rxjs/src/internal/scheduled/schedulePromise.ts", "node_modules/rxjs/src/internal/scheduled/scheduleArray.ts", "node_modules/rxjs/src/internal/scheduled/scheduleIterable.ts", "node_modules/rxjs/src/internal/scheduled/scheduleAsyncIterable.ts", "node_modules/rxjs/src/internal/scheduled/scheduleReadableStreamLike.ts", "node_modules/rxjs/src/internal/scheduled/scheduled.ts", "node_modules/rxjs/src/internal/observable/from.ts", "node_modules/rxjs/src/internal/observable/of.ts", "node_modules/rxjs/src/internal/observable/throwError.ts", "node_modules/rxjs/src/internal/util/EmptyError.ts", "node_modules/rxjs/src/internal/util/isDate.ts", "node_modules/rxjs/src/internal/operators/map.ts", "node_modules/rxjs/src/internal/util/mapOneOrManyArgs.ts", "node_modules/rxjs/src/internal/util/argsArgArrayOrObject.ts", "node_modules/rxjs/src/internal/util/createObject.ts", "node_modules/rxjs/src/internal/observable/combineLatest.ts", "node_modules/rxjs/src/internal/operators/mergeInternals.ts", "node_modules/rxjs/src/internal/operators/mergeMap.ts", "node_modules/rxjs/src/internal/operators/mergeAll.ts", "node_modules/rxjs/src/internal/operators/concatAll.ts", "node_modules/rxjs/src/internal/observable/concat.ts", "node_modules/rxjs/src/internal/observable/defer.ts", "node_modules/rxjs/src/internal/observable/fromEvent.ts", "node_modules/rxjs/src/internal/observable/fromEventPattern.ts", "node_modules/rxjs/src/internal/observable/timer.ts", "node_modules/rxjs/src/internal/observable/merge.ts", "node_modules/rxjs/src/internal/observable/never.ts", "node_modules/rxjs/src/internal/util/argsOrArgArray.ts", "node_modules/rxjs/src/internal/operators/filter.ts", "node_modules/rxjs/src/internal/observable/zip.ts", "node_modules/rxjs/src/internal/operators/audit.ts", "node_modules/rxjs/src/internal/operators/auditTime.ts", "node_modules/rxjs/src/internal/operators/bufferCount.ts", "node_modules/rxjs/src/internal/operators/catchError.ts", "node_modules/rxjs/src/internal/operators/scanInternals.ts", "node_modules/rxjs/src/internal/operators/combineLatest.ts", "node_modules/rxjs/src/internal/operators/combineLatestWith.ts", "node_modules/rxjs/src/internal/operators/debounceTime.ts", "node_modules/rxjs/src/internal/operators/defaultIfEmpty.ts", "node_modules/rxjs/src/internal/operators/take.ts", "node_modules/rxjs/src/internal/operators/ignoreElements.ts", "node_modules/rxjs/src/internal/operators/mapTo.ts", "node_modules/rxjs/src/internal/operators/delayWhen.ts", "node_modules/rxjs/src/internal/operators/delay.ts", "node_modules/rxjs/src/internal/operators/distinctUntilChanged.ts", "node_modules/rxjs/src/internal/operators/distinctUntilKeyChanged.ts", "node_modules/rxjs/src/internal/operators/throwIfEmpty.ts", "node_modules/rxjs/src/internal/operators/endWith.ts", "node_modules/rxjs/src/internal/operators/finalize.ts", "node_modules/rxjs/src/internal/operators/first.ts", "node_modules/rxjs/src/internal/operators/merge.ts", "node_modules/rxjs/src/internal/operators/mergeWith.ts", "node_modules/rxjs/src/internal/operators/repeat.ts", "node_modules/rxjs/src/internal/operators/sample.ts", "node_modules/rxjs/src/internal/operators/scan.ts", "node_modules/rxjs/src/internal/operators/share.ts", "node_modules/rxjs/src/internal/operators/shareReplay.ts", "node_modules/rxjs/src/internal/operators/skip.ts", "node_modules/rxjs/src/internal/operators/skipUntil.ts", "node_modules/rxjs/src/internal/operators/startWith.ts", "node_modules/rxjs/src/internal/operators/switchMap.ts", "node_modules/rxjs/src/internal/operators/takeUntil.ts", "node_modules/rxjs/src/internal/operators/takeWhile.ts", "node_modules/rxjs/src/internal/operators/tap.ts", "node_modules/rxjs/src/internal/operators/throttle.ts", "node_modules/rxjs/src/internal/operators/throttleTime.ts", "node_modules/rxjs/src/internal/operators/withLatestFrom.ts", "node_modules/rxjs/src/internal/operators/zip.ts", "node_modules/rxjs/src/internal/operators/zipWith.ts", "src/templates/assets/javascripts/browser/document/index.ts", "src/templates/assets/javascripts/browser/element/_/index.ts", "src/templates/assets/javascripts/browser/element/focus/index.ts", "src/templates/assets/javascripts/browser/element/offset/_/index.ts", "src/templates/assets/javascripts/browser/element/offset/content/index.ts", "src/templates/assets/javascripts/utilities/h/index.ts", "src/templates/assets/javascripts/utilities/round/index.ts", "src/templates/assets/javascripts/browser/script/index.ts", "src/templates/assets/javascripts/browser/element/size/_/index.ts", "src/templates/assets/javascripts/browser/element/size/content/index.ts", "src/templates/assets/javascripts/browser/element/visibility/index.ts", "src/templates/assets/javascripts/browser/toggle/index.ts", "src/templates/assets/javascripts/browser/keyboard/index.ts", "src/templates/assets/javascripts/browser/location/_/index.ts", "src/templates/assets/javascripts/browser/location/hash/index.ts", "src/templates/assets/javascripts/browser/media/index.ts", "src/templates/assets/javascripts/browser/request/index.ts", "src/templates/assets/javascripts/browser/viewport/offset/index.ts", "src/templates/assets/javascripts/browser/viewport/size/index.ts", "src/templates/assets/javascripts/browser/viewport/_/index.ts", "src/templates/assets/javascripts/browser/viewport/at/index.ts", "src/templates/assets/javascripts/browser/worker/index.ts", "src/templates/assets/javascripts/_/index.ts", "src/templates/assets/javascripts/components/_/index.ts", "src/templates/assets/javascripts/components/announce/index.ts", "src/templates/assets/javascripts/components/consent/index.ts", "src/templates/assets/javascripts/components/content/annotation/_/index.ts", "src/templates/assets/javascripts/templates/tooltip/index.tsx", "src/templates/assets/javascripts/templates/annotation/index.tsx", "src/templates/assets/javascripts/templates/clipboard/index.tsx", "src/templates/assets/javascripts/templates/search/index.tsx", "src/templates/assets/javascripts/templates/source/index.tsx", "src/templates/assets/javascripts/templates/tabbed/index.tsx", "src/templates/assets/javascripts/templates/table/index.tsx", "src/templates/assets/javascripts/templates/version/index.tsx", "src/templates/assets/javascripts/components/content/annotation/list/index.ts", "src/templates/assets/javascripts/components/content/annotation/block/index.ts", "src/templates/assets/javascripts/components/content/code/_/index.ts", "src/templates/assets/javascripts/components/content/details/index.ts", "src/templates/assets/javascripts/components/content/mermaid/index.css", "src/templates/assets/javascripts/components/content/mermaid/index.ts", "src/templates/assets/javascripts/components/content/table/index.ts", "src/templates/assets/javascripts/components/content/tabs/index.ts", "src/templates/assets/javascripts/components/content/_/index.ts", "src/templates/assets/javascripts/components/dialog/index.ts", "src/templates/assets/javascripts/components/header/_/index.ts", "src/templates/assets/javascripts/components/header/title/index.ts", "src/templates/assets/javascripts/components/main/index.ts", "src/templates/assets/javascripts/components/palette/index.ts", "src/templates/assets/javascripts/components/progress/index.ts", "src/templates/assets/javascripts/integrations/clipboard/index.ts", "src/templates/assets/javascripts/integrations/sitemap/index.ts", "src/templates/assets/javascripts/integrations/instant/index.ts", "src/templates/assets/javascripts/integrations/search/highlighter/index.ts", "src/templates/assets/javascripts/integrations/search/worker/message/index.ts", "src/templates/assets/javascripts/integrations/search/worker/_/index.ts", "src/templates/assets/javascripts/integrations/version/index.ts", "src/templates/assets/javascripts/components/search/query/index.ts", "src/templates/assets/javascripts/components/search/result/index.ts", "src/templates/assets/javascripts/components/search/share/index.ts", "src/templates/assets/javascripts/components/search/suggest/index.ts", "src/templates/assets/javascripts/components/search/_/index.ts", "src/templates/assets/javascripts/components/search/highlight/index.ts", "src/templates/assets/javascripts/components/sidebar/index.ts", "src/templates/assets/javascripts/components/source/facts/github/index.ts", "src/templates/assets/javascripts/components/source/facts/gitlab/index.ts", "src/templates/assets/javascripts/components/source/facts/_/index.ts", "src/templates/assets/javascripts/components/source/_/index.ts", "src/templates/assets/javascripts/components/tabs/index.ts", "src/templates/assets/javascripts/components/toc/index.ts", "src/templates/assets/javascripts/components/top/index.ts", "src/templates/assets/javascripts/patches/indeterminate/index.ts", "src/templates/assets/javascripts/patches/scrollfix/index.ts", "src/templates/assets/javascripts/patches/scrolllock/index.ts", "src/templates/assets/javascripts/polyfills/index.ts"], + "sourcesContent": ["(function (global, factory) {\n typeof exports === 'object' && typeof module !== 'undefined' ? factory() :\n typeof define === 'function' && define.amd ? define(factory) :\n (factory());\n}(this, (function () { 'use strict';\n\n /**\n * Applies the :focus-visible polyfill at the given scope.\n * A scope in this case is either the top-level Document or a Shadow Root.\n *\n * @param {(Document|ShadowRoot)} scope\n * @see https://github.com/WICG/focus-visible\n */\n function applyFocusVisiblePolyfill(scope) {\n var hadKeyboardEvent = true;\n var hadFocusVisibleRecently = false;\n var hadFocusVisibleRecentlyTimeout = null;\n\n var inputTypesAllowlist = {\n text: true,\n search: true,\n url: true,\n tel: true,\n email: true,\n password: true,\n number: true,\n date: true,\n month: true,\n week: true,\n time: true,\n datetime: true,\n 'datetime-local': true\n };\n\n /**\n * Helper function for legacy browsers and iframes which sometimes focus\n * elements like document, body, and non-interactive SVG.\n * @param {Element} el\n */\n function isValidFocusTarget(el) {\n if (\n el &&\n el !== document &&\n el.nodeName !== 'HTML' &&\n el.nodeName !== 'BODY' &&\n 'classList' in el &&\n 'contains' in el.classList\n ) {\n return true;\n }\n return false;\n }\n\n /**\n * Computes whether the given element should automatically trigger the\n * `focus-visible` class being added, i.e. whether it should always match\n * `:focus-visible` when focused.\n * @param {Element} el\n * @return {boolean}\n */\n function focusTriggersKeyboardModality(el) {\n var type = el.type;\n var tagName = el.tagName;\n\n if (tagName === 'INPUT' && inputTypesAllowlist[type] && !el.readOnly) {\n return true;\n }\n\n if (tagName === 'TEXTAREA' && !el.readOnly) {\n return true;\n }\n\n if (el.isContentEditable) {\n return true;\n }\n\n return false;\n }\n\n /**\n * Add the `focus-visible` class to the given element if it was not added by\n * the author.\n * @param {Element} el\n */\n function addFocusVisibleClass(el) {\n if (el.classList.contains('focus-visible')) {\n return;\n }\n el.classList.add('focus-visible');\n el.setAttribute('data-focus-visible-added', '');\n }\n\n /**\n * Remove the `focus-visible` class from the given element if it was not\n * originally added by the author.\n * @param {Element} el\n */\n function removeFocusVisibleClass(el) {\n if (!el.hasAttribute('data-focus-visible-added')) {\n return;\n }\n el.classList.remove('focus-visible');\n el.removeAttribute('data-focus-visible-added');\n }\n\n /**\n * If the most recent user interaction was via the keyboard;\n * and the key press did not include a meta, alt/option, or control key;\n * then the modality is keyboard. Otherwise, the modality is not keyboard.\n * Apply `focus-visible` to any current active element and keep track\n * of our keyboard modality state with `hadKeyboardEvent`.\n * @param {KeyboardEvent} e\n */\n function onKeyDown(e) {\n if (e.metaKey || e.altKey || e.ctrlKey) {\n return;\n }\n\n if (isValidFocusTarget(scope.activeElement)) {\n addFocusVisibleClass(scope.activeElement);\n }\n\n hadKeyboardEvent = true;\n }\n\n /**\n * If at any point a user clicks with a pointing device, ensure that we change\n * the modality away from keyboard.\n * This avoids the situation where a user presses a key on an already focused\n * element, and then clicks on a different element, focusing it with a\n * pointing device, while we still think we're in keyboard modality.\n * @param {Event} e\n */\n function onPointerDown(e) {\n hadKeyboardEvent = false;\n }\n\n /**\n * On `focus`, add the `focus-visible` class to the target if:\n * - the target received focus as a result of keyboard navigation, or\n * - the event target is an element that will likely require interaction\n * via the keyboard (e.g. a text box)\n * @param {Event} e\n */\n function onFocus(e) {\n // Prevent IE from focusing the document or HTML element.\n if (!isValidFocusTarget(e.target)) {\n return;\n }\n\n if (hadKeyboardEvent || focusTriggersKeyboardModality(e.target)) {\n addFocusVisibleClass(e.target);\n }\n }\n\n /**\n * On `blur`, remove the `focus-visible` class from the target.\n * @param {Event} e\n */\n function onBlur(e) {\n if (!isValidFocusTarget(e.target)) {\n return;\n }\n\n if (\n e.target.classList.contains('focus-visible') ||\n e.target.hasAttribute('data-focus-visible-added')\n ) {\n // To detect a tab/window switch, we look for a blur event followed\n // rapidly by a visibility change.\n // If we don't see a visibility change within 100ms, it's probably a\n // regular focus change.\n hadFocusVisibleRecently = true;\n window.clearTimeout(hadFocusVisibleRecentlyTimeout);\n hadFocusVisibleRecentlyTimeout = window.setTimeout(function() {\n hadFocusVisibleRecently = false;\n }, 100);\n removeFocusVisibleClass(e.target);\n }\n }\n\n /**\n * If the user changes tabs, keep track of whether or not the previously\n * focused element had .focus-visible.\n * @param {Event} e\n */\n function onVisibilityChange(e) {\n if (document.visibilityState === 'hidden') {\n // If the tab becomes active again, the browser will handle calling focus\n // on the element (Safari actually calls it twice).\n // If this tab change caused a blur on an element with focus-visible,\n // re-apply the class when the user switches back to the tab.\n if (hadFocusVisibleRecently) {\n hadKeyboardEvent = true;\n }\n addInitialPointerMoveListeners();\n }\n }\n\n /**\n * Add a group of listeners to detect usage of any pointing devices.\n * These listeners will be added when the polyfill first loads, and anytime\n * the window is blurred, so that they are active when the window regains\n * focus.\n */\n function addInitialPointerMoveListeners() {\n document.addEventListener('mousemove', onInitialPointerMove);\n document.addEventListener('mousedown', onInitialPointerMove);\n document.addEventListener('mouseup', onInitialPointerMove);\n document.addEventListener('pointermove', onInitialPointerMove);\n document.addEventListener('pointerdown', onInitialPointerMove);\n document.addEventListener('pointerup', onInitialPointerMove);\n document.addEventListener('touchmove', onInitialPointerMove);\n document.addEventListener('touchstart', onInitialPointerMove);\n document.addEventListener('touchend', onInitialPointerMove);\n }\n\n function removeInitialPointerMoveListeners() {\n document.removeEventListener('mousemove', onInitialPointerMove);\n document.removeEventListener('mousedown', onInitialPointerMove);\n document.removeEventListener('mouseup', onInitialPointerMove);\n document.removeEventListener('pointermove', onInitialPointerMove);\n document.removeEventListener('pointerdown', onInitialPointerMove);\n document.removeEventListener('pointerup', onInitialPointerMove);\n document.removeEventListener('touchmove', onInitialPointerMove);\n document.removeEventListener('touchstart', onInitialPointerMove);\n document.removeEventListener('touchend', onInitialPointerMove);\n }\n\n /**\n * When the polfyill first loads, assume the user is in keyboard modality.\n * If any event is received from a pointing device (e.g. mouse, pointer,\n * touch), turn off keyboard modality.\n * This accounts for situations where focus enters the page from the URL bar.\n * @param {Event} e\n */\n function onInitialPointerMove(e) {\n // Work around a Safari quirk that fires a mousemove on whenever the\n // window blurs, even if you're tabbing out of the page. \u00AF\\_(\u30C4)_/\u00AF\n if (e.target.nodeName && e.target.nodeName.toLowerCase() === 'html') {\n return;\n }\n\n hadKeyboardEvent = false;\n removeInitialPointerMoveListeners();\n }\n\n // For some kinds of state, we are interested in changes at the global scope\n // only. For example, global pointer input, global key presses and global\n // visibility change should affect the state at every scope:\n document.addEventListener('keydown', onKeyDown, true);\n document.addEventListener('mousedown', onPointerDown, true);\n document.addEventListener('pointerdown', onPointerDown, true);\n document.addEventListener('touchstart', onPointerDown, true);\n document.addEventListener('visibilitychange', onVisibilityChange, true);\n\n addInitialPointerMoveListeners();\n\n // For focus and blur, we specifically care about state changes in the local\n // scope. This is because focus / blur events that originate from within a\n // shadow root are not re-dispatched from the host element if it was already\n // the active element in its own scope:\n scope.addEventListener('focus', onFocus, true);\n scope.addEventListener('blur', onBlur, true);\n\n // We detect that a node is a ShadowRoot by ensuring that it is a\n // DocumentFragment and also has a host property. This check covers native\n // implementation and polyfill implementation transparently. If we only cared\n // about the native implementation, we could just check if the scope was\n // an instance of a ShadowRoot.\n if (scope.nodeType === Node.DOCUMENT_FRAGMENT_NODE && scope.host) {\n // Since a ShadowRoot is a special kind of DocumentFragment, it does not\n // have a root element to add a class to. So, we add this attribute to the\n // host element instead:\n scope.host.setAttribute('data-js-focus-visible', '');\n } else if (scope.nodeType === Node.DOCUMENT_NODE) {\n document.documentElement.classList.add('js-focus-visible');\n document.documentElement.setAttribute('data-js-focus-visible', '');\n }\n }\n\n // It is important to wrap all references to global window and document in\n // these checks to support server-side rendering use cases\n // @see https://github.com/WICG/focus-visible/issues/199\n if (typeof window !== 'undefined' && typeof document !== 'undefined') {\n // Make the polyfill helper globally available. This can be used as a signal\n // to interested libraries that wish to coordinate with the polyfill for e.g.,\n // applying the polyfill to a shadow root:\n window.applyFocusVisiblePolyfill = applyFocusVisiblePolyfill;\n\n // Notify interested libraries of the polyfill's presence, in case the\n // polyfill was loaded lazily:\n var event;\n\n try {\n event = new CustomEvent('focus-visible-polyfill-ready');\n } catch (error) {\n // IE11 does not support using CustomEvent as a constructor directly:\n event = document.createEvent('CustomEvent');\n event.initCustomEvent('focus-visible-polyfill-ready', false, false, {});\n }\n\n window.dispatchEvent(event);\n }\n\n if (typeof document !== 'undefined') {\n // Apply the polyfill to the global document, so that no JavaScript\n // coordination is required to use the polyfill in the top-level document:\n applyFocusVisiblePolyfill(document);\n }\n\n})));\n", "/*!\n * clipboard.js v2.0.11\n * https://clipboardjs.com/\n *\n * Licensed MIT \u00A9 Zeno Rocha\n */\n(function webpackUniversalModuleDefinition(root, factory) {\n\tif(typeof exports === 'object' && typeof module === 'object')\n\t\tmodule.exports = factory();\n\telse if(typeof define === 'function' && define.amd)\n\t\tdefine([], factory);\n\telse if(typeof exports === 'object')\n\t\texports[\"ClipboardJS\"] = factory();\n\telse\n\t\troot[\"ClipboardJS\"] = factory();\n})(this, function() {\nreturn /******/ (function() { // webpackBootstrap\n/******/ \tvar __webpack_modules__ = ({\n\n/***/ 686:\n/***/ (function(__unused_webpack_module, __webpack_exports__, __webpack_require__) {\n\n\"use strict\";\n\n// EXPORTS\n__webpack_require__.d(__webpack_exports__, {\n \"default\": function() { return /* binding */ clipboard; }\n});\n\n// EXTERNAL MODULE: ./node_modules/tiny-emitter/index.js\nvar tiny_emitter = __webpack_require__(279);\nvar tiny_emitter_default = /*#__PURE__*/__webpack_require__.n(tiny_emitter);\n// EXTERNAL MODULE: ./node_modules/good-listener/src/listen.js\nvar listen = __webpack_require__(370);\nvar listen_default = /*#__PURE__*/__webpack_require__.n(listen);\n// EXTERNAL MODULE: ./node_modules/select/src/select.js\nvar src_select = __webpack_require__(817);\nvar select_default = /*#__PURE__*/__webpack_require__.n(src_select);\n;// CONCATENATED MODULE: ./src/common/command.js\n/**\n * Executes a given operation type.\n * @param {String} type\n * @return {Boolean}\n */\nfunction command(type) {\n try {\n return document.execCommand(type);\n } catch (err) {\n return false;\n }\n}\n;// CONCATENATED MODULE: ./src/actions/cut.js\n\n\n/**\n * Cut action wrapper.\n * @param {String|HTMLElement} target\n * @return {String}\n */\n\nvar ClipboardActionCut = function ClipboardActionCut(target) {\n var selectedText = select_default()(target);\n command('cut');\n return selectedText;\n};\n\n/* harmony default export */ var actions_cut = (ClipboardActionCut);\n;// CONCATENATED MODULE: ./src/common/create-fake-element.js\n/**\n * Creates a fake textarea element with a value.\n * @param {String} value\n * @return {HTMLElement}\n */\nfunction createFakeElement(value) {\n var isRTL = document.documentElement.getAttribute('dir') === 'rtl';\n var fakeElement = document.createElement('textarea'); // Prevent zooming on iOS\n\n fakeElement.style.fontSize = '12pt'; // Reset box model\n\n fakeElement.style.border = '0';\n fakeElement.style.padding = '0';\n fakeElement.style.margin = '0'; // Move element out of screen horizontally\n\n fakeElement.style.position = 'absolute';\n fakeElement.style[isRTL ? 'right' : 'left'] = '-9999px'; // Move element to the same position vertically\n\n var yPosition = window.pageYOffset || document.documentElement.scrollTop;\n fakeElement.style.top = \"\".concat(yPosition, \"px\");\n fakeElement.setAttribute('readonly', '');\n fakeElement.value = value;\n return fakeElement;\n}\n;// CONCATENATED MODULE: ./src/actions/copy.js\n\n\n\n/**\n * Create fake copy action wrapper using a fake element.\n * @param {String} target\n * @param {Object} options\n * @return {String}\n */\n\nvar fakeCopyAction = function fakeCopyAction(value, options) {\n var fakeElement = createFakeElement(value);\n options.container.appendChild(fakeElement);\n var selectedText = select_default()(fakeElement);\n command('copy');\n fakeElement.remove();\n return selectedText;\n};\n/**\n * Copy action wrapper.\n * @param {String|HTMLElement} target\n * @param {Object} options\n * @return {String}\n */\n\n\nvar ClipboardActionCopy = function ClipboardActionCopy(target) {\n var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {\n container: document.body\n };\n var selectedText = '';\n\n if (typeof target === 'string') {\n selectedText = fakeCopyAction(target, options);\n } else if (target instanceof HTMLInputElement && !['text', 'search', 'url', 'tel', 'password'].includes(target === null || target === void 0 ? void 0 : target.type)) {\n // If input type doesn't support `setSelectionRange`. Simulate it. https://developer.mozilla.org/en-US/docs/Web/API/HTMLInputElement/setSelectionRange\n selectedText = fakeCopyAction(target.value, options);\n } else {\n selectedText = select_default()(target);\n command('copy');\n }\n\n return selectedText;\n};\n\n/* harmony default export */ var actions_copy = (ClipboardActionCopy);\n;// CONCATENATED MODULE: ./src/actions/default.js\nfunction _typeof(obj) { \"@babel/helpers - typeof\"; if (typeof Symbol === \"function\" && typeof Symbol.iterator === \"symbol\") { _typeof = function _typeof(obj) { return typeof obj; }; } else { _typeof = function _typeof(obj) { return obj && typeof Symbol === \"function\" && obj.constructor === Symbol && obj !== Symbol.prototype ? \"symbol\" : typeof obj; }; } return _typeof(obj); }\n\n\n\n/**\n * Inner function which performs selection from either `text` or `target`\n * properties and then executes copy or cut operations.\n * @param {Object} options\n */\n\nvar ClipboardActionDefault = function ClipboardActionDefault() {\n var options = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {};\n // Defines base properties passed from constructor.\n var _options$action = options.action,\n action = _options$action === void 0 ? 'copy' : _options$action,\n container = options.container,\n target = options.target,\n text = options.text; // Sets the `action` to be performed which can be either 'copy' or 'cut'.\n\n if (action !== 'copy' && action !== 'cut') {\n throw new Error('Invalid \"action\" value, use either \"copy\" or \"cut\"');\n } // Sets the `target` property using an element that will be have its content copied.\n\n\n if (target !== undefined) {\n if (target && _typeof(target) === 'object' && target.nodeType === 1) {\n if (action === 'copy' && target.hasAttribute('disabled')) {\n throw new Error('Invalid \"target\" attribute. Please use \"readonly\" instead of \"disabled\" attribute');\n }\n\n if (action === 'cut' && (target.hasAttribute('readonly') || target.hasAttribute('disabled'))) {\n throw new Error('Invalid \"target\" attribute. You can\\'t cut text from elements with \"readonly\" or \"disabled\" attributes');\n }\n } else {\n throw new Error('Invalid \"target\" value, use a valid Element');\n }\n } // Define selection strategy based on `text` property.\n\n\n if (text) {\n return actions_copy(text, {\n container: container\n });\n } // Defines which selection strategy based on `target` property.\n\n\n if (target) {\n return action === 'cut' ? actions_cut(target) : actions_copy(target, {\n container: container\n });\n }\n};\n\n/* harmony default export */ var actions_default = (ClipboardActionDefault);\n;// CONCATENATED MODULE: ./src/clipboard.js\nfunction clipboard_typeof(obj) { \"@babel/helpers - typeof\"; if (typeof Symbol === \"function\" && typeof Symbol.iterator === \"symbol\") { clipboard_typeof = function _typeof(obj) { return typeof obj; }; } else { clipboard_typeof = function _typeof(obj) { return obj && typeof Symbol === \"function\" && obj.constructor === Symbol && obj !== Symbol.prototype ? \"symbol\" : typeof obj; }; } return clipboard_typeof(obj); }\n\nfunction _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError(\"Cannot call a class as a function\"); } }\n\nfunction _defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if (\"value\" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } }\n\nfunction _createClass(Constructor, protoProps, staticProps) { if (protoProps) _defineProperties(Constructor.prototype, protoProps); if (staticProps) _defineProperties(Constructor, staticProps); return Constructor; }\n\nfunction _inherits(subClass, superClass) { if (typeof superClass !== \"function\" && superClass !== null) { throw new TypeError(\"Super expression must either be null or a function\"); } subClass.prototype = Object.create(superClass && superClass.prototype, { constructor: { value: subClass, writable: true, configurable: true } }); if (superClass) _setPrototypeOf(subClass, superClass); }\n\nfunction _setPrototypeOf(o, p) { _setPrototypeOf = Object.setPrototypeOf || function _setPrototypeOf(o, p) { o.__proto__ = p; return o; }; return _setPrototypeOf(o, p); }\n\nfunction _createSuper(Derived) { var hasNativeReflectConstruct = _isNativeReflectConstruct(); return function _createSuperInternal() { var Super = _getPrototypeOf(Derived), result; if (hasNativeReflectConstruct) { var NewTarget = _getPrototypeOf(this).constructor; result = Reflect.construct(Super, arguments, NewTarget); } else { result = Super.apply(this, arguments); } return _possibleConstructorReturn(this, result); }; }\n\nfunction _possibleConstructorReturn(self, call) { if (call && (clipboard_typeof(call) === \"object\" || typeof call === \"function\")) { return call; } return _assertThisInitialized(self); }\n\nfunction _assertThisInitialized(self) { if (self === void 0) { throw new ReferenceError(\"this hasn't been initialised - super() hasn't been called\"); } return self; }\n\nfunction _isNativeReflectConstruct() { if (typeof Reflect === \"undefined\" || !Reflect.construct) return false; if (Reflect.construct.sham) return false; if (typeof Proxy === \"function\") return true; try { Date.prototype.toString.call(Reflect.construct(Date, [], function () {})); return true; } catch (e) { return false; } }\n\nfunction _getPrototypeOf(o) { _getPrototypeOf = Object.setPrototypeOf ? Object.getPrototypeOf : function _getPrototypeOf(o) { return o.__proto__ || Object.getPrototypeOf(o); }; return _getPrototypeOf(o); }\n\n\n\n\n\n\n/**\n * Helper function to retrieve attribute value.\n * @param {String} suffix\n * @param {Element} element\n */\n\nfunction getAttributeValue(suffix, element) {\n var attribute = \"data-clipboard-\".concat(suffix);\n\n if (!element.hasAttribute(attribute)) {\n return;\n }\n\n return element.getAttribute(attribute);\n}\n/**\n * Base class which takes one or more elements, adds event listeners to them,\n * and instantiates a new `ClipboardAction` on each click.\n */\n\n\nvar Clipboard = /*#__PURE__*/function (_Emitter) {\n _inherits(Clipboard, _Emitter);\n\n var _super = _createSuper(Clipboard);\n\n /**\n * @param {String|HTMLElement|HTMLCollection|NodeList} trigger\n * @param {Object} options\n */\n function Clipboard(trigger, options) {\n var _this;\n\n _classCallCheck(this, Clipboard);\n\n _this = _super.call(this);\n\n _this.resolveOptions(options);\n\n _this.listenClick(trigger);\n\n return _this;\n }\n /**\n * Defines if attributes would be resolved using internal setter functions\n * or custom functions that were passed in the constructor.\n * @param {Object} options\n */\n\n\n _createClass(Clipboard, [{\n key: \"resolveOptions\",\n value: function resolveOptions() {\n var options = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {};\n this.action = typeof options.action === 'function' ? options.action : this.defaultAction;\n this.target = typeof options.target === 'function' ? options.target : this.defaultTarget;\n this.text = typeof options.text === 'function' ? options.text : this.defaultText;\n this.container = clipboard_typeof(options.container) === 'object' ? options.container : document.body;\n }\n /**\n * Adds a click event listener to the passed trigger.\n * @param {String|HTMLElement|HTMLCollection|NodeList} trigger\n */\n\n }, {\n key: \"listenClick\",\n value: function listenClick(trigger) {\n var _this2 = this;\n\n this.listener = listen_default()(trigger, 'click', function (e) {\n return _this2.onClick(e);\n });\n }\n /**\n * Defines a new `ClipboardAction` on each click event.\n * @param {Event} e\n */\n\n }, {\n key: \"onClick\",\n value: function onClick(e) {\n var trigger = e.delegateTarget || e.currentTarget;\n var action = this.action(trigger) || 'copy';\n var text = actions_default({\n action: action,\n container: this.container,\n target: this.target(trigger),\n text: this.text(trigger)\n }); // Fires an event based on the copy operation result.\n\n this.emit(text ? 'success' : 'error', {\n action: action,\n text: text,\n trigger: trigger,\n clearSelection: function clearSelection() {\n if (trigger) {\n trigger.focus();\n }\n\n window.getSelection().removeAllRanges();\n }\n });\n }\n /**\n * Default `action` lookup function.\n * @param {Element} trigger\n */\n\n }, {\n key: \"defaultAction\",\n value: function defaultAction(trigger) {\n return getAttributeValue('action', trigger);\n }\n /**\n * Default `target` lookup function.\n * @param {Element} trigger\n */\n\n }, {\n key: \"defaultTarget\",\n value: function defaultTarget(trigger) {\n var selector = getAttributeValue('target', trigger);\n\n if (selector) {\n return document.querySelector(selector);\n }\n }\n /**\n * Allow fire programmatically a copy action\n * @param {String|HTMLElement} target\n * @param {Object} options\n * @returns Text copied.\n */\n\n }, {\n key: \"defaultText\",\n\n /**\n * Default `text` lookup function.\n * @param {Element} trigger\n */\n value: function defaultText(trigger) {\n return getAttributeValue('text', trigger);\n }\n /**\n * Destroy lifecycle.\n */\n\n }, {\n key: \"destroy\",\n value: function destroy() {\n this.listener.destroy();\n }\n }], [{\n key: \"copy\",\n value: function copy(target) {\n var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {\n container: document.body\n };\n return actions_copy(target, options);\n }\n /**\n * Allow fire programmatically a cut action\n * @param {String|HTMLElement} target\n * @returns Text cutted.\n */\n\n }, {\n key: \"cut\",\n value: function cut(target) {\n return actions_cut(target);\n }\n /**\n * Returns the support of the given action, or all actions if no action is\n * given.\n * @param {String} [action]\n */\n\n }, {\n key: \"isSupported\",\n value: function isSupported() {\n var action = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : ['copy', 'cut'];\n var actions = typeof action === 'string' ? [action] : action;\n var support = !!document.queryCommandSupported;\n actions.forEach(function (action) {\n support = support && !!document.queryCommandSupported(action);\n });\n return support;\n }\n }]);\n\n return Clipboard;\n}((tiny_emitter_default()));\n\n/* harmony default export */ var clipboard = (Clipboard);\n\n/***/ }),\n\n/***/ 828:\n/***/ (function(module) {\n\nvar DOCUMENT_NODE_TYPE = 9;\n\n/**\n * A polyfill for Element.matches()\n */\nif (typeof Element !== 'undefined' && !Element.prototype.matches) {\n var proto = Element.prototype;\n\n proto.matches = proto.matchesSelector ||\n proto.mozMatchesSelector ||\n proto.msMatchesSelector ||\n proto.oMatchesSelector ||\n proto.webkitMatchesSelector;\n}\n\n/**\n * Finds the closest parent that matches a selector.\n *\n * @param {Element} element\n * @param {String} selector\n * @return {Function}\n */\nfunction closest (element, selector) {\n while (element && element.nodeType !== DOCUMENT_NODE_TYPE) {\n if (typeof element.matches === 'function' &&\n element.matches(selector)) {\n return element;\n }\n element = element.parentNode;\n }\n}\n\nmodule.exports = closest;\n\n\n/***/ }),\n\n/***/ 438:\n/***/ (function(module, __unused_webpack_exports, __webpack_require__) {\n\nvar closest = __webpack_require__(828);\n\n/**\n * Delegates event to a selector.\n *\n * @param {Element} element\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @param {Boolean} useCapture\n * @return {Object}\n */\nfunction _delegate(element, selector, type, callback, useCapture) {\n var listenerFn = listener.apply(this, arguments);\n\n element.addEventListener(type, listenerFn, useCapture);\n\n return {\n destroy: function() {\n element.removeEventListener(type, listenerFn, useCapture);\n }\n }\n}\n\n/**\n * Delegates event to a selector.\n *\n * @param {Element|String|Array} [elements]\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @param {Boolean} useCapture\n * @return {Object}\n */\nfunction delegate(elements, selector, type, callback, useCapture) {\n // Handle the regular Element usage\n if (typeof elements.addEventListener === 'function') {\n return _delegate.apply(null, arguments);\n }\n\n // Handle Element-less usage, it defaults to global delegation\n if (typeof type === 'function') {\n // Use `document` as the first parameter, then apply arguments\n // This is a short way to .unshift `arguments` without running into deoptimizations\n return _delegate.bind(null, document).apply(null, arguments);\n }\n\n // Handle Selector-based usage\n if (typeof elements === 'string') {\n elements = document.querySelectorAll(elements);\n }\n\n // Handle Array-like based usage\n return Array.prototype.map.call(elements, function (element) {\n return _delegate(element, selector, type, callback, useCapture);\n });\n}\n\n/**\n * Finds closest match and invokes callback.\n *\n * @param {Element} element\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @return {Function}\n */\nfunction listener(element, selector, type, callback) {\n return function(e) {\n e.delegateTarget = closest(e.target, selector);\n\n if (e.delegateTarget) {\n callback.call(element, e);\n }\n }\n}\n\nmodule.exports = delegate;\n\n\n/***/ }),\n\n/***/ 879:\n/***/ (function(__unused_webpack_module, exports) {\n\n/**\n * Check if argument is a HTML element.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.node = function(value) {\n return value !== undefined\n && value instanceof HTMLElement\n && value.nodeType === 1;\n};\n\n/**\n * Check if argument is a list of HTML elements.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.nodeList = function(value) {\n var type = Object.prototype.toString.call(value);\n\n return value !== undefined\n && (type === '[object NodeList]' || type === '[object HTMLCollection]')\n && ('length' in value)\n && (value.length === 0 || exports.node(value[0]));\n};\n\n/**\n * Check if argument is a string.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.string = function(value) {\n return typeof value === 'string'\n || value instanceof String;\n};\n\n/**\n * Check if argument is a function.\n *\n * @param {Object} value\n * @return {Boolean}\n */\nexports.fn = function(value) {\n var type = Object.prototype.toString.call(value);\n\n return type === '[object Function]';\n};\n\n\n/***/ }),\n\n/***/ 370:\n/***/ (function(module, __unused_webpack_exports, __webpack_require__) {\n\nvar is = __webpack_require__(879);\nvar delegate = __webpack_require__(438);\n\n/**\n * Validates all params and calls the right\n * listener function based on its target type.\n *\n * @param {String|HTMLElement|HTMLCollection|NodeList} target\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listen(target, type, callback) {\n if (!target && !type && !callback) {\n throw new Error('Missing required arguments');\n }\n\n if (!is.string(type)) {\n throw new TypeError('Second argument must be a String');\n }\n\n if (!is.fn(callback)) {\n throw new TypeError('Third argument must be a Function');\n }\n\n if (is.node(target)) {\n return listenNode(target, type, callback);\n }\n else if (is.nodeList(target)) {\n return listenNodeList(target, type, callback);\n }\n else if (is.string(target)) {\n return listenSelector(target, type, callback);\n }\n else {\n throw new TypeError('First argument must be a String, HTMLElement, HTMLCollection, or NodeList');\n }\n}\n\n/**\n * Adds an event listener to a HTML element\n * and returns a remove listener function.\n *\n * @param {HTMLElement} node\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenNode(node, type, callback) {\n node.addEventListener(type, callback);\n\n return {\n destroy: function() {\n node.removeEventListener(type, callback);\n }\n }\n}\n\n/**\n * Add an event listener to a list of HTML elements\n * and returns a remove listener function.\n *\n * @param {NodeList|HTMLCollection} nodeList\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenNodeList(nodeList, type, callback) {\n Array.prototype.forEach.call(nodeList, function(node) {\n node.addEventListener(type, callback);\n });\n\n return {\n destroy: function() {\n Array.prototype.forEach.call(nodeList, function(node) {\n node.removeEventListener(type, callback);\n });\n }\n }\n}\n\n/**\n * Add an event listener to a selector\n * and returns a remove listener function.\n *\n * @param {String} selector\n * @param {String} type\n * @param {Function} callback\n * @return {Object}\n */\nfunction listenSelector(selector, type, callback) {\n return delegate(document.body, selector, type, callback);\n}\n\nmodule.exports = listen;\n\n\n/***/ }),\n\n/***/ 817:\n/***/ (function(module) {\n\nfunction select(element) {\n var selectedText;\n\n if (element.nodeName === 'SELECT') {\n element.focus();\n\n selectedText = element.value;\n }\n else if (element.nodeName === 'INPUT' || element.nodeName === 'TEXTAREA') {\n var isReadOnly = element.hasAttribute('readonly');\n\n if (!isReadOnly) {\n element.setAttribute('readonly', '');\n }\n\n element.select();\n element.setSelectionRange(0, element.value.length);\n\n if (!isReadOnly) {\n element.removeAttribute('readonly');\n }\n\n selectedText = element.value;\n }\n else {\n if (element.hasAttribute('contenteditable')) {\n element.focus();\n }\n\n var selection = window.getSelection();\n var range = document.createRange();\n\n range.selectNodeContents(element);\n selection.removeAllRanges();\n selection.addRange(range);\n\n selectedText = selection.toString();\n }\n\n return selectedText;\n}\n\nmodule.exports = select;\n\n\n/***/ }),\n\n/***/ 279:\n/***/ (function(module) {\n\nfunction E () {\n // Keep this empty so it's easier to inherit from\n // (via https://github.com/lipsmack from https://github.com/scottcorgan/tiny-emitter/issues/3)\n}\n\nE.prototype = {\n on: function (name, callback, ctx) {\n var e = this.e || (this.e = {});\n\n (e[name] || (e[name] = [])).push({\n fn: callback,\n ctx: ctx\n });\n\n return this;\n },\n\n once: function (name, callback, ctx) {\n var self = this;\n function listener () {\n self.off(name, listener);\n callback.apply(ctx, arguments);\n };\n\n listener._ = callback\n return this.on(name, listener, ctx);\n },\n\n emit: function (name) {\n var data = [].slice.call(arguments, 1);\n var evtArr = ((this.e || (this.e = {}))[name] || []).slice();\n var i = 0;\n var len = evtArr.length;\n\n for (i; i < len; i++) {\n evtArr[i].fn.apply(evtArr[i].ctx, data);\n }\n\n return this;\n },\n\n off: function (name, callback) {\n var e = this.e || (this.e = {});\n var evts = e[name];\n var liveEvents = [];\n\n if (evts && callback) {\n for (var i = 0, len = evts.length; i < len; i++) {\n if (evts[i].fn !== callback && evts[i].fn._ !== callback)\n liveEvents.push(evts[i]);\n }\n }\n\n // Remove event from queue to prevent memory leak\n // Suggested by https://github.com/lazd\n // Ref: https://github.com/scottcorgan/tiny-emitter/commit/c6ebfaa9bc973b33d110a84a307742b7cf94c953#commitcomment-5024910\n\n (liveEvents.length)\n ? e[name] = liveEvents\n : delete e[name];\n\n return this;\n }\n};\n\nmodule.exports = E;\nmodule.exports.TinyEmitter = E;\n\n\n/***/ })\n\n/******/ \t});\n/************************************************************************/\n/******/ \t// The module cache\n/******/ \tvar __webpack_module_cache__ = {};\n/******/ \t\n/******/ \t// The require function\n/******/ \tfunction __webpack_require__(moduleId) {\n/******/ \t\t// Check if module is in cache\n/******/ \t\tif(__webpack_module_cache__[moduleId]) {\n/******/ \t\t\treturn __webpack_module_cache__[moduleId].exports;\n/******/ \t\t}\n/******/ \t\t// Create a new module (and put it into the cache)\n/******/ \t\tvar module = __webpack_module_cache__[moduleId] = {\n/******/ \t\t\t// no module.id needed\n/******/ \t\t\t// no module.loaded needed\n/******/ \t\t\texports: {}\n/******/ \t\t};\n/******/ \t\n/******/ \t\t// Execute the module function\n/******/ \t\t__webpack_modules__[moduleId](module, module.exports, __webpack_require__);\n/******/ \t\n/******/ \t\t// Return the exports of the module\n/******/ \t\treturn module.exports;\n/******/ \t}\n/******/ \t\n/************************************************************************/\n/******/ \t/* webpack/runtime/compat get default export */\n/******/ \t!function() {\n/******/ \t\t// getDefaultExport function for compatibility with non-harmony modules\n/******/ \t\t__webpack_require__.n = function(module) {\n/******/ \t\t\tvar getter = module && module.__esModule ?\n/******/ \t\t\t\tfunction() { return module['default']; } :\n/******/ \t\t\t\tfunction() { return module; };\n/******/ \t\t\t__webpack_require__.d(getter, { a: getter });\n/******/ \t\t\treturn getter;\n/******/ \t\t};\n/******/ \t}();\n/******/ \t\n/******/ \t/* webpack/runtime/define property getters */\n/******/ \t!function() {\n/******/ \t\t// define getter functions for harmony exports\n/******/ \t\t__webpack_require__.d = function(exports, definition) {\n/******/ \t\t\tfor(var key in definition) {\n/******/ \t\t\t\tif(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n/******/ \t\t\t\t\tObject.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n/******/ \t\t\t\t}\n/******/ \t\t\t}\n/******/ \t\t};\n/******/ \t}();\n/******/ \t\n/******/ \t/* webpack/runtime/hasOwnProperty shorthand */\n/******/ \t!function() {\n/******/ \t\t__webpack_require__.o = function(obj, prop) { return Object.prototype.hasOwnProperty.call(obj, prop); }\n/******/ \t}();\n/******/ \t\n/************************************************************************/\n/******/ \t// module exports must be returned from runtime so entry inlining is disabled\n/******/ \t// startup\n/******/ \t// Load entry module and return exports\n/******/ \treturn __webpack_require__(686);\n/******/ })()\n.default;\n});", "/*!\n * escape-html\n * Copyright(c) 2012-2013 TJ Holowaychuk\n * Copyright(c) 2015 Andreas Lubbe\n * Copyright(c) 2015 Tiancheng \"Timothy\" Gu\n * MIT Licensed\n */\n\n'use strict';\n\n/**\n * Module variables.\n * @private\n */\n\nvar matchHtmlRegExp = /[\"'&<>]/;\n\n/**\n * Module exports.\n * @public\n */\n\nmodule.exports = escapeHtml;\n\n/**\n * Escape special characters in the given string of html.\n *\n * @param {string} string The string to escape for inserting into HTML\n * @return {string}\n * @public\n */\n\nfunction escapeHtml(string) {\n var str = '' + string;\n var match = matchHtmlRegExp.exec(str);\n\n if (!match) {\n return str;\n }\n\n var escape;\n var html = '';\n var index = 0;\n var lastIndex = 0;\n\n for (index = match.index; index < str.length; index++) {\n switch (str.charCodeAt(index)) {\n case 34: // \"\n escape = '"';\n break;\n case 38: // &\n escape = '&';\n break;\n case 39: // '\n escape = ''';\n break;\n case 60: // <\n escape = '<';\n break;\n case 62: // >\n escape = '>';\n break;\n default:\n continue;\n }\n\n if (lastIndex !== index) {\n html += str.substring(lastIndex, index);\n }\n\n lastIndex = index + 1;\n html += escape;\n }\n\n return lastIndex !== index\n ? html + str.substring(lastIndex, index)\n : html;\n}\n", "/*\n * Copyright (c) 2016-2023 Martin Donath \n *\n * Permission is hereby granted, free of charge, to any person obtaining a copy\n * of this software and associated documentation files (the \"Software\"), to\n * deal in the Software without restriction, including without limitation the\n * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or\n * sell copies of the Software, and to permit persons to whom the Software is\n * furnished to do so, subject to the following conditions:\n *\n * The above copyright notice and this permission notice shall be included in\n * all copies or substantial portions of the Software.\n *\n * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE\n * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\n * IN THE SOFTWARE.\n */\n\nimport \"focus-visible\"\n\nimport {\n EMPTY,\n NEVER,\n Observable,\n Subject,\n defer,\n delay,\n filter,\n map,\n merge,\n mergeWith,\n shareReplay,\n switchMap\n} from \"rxjs\"\n\nimport { configuration, feature } from \"./_\"\nimport {\n at,\n getActiveElement,\n getOptionalElement,\n requestJSON,\n setLocation,\n setToggle,\n watchDocument,\n watchKeyboard,\n watchLocation,\n watchLocationTarget,\n watchMedia,\n watchPrint,\n watchScript,\n watchViewport\n} from \"./browser\"\nimport {\n getComponentElement,\n getComponentElements,\n mountAnnounce,\n mountBackToTop,\n mountConsent,\n mountContent,\n mountDialog,\n mountHeader,\n mountHeaderTitle,\n mountPalette,\n mountProgress,\n mountSearch,\n mountSearchHiglight,\n mountSidebar,\n mountSource,\n mountTableOfContents,\n mountTabs,\n watchHeader,\n watchMain\n} from \"./components\"\nimport {\n SearchIndex,\n setupClipboardJS,\n setupInstantNavigation,\n setupVersionSelector\n} from \"./integrations\"\nimport {\n patchIndeterminate,\n patchScrollfix,\n patchScrolllock\n} from \"./patches\"\nimport \"./polyfills\"\n\n/* ----------------------------------------------------------------------------\n * Functions - @todo refactor\n * ------------------------------------------------------------------------- */\n\n/**\n * Fetch search index\n *\n * @returns Search index observable\n */\nfunction fetchSearchIndex(): Observable {\n if (location.protocol === \"file:\") {\n return watchScript(\n `${new URL(\"search/search_index.js\", config.base)}`\n )\n .pipe(\n // @ts-ignore - @todo fix typings\n map(() => __index),\n shareReplay(1)\n )\n } else {\n return requestJSON(\n new URL(\"search/search_index.json\", config.base)\n )\n }\n}\n\n/* ----------------------------------------------------------------------------\n * Application\n * ------------------------------------------------------------------------- */\n\n/* Yay, JavaScript is available */\ndocument.documentElement.classList.remove(\"no-js\")\ndocument.documentElement.classList.add(\"js\")\n\n/* Set up navigation observables and subjects */\nconst document$ = watchDocument()\nconst location$ = watchLocation()\nconst target$ = watchLocationTarget(location$)\nconst keyboard$ = watchKeyboard()\n\n/* Set up media observables */\nconst viewport$ = watchViewport()\nconst tablet$ = watchMedia(\"(min-width: 960px)\")\nconst screen$ = watchMedia(\"(min-width: 1220px)\")\nconst print$ = watchPrint()\n\n/* Retrieve search index, if search is enabled */\nconst config = configuration()\nconst index$ = document.forms.namedItem(\"search\")\n ? fetchSearchIndex()\n : NEVER\n\n/* Set up Clipboard.js integration */\nconst alert$ = new Subject()\nsetupClipboardJS({ alert$ })\n\n/* Set up progress indicator */\nconst progress$ = new Subject()\n\n/* Set up instant navigation, if enabled */\nif (feature(\"navigation.instant\"))\n setupInstantNavigation({ location$, viewport$, progress$ })\n .subscribe(document$)\n\n/* Set up version selector */\nif (config.version?.provider === \"mike\")\n setupVersionSelector({ document$ })\n\n/* Always close drawer and search on navigation */\nmerge(location$, target$)\n .pipe(\n delay(125)\n )\n .subscribe(() => {\n setToggle(\"drawer\", false)\n setToggle(\"search\", false)\n })\n\n/* Set up global keyboard handlers */\nkeyboard$\n .pipe(\n filter(({ mode }) => mode === \"global\")\n )\n .subscribe(key => {\n switch (key.type) {\n\n /* Go to previous page */\n case \"p\":\n case \",\":\n const prev = getOptionalElement(\"link[rel=prev]\")\n if (typeof prev !== \"undefined\")\n setLocation(prev)\n break\n\n /* Go to next page */\n case \"n\":\n case \".\":\n const next = getOptionalElement(\"link[rel=next]\")\n if (typeof next !== \"undefined\")\n setLocation(next)\n break\n\n /* Expand navigation, see https://bit.ly/3ZjG5io */\n case \"Enter\":\n const active = getActiveElement()\n if (active instanceof HTMLLabelElement)\n active.click()\n }\n })\n\n/* Set up patches */\npatchIndeterminate({ document$, tablet$ })\npatchScrollfix({ document$ })\npatchScrolllock({ viewport$, tablet$ })\n\n/* Set up header and main area observable */\nconst header$ = watchHeader(getComponentElement(\"header\"), { viewport$ })\nconst main$ = document$\n .pipe(\n map(() => getComponentElement(\"main\")),\n switchMap(el => watchMain(el, { viewport$, header$ })),\n shareReplay(1)\n )\n\n/* Set up control component observables */\nconst control$ = merge(\n\n /* Consent */\n ...getComponentElements(\"consent\")\n .map(el => mountConsent(el, { target$ })),\n\n /* Dialog */\n ...getComponentElements(\"dialog\")\n .map(el => mountDialog(el, { alert$ })),\n\n /* Header */\n ...getComponentElements(\"header\")\n .map(el => mountHeader(el, { viewport$, header$, main$ })),\n\n /* Color palette */\n ...getComponentElements(\"palette\")\n .map(el => mountPalette(el)),\n\n /* Progress bar */\n ...getComponentElements(\"progress\")\n .map(el => mountProgress(el, { progress$ })),\n\n /* Search */\n ...getComponentElements(\"search\")\n .map(el => mountSearch(el, { index$, keyboard$ })),\n\n /* Repository information */\n ...getComponentElements(\"source\")\n .map(el => mountSource(el))\n)\n\n/* Set up content component observables */\nconst content$ = defer(() => merge(\n\n /* Announcement bar */\n ...getComponentElements(\"announce\")\n .map(el => mountAnnounce(el)),\n\n /* Content */\n ...getComponentElements(\"content\")\n .map(el => mountContent(el, { viewport$, target$, print$ })),\n\n /* Search highlighting */\n ...getComponentElements(\"content\")\n .map(el => feature(\"search.highlight\")\n ? mountSearchHiglight(el, { index$, location$ })\n : EMPTY\n ),\n\n /* Header title */\n ...getComponentElements(\"header-title\")\n .map(el => mountHeaderTitle(el, { viewport$, header$ })),\n\n /* Sidebar */\n ...getComponentElements(\"sidebar\")\n .map(el => el.getAttribute(\"data-md-type\") === \"navigation\"\n ? at(screen$, () => mountSidebar(el, { viewport$, header$, main$ }))\n : at(tablet$, () => mountSidebar(el, { viewport$, header$, main$ }))\n ),\n\n /* Navigation tabs */\n ...getComponentElements(\"tabs\")\n .map(el => mountTabs(el, { viewport$, header$ })),\n\n /* Table of contents */\n ...getComponentElements(\"toc\")\n .map(el => mountTableOfContents(el, {\n viewport$, header$, main$, target$\n })),\n\n /* Back-to-top button */\n ...getComponentElements(\"top\")\n .map(el => mountBackToTop(el, { viewport$, header$, main$, target$ }))\n))\n\n/* Set up component observables */\nconst component$ = document$\n .pipe(\n switchMap(() => content$),\n mergeWith(control$),\n shareReplay(1)\n )\n\n/* Subscribe to all components */\ncomponent$.subscribe()\n\n/* ----------------------------------------------------------------------------\n * Exports\n * ------------------------------------------------------------------------- */\n\nwindow.document$ = document$ /* Document observable */\nwindow.location$ = location$ /* Location subject */\nwindow.target$ = target$ /* Location target observable */\nwindow.keyboard$ = keyboard$ /* Keyboard observable */\nwindow.viewport$ = viewport$ /* Viewport observable */\nwindow.tablet$ = tablet$ /* Media tablet observable */\nwindow.screen$ = screen$ /* Media screen observable */\nwindow.print$ = print$ /* Media print observable */\nwindow.alert$ = alert$ /* Alert subject */\nwindow.progress$ = progress$ /* Progress indicator subject */\nwindow.component$ = component$ /* Component observable */\n", "/*! *****************************************************************************\r\nCopyright (c) Microsoft Corporation.\r\n\r\nPermission to use, copy, modify, and/or distribute this software for any\r\npurpose with or without fee is hereby granted.\r\n\r\nTHE SOFTWARE IS PROVIDED \"AS IS\" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH\r\nREGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY\r\nAND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,\r\nINDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM\r\nLOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR\r\nOTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR\r\nPERFORMANCE OF THIS SOFTWARE.\r\n***************************************************************************** */\r\n/* global Reflect, Promise */\r\n\r\nvar extendStatics = function(d, b) {\r\n extendStatics = Object.setPrototypeOf ||\r\n ({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) ||\r\n function (d, b) { for (var p in b) if (Object.prototype.hasOwnProperty.call(b, p)) d[p] = b[p]; };\r\n return extendStatics(d, b);\r\n};\r\n\r\nexport function __extends(d, b) {\r\n if (typeof b !== \"function\" && b !== null)\r\n throw new TypeError(\"Class extends value \" + String(b) + \" is not a constructor or null\");\r\n extendStatics(d, b);\r\n function __() { this.constructor = d; }\r\n d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());\r\n}\r\n\r\nexport var __assign = function() {\r\n __assign = Object.assign || function __assign(t) {\r\n for (var s, i = 1, n = arguments.length; i < n; i++) {\r\n s = arguments[i];\r\n for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p)) t[p] = s[p];\r\n }\r\n return t;\r\n }\r\n return __assign.apply(this, arguments);\r\n}\r\n\r\nexport function __rest(s, e) {\r\n var t = {};\r\n for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p) && e.indexOf(p) < 0)\r\n t[p] = s[p];\r\n if (s != null && typeof Object.getOwnPropertySymbols === \"function\")\r\n for (var i = 0, p = Object.getOwnPropertySymbols(s); i < p.length; i++) {\r\n if (e.indexOf(p[i]) < 0 && Object.prototype.propertyIsEnumerable.call(s, p[i]))\r\n t[p[i]] = s[p[i]];\r\n }\r\n return t;\r\n}\r\n\r\nexport function __decorate(decorators, target, key, desc) {\r\n var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;\r\n if (typeof Reflect === \"object\" && typeof Reflect.decorate === \"function\") r = Reflect.decorate(decorators, target, key, desc);\r\n else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;\r\n return c > 3 && r && Object.defineProperty(target, key, r), r;\r\n}\r\n\r\nexport function __param(paramIndex, decorator) {\r\n return function (target, key) { decorator(target, key, paramIndex); }\r\n}\r\n\r\nexport function __metadata(metadataKey, metadataValue) {\r\n if (typeof Reflect === \"object\" && typeof Reflect.metadata === \"function\") return Reflect.metadata(metadataKey, metadataValue);\r\n}\r\n\r\nexport function __awaiter(thisArg, _arguments, P, generator) {\r\n function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }\r\n return new (P || (P = Promise))(function (resolve, reject) {\r\n function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }\r\n function rejected(value) { try { step(generator[\"throw\"](value)); } catch (e) { reject(e); } }\r\n function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }\r\n step((generator = generator.apply(thisArg, _arguments || [])).next());\r\n });\r\n}\r\n\r\nexport function __generator(thisArg, body) {\r\n var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;\r\n return g = { next: verb(0), \"throw\": verb(1), \"return\": verb(2) }, typeof Symbol === \"function\" && (g[Symbol.iterator] = function() { return this; }), g;\r\n function verb(n) { return function (v) { return step([n, v]); }; }\r\n function step(op) {\r\n if (f) throw new TypeError(\"Generator is already executing.\");\r\n while (_) try {\r\n if (f = 1, y && (t = op[0] & 2 ? y[\"return\"] : op[0] ? y[\"throw\"] || ((t = y[\"return\"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;\r\n if (y = 0, t) op = [op[0] & 2, t.value];\r\n switch (op[0]) {\r\n case 0: case 1: t = op; break;\r\n case 4: _.label++; return { value: op[1], done: false };\r\n case 5: _.label++; y = op[1]; op = [0]; continue;\r\n case 7: op = _.ops.pop(); _.trys.pop(); continue;\r\n default:\r\n if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }\r\n if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }\r\n if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }\r\n if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }\r\n if (t[2]) _.ops.pop();\r\n _.trys.pop(); continue;\r\n }\r\n op = body.call(thisArg, _);\r\n } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }\r\n if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };\r\n }\r\n}\r\n\r\nexport var __createBinding = Object.create ? (function(o, m, k, k2) {\r\n if (k2 === undefined) k2 = k;\r\n Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });\r\n}) : (function(o, m, k, k2) {\r\n if (k2 === undefined) k2 = k;\r\n o[k2] = m[k];\r\n});\r\n\r\nexport function __exportStar(m, o) {\r\n for (var p in m) if (p !== \"default\" && !Object.prototype.hasOwnProperty.call(o, p)) __createBinding(o, m, p);\r\n}\r\n\r\nexport function __values(o) {\r\n var s = typeof Symbol === \"function\" && Symbol.iterator, m = s && o[s], i = 0;\r\n if (m) return m.call(o);\r\n if (o && typeof o.length === \"number\") return {\r\n next: function () {\r\n if (o && i >= o.length) o = void 0;\r\n return { value: o && o[i++], done: !o };\r\n }\r\n };\r\n throw new TypeError(s ? \"Object is not iterable.\" : \"Symbol.iterator is not defined.\");\r\n}\r\n\r\nexport function __read(o, n) {\r\n var m = typeof Symbol === \"function\" && o[Symbol.iterator];\r\n if (!m) return o;\r\n var i = m.call(o), r, ar = [], e;\r\n try {\r\n while ((n === void 0 || n-- > 0) && !(r = i.next()).done) ar.push(r.value);\r\n }\r\n catch (error) { e = { error: error }; }\r\n finally {\r\n try {\r\n if (r && !r.done && (m = i[\"return\"])) m.call(i);\r\n }\r\n finally { if (e) throw e.error; }\r\n }\r\n return ar;\r\n}\r\n\r\n/** @deprecated */\r\nexport function __spread() {\r\n for (var ar = [], i = 0; i < arguments.length; i++)\r\n ar = ar.concat(__read(arguments[i]));\r\n return ar;\r\n}\r\n\r\n/** @deprecated */\r\nexport function __spreadArrays() {\r\n for (var s = 0, i = 0, il = arguments.length; i < il; i++) s += arguments[i].length;\r\n for (var r = Array(s), k = 0, i = 0; i < il; i++)\r\n for (var a = arguments[i], j = 0, jl = a.length; j < jl; j++, k++)\r\n r[k] = a[j];\r\n return r;\r\n}\r\n\r\nexport function __spreadArray(to, from, pack) {\r\n if (pack || arguments.length === 2) for (var i = 0, l = from.length, ar; i < l; i++) {\r\n if (ar || !(i in from)) {\r\n if (!ar) ar = Array.prototype.slice.call(from, 0, i);\r\n ar[i] = from[i];\r\n }\r\n }\r\n return to.concat(ar || Array.prototype.slice.call(from));\r\n}\r\n\r\nexport function __await(v) {\r\n return this instanceof __await ? (this.v = v, this) : new __await(v);\r\n}\r\n\r\nexport function __asyncGenerator(thisArg, _arguments, generator) {\r\n if (!Symbol.asyncIterator) throw new TypeError(\"Symbol.asyncIterator is not defined.\");\r\n var g = generator.apply(thisArg, _arguments || []), i, q = [];\r\n return i = {}, verb(\"next\"), verb(\"throw\"), verb(\"return\"), i[Symbol.asyncIterator] = function () { return this; }, i;\r\n function verb(n) { if (g[n]) i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; }\r\n function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }\r\n function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }\r\n function fulfill(value) { resume(\"next\", value); }\r\n function reject(value) { resume(\"throw\", value); }\r\n function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }\r\n}\r\n\r\nexport function __asyncDelegator(o) {\r\n var i, p;\r\n return i = {}, verb(\"next\"), verb(\"throw\", function (e) { throw e; }), verb(\"return\"), i[Symbol.iterator] = function () { return this; }, i;\r\n function verb(n, f) { i[n] = o[n] ? function (v) { return (p = !p) ? { value: __await(o[n](v)), done: n === \"return\" } : f ? f(v) : v; } : f; }\r\n}\r\n\r\nexport function __asyncValues(o) {\r\n if (!Symbol.asyncIterator) throw new TypeError(\"Symbol.asyncIterator is not defined.\");\r\n var m = o[Symbol.asyncIterator], i;\r\n return m ? m.call(o) : (o = typeof __values === \"function\" ? __values(o) : o[Symbol.iterator](), i = {}, verb(\"next\"), verb(\"throw\"), verb(\"return\"), i[Symbol.asyncIterator] = function () { return this; }, i);\r\n function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }\r\n function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }\r\n}\r\n\r\nexport function __makeTemplateObject(cooked, raw) {\r\n if (Object.defineProperty) { Object.defineProperty(cooked, \"raw\", { value: raw }); } else { cooked.raw = raw; }\r\n return cooked;\r\n};\r\n\r\nvar __setModuleDefault = Object.create ? (function(o, v) {\r\n Object.defineProperty(o, \"default\", { enumerable: true, value: v });\r\n}) : function(o, v) {\r\n o[\"default\"] = v;\r\n};\r\n\r\nexport function __importStar(mod) {\r\n if (mod && mod.__esModule) return mod;\r\n var result = {};\r\n if (mod != null) for (var k in mod) if (k !== \"default\" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);\r\n __setModuleDefault(result, mod);\r\n return result;\r\n}\r\n\r\nexport function __importDefault(mod) {\r\n return (mod && mod.__esModule) ? mod : { default: mod };\r\n}\r\n\r\nexport function __classPrivateFieldGet(receiver, state, kind, f) {\r\n if (kind === \"a\" && !f) throw new TypeError(\"Private accessor was defined without a getter\");\r\n if (typeof state === \"function\" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError(\"Cannot read private member from an object whose class did not declare it\");\r\n return kind === \"m\" ? f : kind === \"a\" ? f.call(receiver) : f ? f.value : state.get(receiver);\r\n}\r\n\r\nexport function __classPrivateFieldSet(receiver, state, value, kind, f) {\r\n if (kind === \"m\") throw new TypeError(\"Private method is not writable\");\r\n if (kind === \"a\" && !f) throw new TypeError(\"Private accessor was defined without a setter\");\r\n if (typeof state === \"function\" ? receiver !== state || !f : !state.has(receiver)) throw new TypeError(\"Cannot write private member to an object whose class did not declare it\");\r\n return (kind === \"a\" ? f.call(receiver, value) : f ? f.value = value : state.set(receiver, value)), value;\r\n}\r\n", "/**\n * Returns true if the object is a function.\n * @param value The value to check\n */\nexport function isFunction(value: any): value is (...args: any[]) => any {\n return typeof value === 'function';\n}\n", "/**\n * Used to create Error subclasses until the community moves away from ES5.\n *\n * This is because compiling from TypeScript down to ES5 has issues with subclassing Errors\n * as well as other built-in types: https://github.com/Microsoft/TypeScript/issues/12123\n *\n * @param createImpl A factory function to create the actual constructor implementation. The returned\n * function should be a named function that calls `_super` internally.\n */\nexport function createErrorClass(createImpl: (_super: any) => any): T {\n const _super = (instance: any) => {\n Error.call(instance);\n instance.stack = new Error().stack;\n };\n\n const ctorFunc = createImpl(_super);\n ctorFunc.prototype = Object.create(Error.prototype);\n ctorFunc.prototype.constructor = ctorFunc;\n return ctorFunc;\n}\n", "import { createErrorClass } from './createErrorClass';\n\nexport interface UnsubscriptionError extends Error {\n readonly errors: any[];\n}\n\nexport interface UnsubscriptionErrorCtor {\n /**\n * @deprecated Internal implementation detail. Do not construct error instances.\n * Cannot be tagged as internal: https://github.com/ReactiveX/rxjs/issues/6269\n */\n new (errors: any[]): UnsubscriptionError;\n}\n\n/**\n * An error thrown when one or more errors have occurred during the\n * `unsubscribe` of a {@link Subscription}.\n */\nexport const UnsubscriptionError: UnsubscriptionErrorCtor = createErrorClass(\n (_super) =>\n function UnsubscriptionErrorImpl(this: any, errors: (Error | string)[]) {\n _super(this);\n this.message = errors\n ? `${errors.length} errors occurred during unsubscription:\n${errors.map((err, i) => `${i + 1}) ${err.toString()}`).join('\\n ')}`\n : '';\n this.name = 'UnsubscriptionError';\n this.errors = errors;\n }\n);\n", "/**\n * Removes an item from an array, mutating it.\n * @param arr The array to remove the item from\n * @param item The item to remove\n */\nexport function arrRemove(arr: T[] | undefined | null, item: T) {\n if (arr) {\n const index = arr.indexOf(item);\n 0 <= index && arr.splice(index, 1);\n }\n}\n", "import { isFunction } from './util/isFunction';\nimport { UnsubscriptionError } from './util/UnsubscriptionError';\nimport { SubscriptionLike, TeardownLogic, Unsubscribable } from './types';\nimport { arrRemove } from './util/arrRemove';\n\n/**\n * Represents a disposable resource, such as the execution of an Observable. A\n * Subscription has one important method, `unsubscribe`, that takes no argument\n * and just disposes the resource held by the subscription.\n *\n * Additionally, subscriptions may be grouped together through the `add()`\n * method, which will attach a child Subscription to the current Subscription.\n * When a Subscription is unsubscribed, all its children (and its grandchildren)\n * will be unsubscribed as well.\n *\n * @class Subscription\n */\nexport class Subscription implements SubscriptionLike {\n /** @nocollapse */\n public static EMPTY = (() => {\n const empty = new Subscription();\n empty.closed = true;\n return empty;\n })();\n\n /**\n * A flag to indicate whether this Subscription has already been unsubscribed.\n */\n public closed = false;\n\n private _parentage: Subscription[] | Subscription | null = null;\n\n /**\n * The list of registered finalizers to execute upon unsubscription. Adding and removing from this\n * list occurs in the {@link #add} and {@link #remove} methods.\n */\n private _finalizers: Exclude[] | null = null;\n\n /**\n * @param initialTeardown A function executed first as part of the finalization\n * process that is kicked off when {@link #unsubscribe} is called.\n */\n constructor(private initialTeardown?: () => void) {}\n\n /**\n * Disposes the resources held by the subscription. May, for instance, cancel\n * an ongoing Observable execution or cancel any other type of work that\n * started when the Subscription was created.\n * @return {void}\n */\n unsubscribe(): void {\n let errors: any[] | undefined;\n\n if (!this.closed) {\n this.closed = true;\n\n // Remove this from it's parents.\n const { _parentage } = this;\n if (_parentage) {\n this._parentage = null;\n if (Array.isArray(_parentage)) {\n for (const parent of _parentage) {\n parent.remove(this);\n }\n } else {\n _parentage.remove(this);\n }\n }\n\n const { initialTeardown: initialFinalizer } = this;\n if (isFunction(initialFinalizer)) {\n try {\n initialFinalizer();\n } catch (e) {\n errors = e instanceof UnsubscriptionError ? e.errors : [e];\n }\n }\n\n const { _finalizers } = this;\n if (_finalizers) {\n this._finalizers = null;\n for (const finalizer of _finalizers) {\n try {\n execFinalizer(finalizer);\n } catch (err) {\n errors = errors ?? [];\n if (err instanceof UnsubscriptionError) {\n errors = [...errors, ...err.errors];\n } else {\n errors.push(err);\n }\n }\n }\n }\n\n if (errors) {\n throw new UnsubscriptionError(errors);\n }\n }\n }\n\n /**\n * Adds a finalizer to this subscription, so that finalization will be unsubscribed/called\n * when this subscription is unsubscribed. If this subscription is already {@link #closed},\n * because it has already been unsubscribed, then whatever finalizer is passed to it\n * will automatically be executed (unless the finalizer itself is also a closed subscription).\n *\n * Closed Subscriptions cannot be added as finalizers to any subscription. Adding a closed\n * subscription to a any subscription will result in no operation. (A noop).\n *\n * Adding a subscription to itself, or adding `null` or `undefined` will not perform any\n * operation at all. (A noop).\n *\n * `Subscription` instances that are added to this instance will automatically remove themselves\n * if they are unsubscribed. Functions and {@link Unsubscribable} objects that you wish to remove\n * will need to be removed manually with {@link #remove}\n *\n * @param teardown The finalization logic to add to this subscription.\n */\n add(teardown: TeardownLogic): void {\n // Only add the finalizer if it's not undefined\n // and don't add a subscription to itself.\n if (teardown && teardown !== this) {\n if (this.closed) {\n // If this subscription is already closed,\n // execute whatever finalizer is handed to it automatically.\n execFinalizer(teardown);\n } else {\n if (teardown instanceof Subscription) {\n // We don't add closed subscriptions, and we don't add the same subscription\n // twice. Subscription unsubscribe is idempotent.\n if (teardown.closed || teardown._hasParent(this)) {\n return;\n }\n teardown._addParent(this);\n }\n (this._finalizers = this._finalizers ?? []).push(teardown);\n }\n }\n }\n\n /**\n * Checks to see if a this subscription already has a particular parent.\n * This will signal that this subscription has already been added to the parent in question.\n * @param parent the parent to check for\n */\n private _hasParent(parent: Subscription) {\n const { _parentage } = this;\n return _parentage === parent || (Array.isArray(_parentage) && _parentage.includes(parent));\n }\n\n /**\n * Adds a parent to this subscription so it can be removed from the parent if it\n * unsubscribes on it's own.\n *\n * NOTE: THIS ASSUMES THAT {@link _hasParent} HAS ALREADY BEEN CHECKED.\n * @param parent The parent subscription to add\n */\n private _addParent(parent: Subscription) {\n const { _parentage } = this;\n this._parentage = Array.isArray(_parentage) ? (_parentage.push(parent), _parentage) : _parentage ? [_parentage, parent] : parent;\n }\n\n /**\n * Called on a child when it is removed via {@link #remove}.\n * @param parent The parent to remove\n */\n private _removeParent(parent: Subscription) {\n const { _parentage } = this;\n if (_parentage === parent) {\n this._parentage = null;\n } else if (Array.isArray(_parentage)) {\n arrRemove(_parentage, parent);\n }\n }\n\n /**\n * Removes a finalizer from this subscription that was previously added with the {@link #add} method.\n *\n * Note that `Subscription` instances, when unsubscribed, will automatically remove themselves\n * from every other `Subscription` they have been added to. This means that using the `remove` method\n * is not a common thing and should be used thoughtfully.\n *\n * If you add the same finalizer instance of a function or an unsubscribable object to a `Subscription` instance\n * more than once, you will need to call `remove` the same number of times to remove all instances.\n *\n * All finalizer instances are removed to free up memory upon unsubscription.\n *\n * @param teardown The finalizer to remove from this subscription\n */\n remove(teardown: Exclude): void {\n const { _finalizers } = this;\n _finalizers && arrRemove(_finalizers, teardown);\n\n if (teardown instanceof Subscription) {\n teardown._removeParent(this);\n }\n }\n}\n\nexport const EMPTY_SUBSCRIPTION = Subscription.EMPTY;\n\nexport function isSubscription(value: any): value is Subscription {\n return (\n value instanceof Subscription ||\n (value && 'closed' in value && isFunction(value.remove) && isFunction(value.add) && isFunction(value.unsubscribe))\n );\n}\n\nfunction execFinalizer(finalizer: Unsubscribable | (() => void)) {\n if (isFunction(finalizer)) {\n finalizer();\n } else {\n finalizer.unsubscribe();\n }\n}\n", "import { Subscriber } from './Subscriber';\nimport { ObservableNotification } from './types';\n\n/**\n * The {@link GlobalConfig} object for RxJS. It is used to configure things\n * like how to react on unhandled errors.\n */\nexport const config: GlobalConfig = {\n onUnhandledError: null,\n onStoppedNotification: null,\n Promise: undefined,\n useDeprecatedSynchronousErrorHandling: false,\n useDeprecatedNextContext: false,\n};\n\n/**\n * The global configuration object for RxJS, used to configure things\n * like how to react on unhandled errors. Accessible via {@link config}\n * object.\n */\nexport interface GlobalConfig {\n /**\n * A registration point for unhandled errors from RxJS. These are errors that\n * cannot were not handled by consuming code in the usual subscription path. For\n * example, if you have this configured, and you subscribe to an observable without\n * providing an error handler, errors from that subscription will end up here. This\n * will _always_ be called asynchronously on another job in the runtime. This is because\n * we do not want errors thrown in this user-configured handler to interfere with the\n * behavior of the library.\n */\n onUnhandledError: ((err: any) => void) | null;\n\n /**\n * A registration point for notifications that cannot be sent to subscribers because they\n * have completed, errored or have been explicitly unsubscribed. By default, next, complete\n * and error notifications sent to stopped subscribers are noops. However, sometimes callers\n * might want a different behavior. For example, with sources that attempt to report errors\n * to stopped subscribers, a caller can configure RxJS to throw an unhandled error instead.\n * This will _always_ be called asynchronously on another job in the runtime. This is because\n * we do not want errors thrown in this user-configured handler to interfere with the\n * behavior of the library.\n */\n onStoppedNotification: ((notification: ObservableNotification, subscriber: Subscriber) => void) | null;\n\n /**\n * The promise constructor used by default for {@link Observable#toPromise toPromise} and {@link Observable#forEach forEach}\n * methods.\n *\n * @deprecated As of version 8, RxJS will no longer support this sort of injection of a\n * Promise constructor. If you need a Promise implementation other than native promises,\n * please polyfill/patch Promise as you see appropriate. Will be removed in v8.\n */\n Promise?: PromiseConstructorLike;\n\n /**\n * If true, turns on synchronous error rethrowing, which is a deprecated behavior\n * in v6 and higher. This behavior enables bad patterns like wrapping a subscribe\n * call in a try/catch block. It also enables producer interference, a nasty bug\n * where a multicast can be broken for all observers by a downstream consumer with\n * an unhandled error. DO NOT USE THIS FLAG UNLESS IT'S NEEDED TO BUY TIME\n * FOR MIGRATION REASONS.\n *\n * @deprecated As of version 8, RxJS will no longer support synchronous throwing\n * of unhandled errors. All errors will be thrown on a separate call stack to prevent bad\n * behaviors described above. Will be removed in v8.\n */\n useDeprecatedSynchronousErrorHandling: boolean;\n\n /**\n * If true, enables an as-of-yet undocumented feature from v5: The ability to access\n * `unsubscribe()` via `this` context in `next` functions created in observers passed\n * to `subscribe`.\n *\n * This is being removed because the performance was severely problematic, and it could also cause\n * issues when types other than POJOs are passed to subscribe as subscribers, as they will likely have\n * their `this` context overwritten.\n *\n * @deprecated As of version 8, RxJS will no longer support altering the\n * context of next functions provided as part of an observer to Subscribe. Instead,\n * you will have access to a subscription or a signal or token that will allow you to do things like\n * unsubscribe and test closed status. Will be removed in v8.\n */\n useDeprecatedNextContext: boolean;\n}\n", "import type { TimerHandle } from './timerHandle';\ntype SetTimeoutFunction = (handler: () => void, timeout?: number, ...args: any[]) => TimerHandle;\ntype ClearTimeoutFunction = (handle: TimerHandle) => void;\n\ninterface TimeoutProvider {\n setTimeout: SetTimeoutFunction;\n clearTimeout: ClearTimeoutFunction;\n delegate:\n | {\n setTimeout: SetTimeoutFunction;\n clearTimeout: ClearTimeoutFunction;\n }\n | undefined;\n}\n\nexport const timeoutProvider: TimeoutProvider = {\n // When accessing the delegate, use the variable rather than `this` so that\n // the functions can be called without being bound to the provider.\n setTimeout(handler: () => void, timeout?: number, ...args) {\n const { delegate } = timeoutProvider;\n if (delegate?.setTimeout) {\n return delegate.setTimeout(handler, timeout, ...args);\n }\n return setTimeout(handler, timeout, ...args);\n },\n clearTimeout(handle) {\n const { delegate } = timeoutProvider;\n return (delegate?.clearTimeout || clearTimeout)(handle as any);\n },\n delegate: undefined,\n};\n", "import { config } from '../config';\nimport { timeoutProvider } from '../scheduler/timeoutProvider';\n\n/**\n * Handles an error on another job either with the user-configured {@link onUnhandledError},\n * or by throwing it on that new job so it can be picked up by `window.onerror`, `process.on('error')`, etc.\n *\n * This should be called whenever there is an error that is out-of-band with the subscription\n * or when an error hits a terminal boundary of the subscription and no error handler was provided.\n *\n * @param err the error to report\n */\nexport function reportUnhandledError(err: any) {\n timeoutProvider.setTimeout(() => {\n const { onUnhandledError } = config;\n if (onUnhandledError) {\n // Execute the user-configured error handler.\n onUnhandledError(err);\n } else {\n // Throw so it is picked up by the runtime's uncaught error mechanism.\n throw err;\n }\n });\n}\n", "/* tslint:disable:no-empty */\nexport function noop() { }\n", "import { CompleteNotification, NextNotification, ErrorNotification } from './types';\n\n/**\n * A completion object optimized for memory use and created to be the\n * same \"shape\" as other notifications in v8.\n * @internal\n */\nexport const COMPLETE_NOTIFICATION = (() => createNotification('C', undefined, undefined) as CompleteNotification)();\n\n/**\n * Internal use only. Creates an optimized error notification that is the same \"shape\"\n * as other notifications.\n * @internal\n */\nexport function errorNotification(error: any): ErrorNotification {\n return createNotification('E', undefined, error) as any;\n}\n\n/**\n * Internal use only. Creates an optimized next notification that is the same \"shape\"\n * as other notifications.\n * @internal\n */\nexport function nextNotification(value: T) {\n return createNotification('N', value, undefined) as NextNotification;\n}\n\n/**\n * Ensures that all notifications created internally have the same \"shape\" in v8.\n *\n * TODO: This is only exported to support a crazy legacy test in `groupBy`.\n * @internal\n */\nexport function createNotification(kind: 'N' | 'E' | 'C', value: any, error: any) {\n return {\n kind,\n value,\n error,\n };\n}\n", "import { config } from '../config';\n\nlet context: { errorThrown: boolean; error: any } | null = null;\n\n/**\n * Handles dealing with errors for super-gross mode. Creates a context, in which\n * any synchronously thrown errors will be passed to {@link captureError}. Which\n * will record the error such that it will be rethrown after the call back is complete.\n * TODO: Remove in v8\n * @param cb An immediately executed function.\n */\nexport function errorContext(cb: () => void) {\n if (config.useDeprecatedSynchronousErrorHandling) {\n const isRoot = !context;\n if (isRoot) {\n context = { errorThrown: false, error: null };\n }\n cb();\n if (isRoot) {\n const { errorThrown, error } = context!;\n context = null;\n if (errorThrown) {\n throw error;\n }\n }\n } else {\n // This is the general non-deprecated path for everyone that\n // isn't crazy enough to use super-gross mode (useDeprecatedSynchronousErrorHandling)\n cb();\n }\n}\n\n/**\n * Captures errors only in super-gross mode.\n * @param err the error to capture\n */\nexport function captureError(err: any) {\n if (config.useDeprecatedSynchronousErrorHandling && context) {\n context.errorThrown = true;\n context.error = err;\n }\n}\n", "import { isFunction } from './util/isFunction';\nimport { Observer, ObservableNotification } from './types';\nimport { isSubscription, Subscription } from './Subscription';\nimport { config } from './config';\nimport { reportUnhandledError } from './util/reportUnhandledError';\nimport { noop } from './util/noop';\nimport { nextNotification, errorNotification, COMPLETE_NOTIFICATION } from './NotificationFactories';\nimport { timeoutProvider } from './scheduler/timeoutProvider';\nimport { captureError } from './util/errorContext';\n\n/**\n * Implements the {@link Observer} interface and extends the\n * {@link Subscription} class. While the {@link Observer} is the public API for\n * consuming the values of an {@link Observable}, all Observers get converted to\n * a Subscriber, in order to provide Subscription-like capabilities such as\n * `unsubscribe`. Subscriber is a common type in RxJS, and crucial for\n * implementing operators, but it is rarely used as a public API.\n *\n * @class Subscriber\n */\nexport class Subscriber extends Subscription implements Observer {\n /**\n * A static factory for a Subscriber, given a (potentially partial) definition\n * of an Observer.\n * @param next The `next` callback of an Observer.\n * @param error The `error` callback of an\n * Observer.\n * @param complete The `complete` callback of an\n * Observer.\n * @return A Subscriber wrapping the (partially defined)\n * Observer represented by the given arguments.\n * @nocollapse\n * @deprecated Do not use. Will be removed in v8. There is no replacement for this\n * method, and there is no reason to be creating instances of `Subscriber` directly.\n * If you have a specific use case, please file an issue.\n */\n static create(next?: (x?: T) => void, error?: (e?: any) => void, complete?: () => void): Subscriber {\n return new SafeSubscriber(next, error, complete);\n }\n\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n protected isStopped: boolean = false;\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n protected destination: Subscriber | Observer; // this `any` is the escape hatch to erase extra type param (e.g. R)\n\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n * There is no reason to directly create an instance of Subscriber. This type is exported for typings reasons.\n */\n constructor(destination?: Subscriber | Observer) {\n super();\n if (destination) {\n this.destination = destination;\n // Automatically chain subscriptions together here.\n // if destination is a Subscription, then it is a Subscriber.\n if (isSubscription(destination)) {\n destination.add(this);\n }\n } else {\n this.destination = EMPTY_OBSERVER;\n }\n }\n\n /**\n * The {@link Observer} callback to receive notifications of type `next` from\n * the Observable, with a value. The Observable may call this method 0 or more\n * times.\n * @param {T} [value] The `next` value.\n * @return {void}\n */\n next(value?: T): void {\n if (this.isStopped) {\n handleStoppedNotification(nextNotification(value), this);\n } else {\n this._next(value!);\n }\n }\n\n /**\n * The {@link Observer} callback to receive notifications of type `error` from\n * the Observable, with an attached `Error`. Notifies the Observer that\n * the Observable has experienced an error condition.\n * @param {any} [err] The `error` exception.\n * @return {void}\n */\n error(err?: any): void {\n if (this.isStopped) {\n handleStoppedNotification(errorNotification(err), this);\n } else {\n this.isStopped = true;\n this._error(err);\n }\n }\n\n /**\n * The {@link Observer} callback to receive a valueless notification of type\n * `complete` from the Observable. Notifies the Observer that the Observable\n * has finished sending push-based notifications.\n * @return {void}\n */\n complete(): void {\n if (this.isStopped) {\n handleStoppedNotification(COMPLETE_NOTIFICATION, this);\n } else {\n this.isStopped = true;\n this._complete();\n }\n }\n\n unsubscribe(): void {\n if (!this.closed) {\n this.isStopped = true;\n super.unsubscribe();\n this.destination = null!;\n }\n }\n\n protected _next(value: T): void {\n this.destination.next(value);\n }\n\n protected _error(err: any): void {\n try {\n this.destination.error(err);\n } finally {\n this.unsubscribe();\n }\n }\n\n protected _complete(): void {\n try {\n this.destination.complete();\n } finally {\n this.unsubscribe();\n }\n }\n}\n\n/**\n * This bind is captured here because we want to be able to have\n * compatibility with monoid libraries that tend to use a method named\n * `bind`. In particular, a library called Monio requires this.\n */\nconst _bind = Function.prototype.bind;\n\nfunction bind any>(fn: Fn, thisArg: any): Fn {\n return _bind.call(fn, thisArg);\n}\n\n/**\n * Internal optimization only, DO NOT EXPOSE.\n * @internal\n */\nclass ConsumerObserver implements Observer {\n constructor(private partialObserver: Partial>) {}\n\n next(value: T): void {\n const { partialObserver } = this;\n if (partialObserver.next) {\n try {\n partialObserver.next(value);\n } catch (error) {\n handleUnhandledError(error);\n }\n }\n }\n\n error(err: any): void {\n const { partialObserver } = this;\n if (partialObserver.error) {\n try {\n partialObserver.error(err);\n } catch (error) {\n handleUnhandledError(error);\n }\n } else {\n handleUnhandledError(err);\n }\n }\n\n complete(): void {\n const { partialObserver } = this;\n if (partialObserver.complete) {\n try {\n partialObserver.complete();\n } catch (error) {\n handleUnhandledError(error);\n }\n }\n }\n}\n\nexport class SafeSubscriber extends Subscriber {\n constructor(\n observerOrNext?: Partial> | ((value: T) => void) | null,\n error?: ((e?: any) => void) | null,\n complete?: (() => void) | null\n ) {\n super();\n\n let partialObserver: Partial>;\n if (isFunction(observerOrNext) || !observerOrNext) {\n // The first argument is a function, not an observer. The next\n // two arguments *could* be observers, or they could be empty.\n partialObserver = {\n next: (observerOrNext ?? undefined) as (((value: T) => void) | undefined),\n error: error ?? undefined,\n complete: complete ?? undefined,\n };\n } else {\n // The first argument is a partial observer.\n let context: any;\n if (this && config.useDeprecatedNextContext) {\n // This is a deprecated path that made `this.unsubscribe()` available in\n // next handler functions passed to subscribe. This only exists behind a flag\n // now, as it is *very* slow.\n context = Object.create(observerOrNext);\n context.unsubscribe = () => this.unsubscribe();\n partialObserver = {\n next: observerOrNext.next && bind(observerOrNext.next, context),\n error: observerOrNext.error && bind(observerOrNext.error, context),\n complete: observerOrNext.complete && bind(observerOrNext.complete, context),\n };\n } else {\n // The \"normal\" path. Just use the partial observer directly.\n partialObserver = observerOrNext;\n }\n }\n\n // Wrap the partial observer to ensure it's a full observer, and\n // make sure proper error handling is accounted for.\n this.destination = new ConsumerObserver(partialObserver);\n }\n}\n\nfunction handleUnhandledError(error: any) {\n if (config.useDeprecatedSynchronousErrorHandling) {\n captureError(error);\n } else {\n // Ideal path, we report this as an unhandled error,\n // which is thrown on a new call stack.\n reportUnhandledError(error);\n }\n}\n\n/**\n * An error handler used when no error handler was supplied\n * to the SafeSubscriber -- meaning no error handler was supplied\n * do the `subscribe` call on our observable.\n * @param err The error to handle\n */\nfunction defaultErrorHandler(err: any) {\n throw err;\n}\n\n/**\n * A handler for notifications that cannot be sent to a stopped subscriber.\n * @param notification The notification being sent\n * @param subscriber The stopped subscriber\n */\nfunction handleStoppedNotification(notification: ObservableNotification, subscriber: Subscriber) {\n const { onStoppedNotification } = config;\n onStoppedNotification && timeoutProvider.setTimeout(() => onStoppedNotification(notification, subscriber));\n}\n\n/**\n * The observer used as a stub for subscriptions where the user did not\n * pass any arguments to `subscribe`. Comes with the default error handling\n * behavior.\n */\nexport const EMPTY_OBSERVER: Readonly> & { closed: true } = {\n closed: true,\n next: noop,\n error: defaultErrorHandler,\n complete: noop,\n};\n", "/**\n * Symbol.observable or a string \"@@observable\". Used for interop\n *\n * @deprecated We will no longer be exporting this symbol in upcoming versions of RxJS.\n * Instead polyfill and use Symbol.observable directly *or* use https://www.npmjs.com/package/symbol-observable\n */\nexport const observable: string | symbol = (() => (typeof Symbol === 'function' && Symbol.observable) || '@@observable')();\n", "/**\n * This function takes one parameter and just returns it. Simply put,\n * this is like `(x: T): T => x`.\n *\n * ## Examples\n *\n * This is useful in some cases when using things like `mergeMap`\n *\n * ```ts\n * import { interval, take, map, range, mergeMap, identity } from 'rxjs';\n *\n * const source$ = interval(1000).pipe(take(5));\n *\n * const result$ = source$.pipe(\n * map(i => range(i)),\n * mergeMap(identity) // same as mergeMap(x => x)\n * );\n *\n * result$.subscribe({\n * next: console.log\n * });\n * ```\n *\n * Or when you want to selectively apply an operator\n *\n * ```ts\n * import { interval, take, identity } from 'rxjs';\n *\n * const shouldLimit = () => Math.random() < 0.5;\n *\n * const source$ = interval(1000);\n *\n * const result$ = source$.pipe(shouldLimit() ? take(5) : identity);\n *\n * result$.subscribe({\n * next: console.log\n * });\n * ```\n *\n * @param x Any value that is returned by this function\n * @returns The value passed as the first parameter to this function\n */\nexport function identity(x: T): T {\n return x;\n}\n", "import { identity } from './identity';\nimport { UnaryFunction } from '../types';\n\nexport function pipe(): typeof identity;\nexport function pipe(fn1: UnaryFunction): UnaryFunction;\nexport function pipe(fn1: UnaryFunction, fn2: UnaryFunction): UnaryFunction;\nexport function pipe(fn1: UnaryFunction, fn2: UnaryFunction, fn3: UnaryFunction): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction,\n fn9: UnaryFunction\n): UnaryFunction;\nexport function pipe(\n fn1: UnaryFunction,\n fn2: UnaryFunction,\n fn3: UnaryFunction,\n fn4: UnaryFunction,\n fn5: UnaryFunction,\n fn6: UnaryFunction,\n fn7: UnaryFunction,\n fn8: UnaryFunction,\n fn9: UnaryFunction,\n ...fns: UnaryFunction[]\n): UnaryFunction;\n\n/**\n * pipe() can be called on one or more functions, each of which can take one argument (\"UnaryFunction\")\n * and uses it to return a value.\n * It returns a function that takes one argument, passes it to the first UnaryFunction, and then\n * passes the result to the next one, passes that result to the next one, and so on. \n */\nexport function pipe(...fns: Array>): UnaryFunction {\n return pipeFromArray(fns);\n}\n\n/** @internal */\nexport function pipeFromArray(fns: Array>): UnaryFunction {\n if (fns.length === 0) {\n return identity as UnaryFunction;\n }\n\n if (fns.length === 1) {\n return fns[0];\n }\n\n return function piped(input: T): R {\n return fns.reduce((prev: any, fn: UnaryFunction) => fn(prev), input as any);\n };\n}\n", "import { Operator } from './Operator';\nimport { SafeSubscriber, Subscriber } from './Subscriber';\nimport { isSubscription, Subscription } from './Subscription';\nimport { TeardownLogic, OperatorFunction, Subscribable, Observer } from './types';\nimport { observable as Symbol_observable } from './symbol/observable';\nimport { pipeFromArray } from './util/pipe';\nimport { config } from './config';\nimport { isFunction } from './util/isFunction';\nimport { errorContext } from './util/errorContext';\n\n/**\n * A representation of any set of values over any amount of time. This is the most basic building block\n * of RxJS.\n *\n * @class Observable\n */\nexport class Observable implements Subscribable {\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n */\n source: Observable | undefined;\n\n /**\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n */\n operator: Operator | undefined;\n\n /**\n * @constructor\n * @param {Function} subscribe the function that is called when the Observable is\n * initially subscribed to. This function is given a Subscriber, to which new values\n * can be `next`ed, or an `error` method can be called to raise an error, or\n * `complete` can be called to notify of a successful completion.\n */\n constructor(subscribe?: (this: Observable, subscriber: Subscriber) => TeardownLogic) {\n if (subscribe) {\n this._subscribe = subscribe;\n }\n }\n\n // HACK: Since TypeScript inherits static properties too, we have to\n // fight against TypeScript here so Subject can have a different static create signature\n /**\n * Creates a new Observable by calling the Observable constructor\n * @owner Observable\n * @method create\n * @param {Function} subscribe? the subscriber function to be passed to the Observable constructor\n * @return {Observable} a new observable\n * @nocollapse\n * @deprecated Use `new Observable()` instead. Will be removed in v8.\n */\n static create: (...args: any[]) => any = (subscribe?: (subscriber: Subscriber) => TeardownLogic) => {\n return new Observable(subscribe);\n };\n\n /**\n * Creates a new Observable, with this Observable instance as the source, and the passed\n * operator defined as the new observable's operator.\n * @method lift\n * @param operator the operator defining the operation to take on the observable\n * @return a new observable with the Operator applied\n * @deprecated Internal implementation detail, do not use directly. Will be made internal in v8.\n * If you have implemented an operator using `lift`, it is recommended that you create an\n * operator by simply returning `new Observable()` directly. See \"Creating new operators from\n * scratch\" section here: https://rxjs.dev/guide/operators\n */\n lift(operator?: Operator): Observable {\n const observable = new Observable();\n observable.source = this;\n observable.operator = operator;\n return observable;\n }\n\n subscribe(observerOrNext?: Partial> | ((value: T) => void)): Subscription;\n /** @deprecated Instead of passing separate callback arguments, use an observer argument. Signatures taking separate callback arguments will be removed in v8. Details: https://rxjs.dev/deprecations/subscribe-arguments */\n subscribe(next?: ((value: T) => void) | null, error?: ((error: any) => void) | null, complete?: (() => void) | null): Subscription;\n /**\n * Invokes an execution of an Observable and registers Observer handlers for notifications it will emit.\n *\n * Use it when you have all these Observables, but still nothing is happening.\n *\n * `subscribe` is not a regular operator, but a method that calls Observable's internal `subscribe` function. It\n * might be for example a function that you passed to Observable's constructor, but most of the time it is\n * a library implementation, which defines what will be emitted by an Observable, and when it be will emitted. This means\n * that calling `subscribe` is actually the moment when Observable starts its work, not when it is created, as it is often\n * the thought.\n *\n * Apart from starting the execution of an Observable, this method allows you to listen for values\n * that an Observable emits, as well as for when it completes or errors. You can achieve this in two\n * of the following ways.\n *\n * The first way is creating an object that implements {@link Observer} interface. It should have methods\n * defined by that interface, but note that it should be just a regular JavaScript object, which you can create\n * yourself in any way you want (ES6 class, classic function constructor, object literal etc.). In particular, do\n * not attempt to use any RxJS implementation details to create Observers - you don't need them. Remember also\n * that your object does not have to implement all methods. If you find yourself creating a method that doesn't\n * do anything, you can simply omit it. Note however, if the `error` method is not provided and an error happens,\n * it will be thrown asynchronously. Errors thrown asynchronously cannot be caught using `try`/`catch`. Instead,\n * use the {@link onUnhandledError} configuration option or use a runtime handler (like `window.onerror` or\n * `process.on('error)`) to be notified of unhandled errors. Because of this, it's recommended that you provide\n * an `error` method to avoid missing thrown errors.\n *\n * The second way is to give up on Observer object altogether and simply provide callback functions in place of its methods.\n * This means you can provide three functions as arguments to `subscribe`, where the first function is equivalent\n * of a `next` method, the second of an `error` method and the third of a `complete` method. Just as in case of an Observer,\n * if you do not need to listen for something, you can omit a function by passing `undefined` or `null`,\n * since `subscribe` recognizes these functions by where they were placed in function call. When it comes\n * to the `error` function, as with an Observer, if not provided, errors emitted by an Observable will be thrown asynchronously.\n *\n * You can, however, subscribe with no parameters at all. This may be the case where you're not interested in terminal events\n * and you also handled emissions internally by using operators (e.g. using `tap`).\n *\n * Whichever style of calling `subscribe` you use, in both cases it returns a Subscription object.\n * This object allows you to call `unsubscribe` on it, which in turn will stop the work that an Observable does and will clean\n * up all resources that an Observable used. Note that cancelling a subscription will not call `complete` callback\n * provided to `subscribe` function, which is reserved for a regular completion signal that comes from an Observable.\n *\n * Remember that callbacks provided to `subscribe` are not guaranteed to be called asynchronously.\n * It is an Observable itself that decides when these functions will be called. For example {@link of}\n * by default emits all its values synchronously. Always check documentation for how given Observable\n * will behave when subscribed and if its default behavior can be modified with a `scheduler`.\n *\n * #### Examples\n *\n * Subscribe with an {@link guide/observer Observer}\n *\n * ```ts\n * import { of } from 'rxjs';\n *\n * const sumObserver = {\n * sum: 0,\n * next(value) {\n * console.log('Adding: ' + value);\n * this.sum = this.sum + value;\n * },\n * error() {\n * // We actually could just remove this method,\n * // since we do not really care about errors right now.\n * },\n * complete() {\n * console.log('Sum equals: ' + this.sum);\n * }\n * };\n *\n * of(1, 2, 3) // Synchronously emits 1, 2, 3 and then completes.\n * .subscribe(sumObserver);\n *\n * // Logs:\n * // 'Adding: 1'\n * // 'Adding: 2'\n * // 'Adding: 3'\n * // 'Sum equals: 6'\n * ```\n *\n * Subscribe with functions ({@link deprecations/subscribe-arguments deprecated})\n *\n * ```ts\n * import { of } from 'rxjs'\n *\n * let sum = 0;\n *\n * of(1, 2, 3).subscribe(\n * value => {\n * console.log('Adding: ' + value);\n * sum = sum + value;\n * },\n * undefined,\n * () => console.log('Sum equals: ' + sum)\n * );\n *\n * // Logs:\n * // 'Adding: 1'\n * // 'Adding: 2'\n * // 'Adding: 3'\n * // 'Sum equals: 6'\n * ```\n *\n * Cancel a subscription\n *\n * ```ts\n * import { interval } from 'rxjs';\n *\n * const subscription = interval(1000).subscribe({\n * next(num) {\n * console.log(num)\n * },\n * complete() {\n * // Will not be called, even when cancelling subscription.\n * console.log('completed!');\n * }\n * });\n *\n * setTimeout(() => {\n * subscription.unsubscribe();\n * console.log('unsubscribed!');\n * }, 2500);\n *\n * // Logs:\n * // 0 after 1s\n * // 1 after 2s\n * // 'unsubscribed!' after 2.5s\n * ```\n *\n * @param {Observer|Function} observerOrNext (optional) Either an observer with methods to be called,\n * or the first of three possible handlers, which is the handler for each value emitted from the subscribed\n * Observable.\n * @param {Function} error (optional) A handler for a terminal event resulting from an error. If no error handler is provided,\n * the error will be thrown asynchronously as unhandled.\n * @param {Function} complete (optional) A handler for a terminal event resulting from successful completion.\n * @return {Subscription} a subscription reference to the registered handlers\n * @method subscribe\n */\n subscribe(\n observerOrNext?: Partial> | ((value: T) => void) | null,\n error?: ((error: any) => void) | null,\n complete?: (() => void) | null\n ): Subscription {\n const subscriber = isSubscriber(observerOrNext) ? observerOrNext : new SafeSubscriber(observerOrNext, error, complete);\n\n errorContext(() => {\n const { operator, source } = this;\n subscriber.add(\n operator\n ? // We're dealing with a subscription in the\n // operator chain to one of our lifted operators.\n operator.call(subscriber, source)\n : source\n ? // If `source` has a value, but `operator` does not, something that\n // had intimate knowledge of our API, like our `Subject`, must have\n // set it. We're going to just call `_subscribe` directly.\n this._subscribe(subscriber)\n : // In all other cases, we're likely wrapping a user-provided initializer\n // function, so we need to catch errors and handle them appropriately.\n this._trySubscribe(subscriber)\n );\n });\n\n return subscriber;\n }\n\n /** @internal */\n protected _trySubscribe(sink: Subscriber): TeardownLogic {\n try {\n return this._subscribe(sink);\n } catch (err) {\n // We don't need to return anything in this case,\n // because it's just going to try to `add()` to a subscription\n // above.\n sink.error(err);\n }\n }\n\n /**\n * Used as a NON-CANCELLABLE means of subscribing to an observable, for use with\n * APIs that expect promises, like `async/await`. You cannot unsubscribe from this.\n *\n * **WARNING**: Only use this with observables you *know* will complete. If the source\n * observable does not complete, you will end up with a promise that is hung up, and\n * potentially all of the state of an async function hanging out in memory. To avoid\n * this situation, look into adding something like {@link timeout}, {@link take},\n * {@link takeWhile}, or {@link takeUntil} amongst others.\n *\n * #### Example\n *\n * ```ts\n * import { interval, take } from 'rxjs';\n *\n * const source$ = interval(1000).pipe(take(4));\n *\n * async function getTotal() {\n * let total = 0;\n *\n * await source$.forEach(value => {\n * total += value;\n * console.log('observable -> ' + value);\n * });\n *\n * return total;\n * }\n *\n * getTotal().then(\n * total => console.log('Total: ' + total)\n * );\n *\n * // Expected:\n * // 'observable -> 0'\n * // 'observable -> 1'\n * // 'observable -> 2'\n * // 'observable -> 3'\n * // 'Total: 6'\n * ```\n *\n * @param next a handler for each value emitted by the observable\n * @return a promise that either resolves on observable completion or\n * rejects with the handled error\n */\n forEach(next: (value: T) => void): Promise;\n\n /**\n * @param next a handler for each value emitted by the observable\n * @param promiseCtor a constructor function used to instantiate the Promise\n * @return a promise that either resolves on observable completion or\n * rejects with the handled error\n * @deprecated Passing a Promise constructor will no longer be available\n * in upcoming versions of RxJS. This is because it adds weight to the library, for very\n * little benefit. If you need this functionality, it is recommended that you either\n * polyfill Promise, or you create an adapter to convert the returned native promise\n * to whatever promise implementation you wanted. Will be removed in v8.\n */\n forEach(next: (value: T) => void, promiseCtor: PromiseConstructorLike): Promise;\n\n forEach(next: (value: T) => void, promiseCtor?: PromiseConstructorLike): Promise {\n promiseCtor = getPromiseCtor(promiseCtor);\n\n return new promiseCtor((resolve, reject) => {\n const subscriber = new SafeSubscriber({\n next: (value) => {\n try {\n next(value);\n } catch (err) {\n reject(err);\n subscriber.unsubscribe();\n }\n },\n error: reject,\n complete: resolve,\n });\n this.subscribe(subscriber);\n }) as Promise;\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): TeardownLogic {\n return this.source?.subscribe(subscriber);\n }\n\n /**\n * An interop point defined by the es7-observable spec https://github.com/zenparsing/es-observable\n * @method Symbol.observable\n * @return {Observable} this instance of the observable\n */\n [Symbol_observable]() {\n return this;\n }\n\n /* tslint:disable:max-line-length */\n pipe(): Observable;\n pipe(op1: OperatorFunction): Observable;\n pipe(op1: OperatorFunction, op2: OperatorFunction): Observable;\n pipe(op1: OperatorFunction, op2: OperatorFunction, op3: OperatorFunction): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: OperatorFunction,\n op8: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: OperatorFunction,\n op8: OperatorFunction,\n op9: OperatorFunction\n ): Observable;\n pipe(\n op1: OperatorFunction,\n op2: OperatorFunction,\n op3: OperatorFunction,\n op4: OperatorFunction,\n op5: OperatorFunction,\n op6: OperatorFunction,\n op7: OperatorFunction,\n op8: OperatorFunction,\n op9: OperatorFunction,\n ...operations: OperatorFunction[]\n ): Observable;\n /* tslint:enable:max-line-length */\n\n /**\n * Used to stitch together functional operators into a chain.\n * @method pipe\n * @return {Observable} the Observable result of all of the operators having\n * been called in the order they were passed in.\n *\n * ## Example\n *\n * ```ts\n * import { interval, filter, map, scan } from 'rxjs';\n *\n * interval(1000)\n * .pipe(\n * filter(x => x % 2 === 0),\n * map(x => x + x),\n * scan((acc, x) => acc + x)\n * )\n * .subscribe(x => console.log(x));\n * ```\n */\n pipe(...operations: OperatorFunction[]): Observable {\n return pipeFromArray(operations)(this);\n }\n\n /* tslint:disable:max-line-length */\n /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise */\n toPromise(): Promise;\n /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise */\n toPromise(PromiseCtor: typeof Promise): Promise;\n /** @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise */\n toPromise(PromiseCtor: PromiseConstructorLike): Promise;\n /* tslint:enable:max-line-length */\n\n /**\n * Subscribe to this Observable and get a Promise resolving on\n * `complete` with the last emission (if any).\n *\n * **WARNING**: Only use this with observables you *know* will complete. If the source\n * observable does not complete, you will end up with a promise that is hung up, and\n * potentially all of the state of an async function hanging out in memory. To avoid\n * this situation, look into adding something like {@link timeout}, {@link take},\n * {@link takeWhile}, or {@link takeUntil} amongst others.\n *\n * @method toPromise\n * @param [promiseCtor] a constructor function used to instantiate\n * the Promise\n * @return A Promise that resolves with the last value emit, or\n * rejects on an error. If there were no emissions, Promise\n * resolves with undefined.\n * @deprecated Replaced with {@link firstValueFrom} and {@link lastValueFrom}. Will be removed in v8. Details: https://rxjs.dev/deprecations/to-promise\n */\n toPromise(promiseCtor?: PromiseConstructorLike): Promise {\n promiseCtor = getPromiseCtor(promiseCtor);\n\n return new promiseCtor((resolve, reject) => {\n let value: T | undefined;\n this.subscribe(\n (x: T) => (value = x),\n (err: any) => reject(err),\n () => resolve(value)\n );\n }) as Promise;\n }\n}\n\n/**\n * Decides between a passed promise constructor from consuming code,\n * A default configured promise constructor, and the native promise\n * constructor and returns it. If nothing can be found, it will throw\n * an error.\n * @param promiseCtor The optional promise constructor to passed by consuming code\n */\nfunction getPromiseCtor(promiseCtor: PromiseConstructorLike | undefined) {\n return promiseCtor ?? config.Promise ?? Promise;\n}\n\nfunction isObserver(value: any): value is Observer {\n return value && isFunction(value.next) && isFunction(value.error) && isFunction(value.complete);\n}\n\nfunction isSubscriber(value: any): value is Subscriber {\n return (value && value instanceof Subscriber) || (isObserver(value) && isSubscription(value));\n}\n", "import { Observable } from '../Observable';\nimport { Subscriber } from '../Subscriber';\nimport { OperatorFunction } from '../types';\nimport { isFunction } from './isFunction';\n\n/**\n * Used to determine if an object is an Observable with a lift function.\n */\nexport function hasLift(source: any): source is { lift: InstanceType['lift'] } {\n return isFunction(source?.lift);\n}\n\n/**\n * Creates an `OperatorFunction`. Used to define operators throughout the library in a concise way.\n * @param init The logic to connect the liftedSource to the subscriber at the moment of subscription.\n */\nexport function operate(\n init: (liftedSource: Observable, subscriber: Subscriber) => (() => void) | void\n): OperatorFunction {\n return (source: Observable) => {\n if (hasLift(source)) {\n return source.lift(function (this: Subscriber, liftedSource: Observable) {\n try {\n return init(liftedSource, this);\n } catch (err) {\n this.error(err);\n }\n });\n }\n throw new TypeError('Unable to lift unknown Observable type');\n };\n}\n", "import { Subscriber } from '../Subscriber';\n\n/**\n * Creates an instance of an `OperatorSubscriber`.\n * @param destination The downstream subscriber.\n * @param onNext Handles next values, only called if this subscriber is not stopped or closed. Any\n * error that occurs in this function is caught and sent to the `error` method of this subscriber.\n * @param onError Handles errors from the subscription, any errors that occur in this handler are caught\n * and send to the `destination` error handler.\n * @param onComplete Handles completion notification from the subscription. Any errors that occur in\n * this handler are sent to the `destination` error handler.\n * @param onFinalize Additional teardown logic here. This will only be called on teardown if the\n * subscriber itself is not already closed. This is called after all other teardown logic is executed.\n */\nexport function createOperatorSubscriber(\n destination: Subscriber,\n onNext?: (value: T) => void,\n onComplete?: () => void,\n onError?: (err: any) => void,\n onFinalize?: () => void\n): Subscriber {\n return new OperatorSubscriber(destination, onNext, onComplete, onError, onFinalize);\n}\n\n/**\n * A generic helper for allowing operators to be created with a Subscriber and\n * use closures to capture necessary state from the operator function itself.\n */\nexport class OperatorSubscriber extends Subscriber {\n /**\n * Creates an instance of an `OperatorSubscriber`.\n * @param destination The downstream subscriber.\n * @param onNext Handles next values, only called if this subscriber is not stopped or closed. Any\n * error that occurs in this function is caught and sent to the `error` method of this subscriber.\n * @param onError Handles errors from the subscription, any errors that occur in this handler are caught\n * and send to the `destination` error handler.\n * @param onComplete Handles completion notification from the subscription. Any errors that occur in\n * this handler are sent to the `destination` error handler.\n * @param onFinalize Additional finalization logic here. This will only be called on finalization if the\n * subscriber itself is not already closed. This is called after all other finalization logic is executed.\n * @param shouldUnsubscribe An optional check to see if an unsubscribe call should truly unsubscribe.\n * NOTE: This currently **ONLY** exists to support the strange behavior of {@link groupBy}, where unsubscription\n * to the resulting observable does not actually disconnect from the source if there are active subscriptions\n * to any grouped observable. (DO NOT EXPOSE OR USE EXTERNALLY!!!)\n */\n constructor(\n destination: Subscriber,\n onNext?: (value: T) => void,\n onComplete?: () => void,\n onError?: (err: any) => void,\n private onFinalize?: () => void,\n private shouldUnsubscribe?: () => boolean\n ) {\n // It's important - for performance reasons - that all of this class's\n // members are initialized and that they are always initialized in the same\n // order. This will ensure that all OperatorSubscriber instances have the\n // same hidden class in V8. This, in turn, will help keep the number of\n // hidden classes involved in property accesses within the base class as\n // low as possible. If the number of hidden classes involved exceeds four,\n // the property accesses will become megamorphic and performance penalties\n // will be incurred - i.e. inline caches won't be used.\n //\n // The reasons for ensuring all instances have the same hidden class are\n // further discussed in this blog post from Benedikt Meurer:\n // https://benediktmeurer.de/2018/03/23/impact-of-polymorphism-on-component-based-frameworks-like-react/\n super(destination);\n this._next = onNext\n ? function (this: OperatorSubscriber, value: T) {\n try {\n onNext(value);\n } catch (err) {\n destination.error(err);\n }\n }\n : super._next;\n this._error = onError\n ? function (this: OperatorSubscriber, err: any) {\n try {\n onError(err);\n } catch (err) {\n // Send any errors that occur down stream.\n destination.error(err);\n } finally {\n // Ensure finalization.\n this.unsubscribe();\n }\n }\n : super._error;\n this._complete = onComplete\n ? function (this: OperatorSubscriber) {\n try {\n onComplete();\n } catch (err) {\n // Send any errors that occur down stream.\n destination.error(err);\n } finally {\n // Ensure finalization.\n this.unsubscribe();\n }\n }\n : super._complete;\n }\n\n unsubscribe() {\n if (!this.shouldUnsubscribe || this.shouldUnsubscribe()) {\n const { closed } = this;\n super.unsubscribe();\n // Execute additional teardown if we have any and we didn't already do so.\n !closed && this.onFinalize?.();\n }\n }\n}\n", "import { Subscription } from '../Subscription';\n\ninterface AnimationFrameProvider {\n schedule(callback: FrameRequestCallback): Subscription;\n requestAnimationFrame: typeof requestAnimationFrame;\n cancelAnimationFrame: typeof cancelAnimationFrame;\n delegate:\n | {\n requestAnimationFrame: typeof requestAnimationFrame;\n cancelAnimationFrame: typeof cancelAnimationFrame;\n }\n | undefined;\n}\n\nexport const animationFrameProvider: AnimationFrameProvider = {\n // When accessing the delegate, use the variable rather than `this` so that\n // the functions can be called without being bound to the provider.\n schedule(callback) {\n let request = requestAnimationFrame;\n let cancel: typeof cancelAnimationFrame | undefined = cancelAnimationFrame;\n const { delegate } = animationFrameProvider;\n if (delegate) {\n request = delegate.requestAnimationFrame;\n cancel = delegate.cancelAnimationFrame;\n }\n const handle = request((timestamp) => {\n // Clear the cancel function. The request has been fulfilled, so\n // attempting to cancel the request upon unsubscription would be\n // pointless.\n cancel = undefined;\n callback(timestamp);\n });\n return new Subscription(() => cancel?.(handle));\n },\n requestAnimationFrame(...args) {\n const { delegate } = animationFrameProvider;\n return (delegate?.requestAnimationFrame || requestAnimationFrame)(...args);\n },\n cancelAnimationFrame(...args) {\n const { delegate } = animationFrameProvider;\n return (delegate?.cancelAnimationFrame || cancelAnimationFrame)(...args);\n },\n delegate: undefined,\n};\n", "import { createErrorClass } from './createErrorClass';\n\nexport interface ObjectUnsubscribedError extends Error {}\n\nexport interface ObjectUnsubscribedErrorCtor {\n /**\n * @deprecated Internal implementation detail. Do not construct error instances.\n * Cannot be tagged as internal: https://github.com/ReactiveX/rxjs/issues/6269\n */\n new (): ObjectUnsubscribedError;\n}\n\n/**\n * An error thrown when an action is invalid because the object has been\n * unsubscribed.\n *\n * @see {@link Subject}\n * @see {@link BehaviorSubject}\n *\n * @class ObjectUnsubscribedError\n */\nexport const ObjectUnsubscribedError: ObjectUnsubscribedErrorCtor = createErrorClass(\n (_super) =>\n function ObjectUnsubscribedErrorImpl(this: any) {\n _super(this);\n this.name = 'ObjectUnsubscribedError';\n this.message = 'object unsubscribed';\n }\n);\n", "import { Operator } from './Operator';\nimport { Observable } from './Observable';\nimport { Subscriber } from './Subscriber';\nimport { Subscription, EMPTY_SUBSCRIPTION } from './Subscription';\nimport { Observer, SubscriptionLike, TeardownLogic } from './types';\nimport { ObjectUnsubscribedError } from './util/ObjectUnsubscribedError';\nimport { arrRemove } from './util/arrRemove';\nimport { errorContext } from './util/errorContext';\n\n/**\n * A Subject is a special type of Observable that allows values to be\n * multicasted to many Observers. Subjects are like EventEmitters.\n *\n * Every Subject is an Observable and an Observer. You can subscribe to a\n * Subject, and you can call next to feed values as well as error and complete.\n */\nexport class Subject extends Observable implements SubscriptionLike {\n closed = false;\n\n private currentObservers: Observer[] | null = null;\n\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n observers: Observer[] = [];\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n isStopped = false;\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n hasError = false;\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n thrownError: any = null;\n\n /**\n * Creates a \"subject\" by basically gluing an observer to an observable.\n *\n * @nocollapse\n * @deprecated Recommended you do not use. Will be removed at some point in the future. Plans for replacement still under discussion.\n */\n static create: (...args: any[]) => any = (destination: Observer, source: Observable): AnonymousSubject => {\n return new AnonymousSubject(destination, source);\n };\n\n constructor() {\n // NOTE: This must be here to obscure Observable's constructor.\n super();\n }\n\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n lift(operator: Operator): Observable {\n const subject = new AnonymousSubject(this, this);\n subject.operator = operator as any;\n return subject as any;\n }\n\n /** @internal */\n protected _throwIfClosed() {\n if (this.closed) {\n throw new ObjectUnsubscribedError();\n }\n }\n\n next(value: T) {\n errorContext(() => {\n this._throwIfClosed();\n if (!this.isStopped) {\n if (!this.currentObservers) {\n this.currentObservers = Array.from(this.observers);\n }\n for (const observer of this.currentObservers) {\n observer.next(value);\n }\n }\n });\n }\n\n error(err: any) {\n errorContext(() => {\n this._throwIfClosed();\n if (!this.isStopped) {\n this.hasError = this.isStopped = true;\n this.thrownError = err;\n const { observers } = this;\n while (observers.length) {\n observers.shift()!.error(err);\n }\n }\n });\n }\n\n complete() {\n errorContext(() => {\n this._throwIfClosed();\n if (!this.isStopped) {\n this.isStopped = true;\n const { observers } = this;\n while (observers.length) {\n observers.shift()!.complete();\n }\n }\n });\n }\n\n unsubscribe() {\n this.isStopped = this.closed = true;\n this.observers = this.currentObservers = null!;\n }\n\n get observed() {\n return this.observers?.length > 0;\n }\n\n /** @internal */\n protected _trySubscribe(subscriber: Subscriber): TeardownLogic {\n this._throwIfClosed();\n return super._trySubscribe(subscriber);\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n this._throwIfClosed();\n this._checkFinalizedStatuses(subscriber);\n return this._innerSubscribe(subscriber);\n }\n\n /** @internal */\n protected _innerSubscribe(subscriber: Subscriber) {\n const { hasError, isStopped, observers } = this;\n if (hasError || isStopped) {\n return EMPTY_SUBSCRIPTION;\n }\n this.currentObservers = null;\n observers.push(subscriber);\n return new Subscription(() => {\n this.currentObservers = null;\n arrRemove(observers, subscriber);\n });\n }\n\n /** @internal */\n protected _checkFinalizedStatuses(subscriber: Subscriber) {\n const { hasError, thrownError, isStopped } = this;\n if (hasError) {\n subscriber.error(thrownError);\n } else if (isStopped) {\n subscriber.complete();\n }\n }\n\n /**\n * Creates a new Observable with this Subject as the source. You can do this\n * to create custom Observer-side logic of the Subject and conceal it from\n * code that uses the Observable.\n * @return {Observable} Observable that the Subject casts to\n */\n asObservable(): Observable {\n const observable: any = new Observable();\n observable.source = this;\n return observable;\n }\n}\n\n/**\n * @class AnonymousSubject\n */\nexport class AnonymousSubject extends Subject {\n constructor(\n /** @deprecated Internal implementation detail, do not use directly. Will be made internal in v8. */\n public destination?: Observer,\n source?: Observable\n ) {\n super();\n this.source = source;\n }\n\n next(value: T) {\n this.destination?.next?.(value);\n }\n\n error(err: any) {\n this.destination?.error?.(err);\n }\n\n complete() {\n this.destination?.complete?.();\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n return this.source?.subscribe(subscriber) ?? EMPTY_SUBSCRIPTION;\n }\n}\n", "import { TimestampProvider } from '../types';\n\ninterface DateTimestampProvider extends TimestampProvider {\n delegate: TimestampProvider | undefined;\n}\n\nexport const dateTimestampProvider: DateTimestampProvider = {\n now() {\n // Use the variable rather than `this` so that the function can be called\n // without being bound to the provider.\n return (dateTimestampProvider.delegate || Date).now();\n },\n delegate: undefined,\n};\n", "import { Subject } from './Subject';\nimport { TimestampProvider } from './types';\nimport { Subscriber } from './Subscriber';\nimport { Subscription } from './Subscription';\nimport { dateTimestampProvider } from './scheduler/dateTimestampProvider';\n\n/**\n * A variant of {@link Subject} that \"replays\" old values to new subscribers by emitting them when they first subscribe.\n *\n * `ReplaySubject` has an internal buffer that will store a specified number of values that it has observed. Like `Subject`,\n * `ReplaySubject` \"observes\" values by having them passed to its `next` method. When it observes a value, it will store that\n * value for a time determined by the configuration of the `ReplaySubject`, as passed to its constructor.\n *\n * When a new subscriber subscribes to the `ReplaySubject` instance, it will synchronously emit all values in its buffer in\n * a First-In-First-Out (FIFO) manner. The `ReplaySubject` will also complete, if it has observed completion; and it will\n * error if it has observed an error.\n *\n * There are two main configuration items to be concerned with:\n *\n * 1. `bufferSize` - This will determine how many items are stored in the buffer, defaults to infinite.\n * 2. `windowTime` - The amount of time to hold a value in the buffer before removing it from the buffer.\n *\n * Both configurations may exist simultaneously. So if you would like to buffer a maximum of 3 values, as long as the values\n * are less than 2 seconds old, you could do so with a `new ReplaySubject(3, 2000)`.\n *\n * ### Differences with BehaviorSubject\n *\n * `BehaviorSubject` is similar to `new ReplaySubject(1)`, with a couple of exceptions:\n *\n * 1. `BehaviorSubject` comes \"primed\" with a single value upon construction.\n * 2. `ReplaySubject` will replay values, even after observing an error, where `BehaviorSubject` will not.\n *\n * @see {@link Subject}\n * @see {@link BehaviorSubject}\n * @see {@link shareReplay}\n */\nexport class ReplaySubject extends Subject {\n private _buffer: (T | number)[] = [];\n private _infiniteTimeWindow = true;\n\n /**\n * @param bufferSize The size of the buffer to replay on subscription\n * @param windowTime The amount of time the buffered items will stay buffered\n * @param timestampProvider An object with a `now()` method that provides the current timestamp. This is used to\n * calculate the amount of time something has been buffered.\n */\n constructor(\n private _bufferSize = Infinity,\n private _windowTime = Infinity,\n private _timestampProvider: TimestampProvider = dateTimestampProvider\n ) {\n super();\n this._infiniteTimeWindow = _windowTime === Infinity;\n this._bufferSize = Math.max(1, _bufferSize);\n this._windowTime = Math.max(1, _windowTime);\n }\n\n next(value: T): void {\n const { isStopped, _buffer, _infiniteTimeWindow, _timestampProvider, _windowTime } = this;\n if (!isStopped) {\n _buffer.push(value);\n !_infiniteTimeWindow && _buffer.push(_timestampProvider.now() + _windowTime);\n }\n this._trimBuffer();\n super.next(value);\n }\n\n /** @internal */\n protected _subscribe(subscriber: Subscriber): Subscription {\n this._throwIfClosed();\n this._trimBuffer();\n\n const subscription = this._innerSubscribe(subscriber);\n\n const { _infiniteTimeWindow, _buffer } = this;\n // We use a copy here, so reentrant code does not mutate our array while we're\n // emitting it to a new subscriber.\n const copy = _buffer.slice();\n for (let i = 0; i < copy.length && !subscriber.closed; i += _infiniteTimeWindow ? 1 : 2) {\n subscriber.next(copy[i] as T);\n }\n\n this._checkFinalizedStatuses(subscriber);\n\n return subscription;\n }\n\n private _trimBuffer() {\n const { _bufferSize, _timestampProvider, _buffer, _infiniteTimeWindow } = this;\n // If we don't have an infinite buffer size, and we're over the length,\n // use splice to truncate the old buffer values off. Note that we have to\n // double the size for instances where we're not using an infinite time window\n // because we're storing the values and the timestamps in the same array.\n const adjustedBufferSize = (_infiniteTimeWindow ? 1 : 2) * _bufferSize;\n _bufferSize < Infinity && adjustedBufferSize < _buffer.length && _buffer.splice(0, _buffer.length - adjustedBufferSize);\n\n // Now, if we're not in an infinite time window, remove all values where the time is\n // older than what is allowed.\n if (!_infiniteTimeWindow) {\n const now = _timestampProvider.now();\n let last = 0;\n // Search the array for the first timestamp that isn't expired and\n // truncate the buffer up to that point.\n for (let i = 1; i < _buffer.length && (_buffer[i] as number) <= now; i += 2) {\n last = i;\n }\n last && _buffer.splice(0, last + 1);\n }\n }\n}\n", "import { Scheduler } from '../Scheduler';\nimport { Subscription } from '../Subscription';\nimport { SchedulerAction } from '../types';\n\n/**\n * A unit of work to be executed in a `scheduler`. An action is typically\n * created from within a {@link SchedulerLike} and an RxJS user does not need to concern\n * themselves about creating and manipulating an Action.\n *\n * ```ts\n * class Action extends Subscription {\n * new (scheduler: Scheduler, work: (state?: T) => void);\n * schedule(state?: T, delay: number = 0): Subscription;\n * }\n * ```\n *\n * @class Action\n */\nexport class Action extends Subscription {\n constructor(scheduler: Scheduler, work: (this: SchedulerAction, state?: T) => void) {\n super();\n }\n /**\n * Schedules this action on its parent {@link SchedulerLike} for execution. May be passed\n * some context object, `state`. May happen at some point in the future,\n * according to the `delay` parameter, if specified.\n * @param {T} [state] Some contextual data that the `work` function uses when\n * called by the Scheduler.\n * @param {number} [delay] Time to wait before executing the work, where the\n * time unit is implicit and defined by the Scheduler.\n * @return {void}\n */\n public schedule(state?: T, delay: number = 0): Subscription {\n return this;\n }\n}\n", "import type { TimerHandle } from './timerHandle';\ntype SetIntervalFunction = (handler: () => void, timeout?: number, ...args: any[]) => TimerHandle;\ntype ClearIntervalFunction = (handle: TimerHandle) => void;\n\ninterface IntervalProvider {\n setInterval: SetIntervalFunction;\n clearInterval: ClearIntervalFunction;\n delegate:\n | {\n setInterval: SetIntervalFunction;\n clearInterval: ClearIntervalFunction;\n }\n | undefined;\n}\n\nexport const intervalProvider: IntervalProvider = {\n // When accessing the delegate, use the variable rather than `this` so that\n // the functions can be called without being bound to the provider.\n setInterval(handler: () => void, timeout?: number, ...args) {\n const { delegate } = intervalProvider;\n if (delegate?.setInterval) {\n return delegate.setInterval(handler, timeout, ...args);\n }\n return setInterval(handler, timeout, ...args);\n },\n clearInterval(handle) {\n const { delegate } = intervalProvider;\n return (delegate?.clearInterval || clearInterval)(handle as any);\n },\n delegate: undefined,\n};\n", "import { Action } from './Action';\nimport { SchedulerAction } from '../types';\nimport { Subscription } from '../Subscription';\nimport { AsyncScheduler } from './AsyncScheduler';\nimport { intervalProvider } from './intervalProvider';\nimport { arrRemove } from '../util/arrRemove';\nimport { TimerHandle } from './timerHandle';\n\nexport class AsyncAction extends Action {\n public id: TimerHandle | undefined;\n public state?: T;\n // @ts-ignore: Property has no initializer and is not definitely assigned\n public delay: number;\n protected pending: boolean = false;\n\n constructor(protected scheduler: AsyncScheduler, protected work: (this: SchedulerAction, state?: T) => void) {\n super(scheduler, work);\n }\n\n public schedule(state?: T, delay: number = 0): Subscription {\n if (this.closed) {\n return this;\n }\n\n // Always replace the current state with the new state.\n this.state = state;\n\n const id = this.id;\n const scheduler = this.scheduler;\n\n //\n // Important implementation note:\n //\n // Actions only execute once by default, unless rescheduled from within the\n // scheduled callback. This allows us to implement single and repeat\n // actions via the same code path, without adding API surface area, as well\n // as mimic traditional recursion but across asynchronous boundaries.\n //\n // However, JS runtimes and timers distinguish between intervals achieved by\n // serial `setTimeout` calls vs. a single `setInterval` call. An interval of\n // serial `setTimeout` calls can be individually delayed, which delays\n // scheduling the next `setTimeout`, and so on. `setInterval` attempts to\n // guarantee the interval callback will be invoked more precisely to the\n // interval period, regardless of load.\n //\n // Therefore, we use `setInterval` to schedule single and repeat actions.\n // If the action reschedules itself with the same delay, the interval is not\n // canceled. If the action doesn't reschedule, or reschedules with a\n // different delay, the interval will be canceled after scheduled callback\n // execution.\n //\n if (id != null) {\n this.id = this.recycleAsyncId(scheduler, id, delay);\n }\n\n // Set the pending flag indicating that this action has been scheduled, or\n // has recursively rescheduled itself.\n this.pending = true;\n\n this.delay = delay;\n // If this action has already an async Id, don't request a new one.\n this.id = this.id ?? this.requestAsyncId(scheduler, this.id, delay);\n\n return this;\n }\n\n protected requestAsyncId(scheduler: AsyncScheduler, _id?: TimerHandle, delay: number = 0): TimerHandle {\n return intervalProvider.setInterval(scheduler.flush.bind(scheduler, this), delay);\n }\n\n protected recycleAsyncId(_scheduler: AsyncScheduler, id?: TimerHandle, delay: number | null = 0): TimerHandle | undefined {\n // If this action is rescheduled with the same delay time, don't clear the interval id.\n if (delay != null && this.delay === delay && this.pending === false) {\n return id;\n }\n // Otherwise, if the action's delay time is different from the current delay,\n // or the action has been rescheduled before it's executed, clear the interval id\n if (id != null) {\n intervalProvider.clearInterval(id);\n }\n\n return undefined;\n }\n\n /**\n * Immediately executes this action and the `work` it contains.\n * @return {any}\n */\n public execute(state: T, delay: number): any {\n if (this.closed) {\n return new Error('executing a cancelled action');\n }\n\n this.pending = false;\n const error = this._execute(state, delay);\n if (error) {\n return error;\n } else if (this.pending === false && this.id != null) {\n // Dequeue if the action didn't reschedule itself. Don't call\n // unsubscribe(), because the action could reschedule later.\n // For example:\n // ```\n // scheduler.schedule(function doWork(counter) {\n // /* ... I'm a busy worker bee ... */\n // var originalAction = this;\n // /* wait 100ms before rescheduling the action */\n // setTimeout(function () {\n // originalAction.schedule(counter + 1);\n // }, 100);\n // }, 1000);\n // ```\n this.id = this.recycleAsyncId(this.scheduler, this.id, null);\n }\n }\n\n protected _execute(state: T, _delay: number): any {\n let errored: boolean = false;\n let errorValue: any;\n try {\n this.work(state);\n } catch (e) {\n errored = true;\n // HACK: Since code elsewhere is relying on the \"truthiness\" of the\n // return here, we can't have it return \"\" or 0 or false.\n // TODO: Clean this up when we refactor schedulers mid-version-8 or so.\n errorValue = e ? e : new Error('Scheduled action threw falsy error');\n }\n if (errored) {\n this.unsubscribe();\n return errorValue;\n }\n }\n\n unsubscribe() {\n if (!this.closed) {\n const { id, scheduler } = this;\n const { actions } = scheduler;\n\n this.work = this.state = this.scheduler = null!;\n this.pending = false;\n\n arrRemove(actions, this);\n if (id != null) {\n this.id = this.recycleAsyncId(scheduler, id, null);\n }\n\n this.delay = null!;\n super.unsubscribe();\n }\n }\n}\n", "import { Action } from './scheduler/Action';\nimport { Subscription } from './Subscription';\nimport { SchedulerLike, SchedulerAction } from './types';\nimport { dateTimestampProvider } from './scheduler/dateTimestampProvider';\n\n/**\n * An execution context and a data structure to order tasks and schedule their\n * execution. Provides a notion of (potentially virtual) time, through the\n * `now()` getter method.\n *\n * Each unit of work in a Scheduler is called an `Action`.\n *\n * ```ts\n * class Scheduler {\n * now(): number;\n * schedule(work, delay?, state?): Subscription;\n * }\n * ```\n *\n * @class Scheduler\n * @deprecated Scheduler is an internal implementation detail of RxJS, and\n * should not be used directly. Rather, create your own class and implement\n * {@link SchedulerLike}. Will be made internal in v8.\n */\nexport class Scheduler implements SchedulerLike {\n public static now: () => number = dateTimestampProvider.now;\n\n constructor(private schedulerActionCtor: typeof Action, now: () => number = Scheduler.now) {\n this.now = now;\n }\n\n /**\n * A getter method that returns a number representing the current time\n * (at the time this function was called) according to the scheduler's own\n * internal clock.\n * @return {number} A number that represents the current time. May or may not\n * have a relation to wall-clock time. May or may not refer to a time unit\n * (e.g. milliseconds).\n */\n public now: () => number;\n\n /**\n * Schedules a function, `work`, for execution. May happen at some point in\n * the future, according to the `delay` parameter, if specified. May be passed\n * some context object, `state`, which will be passed to the `work` function.\n *\n * The given arguments will be processed an stored as an Action object in a\n * queue of actions.\n *\n * @param {function(state: ?T): ?Subscription} work A function representing a\n * task, or some unit of work to be executed by the Scheduler.\n * @param {number} [delay] Time to wait before executing the work, where the\n * time unit is implicit and defined by the Scheduler itself.\n * @param {T} [state] Some contextual data that the `work` function uses when\n * called by the Scheduler.\n * @return {Subscription} A subscription in order to be able to unsubscribe\n * the scheduled work.\n */\n public schedule(work: (this: SchedulerAction, state?: T) => void, delay: number = 0, state?: T): Subscription {\n return new this.schedulerActionCtor(this, work).schedule(state, delay);\n }\n}\n", "import { Scheduler } from '../Scheduler';\nimport { Action } from './Action';\nimport { AsyncAction } from './AsyncAction';\nimport { TimerHandle } from './timerHandle';\n\nexport class AsyncScheduler extends Scheduler {\n public actions: Array> = [];\n /**\n * A flag to indicate whether the Scheduler is currently executing a batch of\n * queued actions.\n * @type {boolean}\n * @internal\n */\n public _active: boolean = false;\n /**\n * An internal ID used to track the latest asynchronous task such as those\n * coming from `setTimeout`, `setInterval`, `requestAnimationFrame`, and\n * others.\n * @type {any}\n * @internal\n */\n public _scheduled: TimerHandle | undefined;\n\n constructor(SchedulerAction: typeof Action, now: () => number = Scheduler.now) {\n super(SchedulerAction, now);\n }\n\n public flush(action: AsyncAction): void {\n const { actions } = this;\n\n if (this._active) {\n actions.push(action);\n return;\n }\n\n let error: any;\n this._active = true;\n\n do {\n if ((error = action.execute(action.state, action.delay))) {\n break;\n }\n } while ((action = actions.shift()!)); // exhaust the scheduler queue\n\n this._active = false;\n\n if (error) {\n while ((action = actions.shift()!)) {\n action.unsubscribe();\n }\n throw error;\n }\n }\n}\n", "import { AsyncAction } from './AsyncAction';\nimport { AsyncScheduler } from './AsyncScheduler';\n\n/**\n *\n * Async Scheduler\n *\n * Schedule task as if you used setTimeout(task, duration)\n *\n * `async` scheduler schedules tasks asynchronously, by putting them on the JavaScript\n * event loop queue. It is best used to delay tasks in time or to schedule tasks repeating\n * in intervals.\n *\n * If you just want to \"defer\" task, that is to perform it right after currently\n * executing synchronous code ends (commonly achieved by `setTimeout(deferredTask, 0)`),\n * better choice will be the {@link asapScheduler} scheduler.\n *\n * ## Examples\n * Use async scheduler to delay task\n * ```ts\n * import { asyncScheduler } from 'rxjs';\n *\n * const task = () => console.log('it works!');\n *\n * asyncScheduler.schedule(task, 2000);\n *\n * // After 2 seconds logs:\n * // \"it works!\"\n * ```\n *\n * Use async scheduler to repeat task in intervals\n * ```ts\n * import { asyncScheduler } from 'rxjs';\n *\n * function task(state) {\n * console.log(state);\n * this.schedule(state + 1, 1000); // `this` references currently executing Action,\n * // which we reschedule with new state and delay\n * }\n *\n * asyncScheduler.schedule(task, 3000, 0);\n *\n * // Logs:\n * // 0 after 3s\n * // 1 after 4s\n * // 2 after 5s\n * // 3 after 6s\n * ```\n */\n\nexport const asyncScheduler = new AsyncScheduler(AsyncAction);\n\n/**\n * @deprecated Renamed to {@link asyncScheduler}. Will be removed in v8.\n */\nexport const async = asyncScheduler;\n", "import { AsyncAction } from './AsyncAction';\nimport { AnimationFrameScheduler } from './AnimationFrameScheduler';\nimport { SchedulerAction } from '../types';\nimport { animationFrameProvider } from './animationFrameProvider';\nimport { TimerHandle } from './timerHandle';\n\nexport class AnimationFrameAction extends AsyncAction {\n constructor(protected scheduler: AnimationFrameScheduler, protected work: (this: SchedulerAction, state?: T) => void) {\n super(scheduler, work);\n }\n\n protected requestAsyncId(scheduler: AnimationFrameScheduler, id?: TimerHandle, delay: number = 0): TimerHandle {\n // If delay is greater than 0, request as an async action.\n if (delay !== null && delay > 0) {\n return super.requestAsyncId(scheduler, id, delay);\n }\n // Push the action to the end of the scheduler queue.\n scheduler.actions.push(this);\n // If an animation frame has already been requested, don't request another\n // one. If an animation frame hasn't been requested yet, request one. Return\n // the current animation frame request id.\n return scheduler._scheduled || (scheduler._scheduled = animationFrameProvider.requestAnimationFrame(() => scheduler.flush(undefined)));\n }\n\n protected recycleAsyncId(scheduler: AnimationFrameScheduler, id?: TimerHandle, delay: number = 0): TimerHandle | undefined {\n // If delay exists and is greater than 0, or if the delay is null (the\n // action wasn't rescheduled) but was originally scheduled as an async\n // action, then recycle as an async action.\n if (delay != null ? delay > 0 : this.delay > 0) {\n return super.recycleAsyncId(scheduler, id, delay);\n }\n // If the scheduler queue has no remaining actions with the same async id,\n // cancel the requested animation frame and set the scheduled flag to\n // undefined so the next AnimationFrameAction will request its own.\n const { actions } = scheduler;\n if (id != null && actions[actions.length - 1]?.id !== id) {\n animationFrameProvider.cancelAnimationFrame(id as number);\n scheduler._scheduled = undefined;\n }\n // Return undefined so the action knows to request a new async id if it's rescheduled.\n return undefined;\n }\n}\n", "import { AsyncAction } from './AsyncAction';\nimport { AsyncScheduler } from './AsyncScheduler';\n\nexport class AnimationFrameScheduler extends AsyncScheduler {\n public flush(action?: AsyncAction): void {\n this._active = true;\n // The async id that effects a call to flush is stored in _scheduled.\n // Before executing an action, it's necessary to check the action's async\n // id to determine whether it's supposed to be executed in the current\n // flush.\n // Previous implementations of this method used a count to determine this,\n // but that was unsound, as actions that are unsubscribed - i.e. cancelled -\n // are removed from the actions array and that can shift actions that are\n // scheduled to be executed in a subsequent flush into positions at which\n // they are executed within the current flush.\n const flushId = this._scheduled;\n this._scheduled = undefined;\n\n const { actions } = this;\n let error: any;\n action = action || actions.shift()!;\n\n do {\n if ((error = action.execute(action.state, action.delay))) {\n break;\n }\n } while ((action = actions[0]) && action.id === flushId && actions.shift());\n\n this._active = false;\n\n if (error) {\n while ((action = actions[0]) && action.id === flushId && actions.shift()) {\n action.unsubscribe();\n }\n throw error;\n }\n }\n}\n", "import { AnimationFrameAction } from './AnimationFrameAction';\nimport { AnimationFrameScheduler } from './AnimationFrameScheduler';\n\n/**\n *\n * Animation Frame Scheduler\n *\n * Perform task when `window.requestAnimationFrame` would fire\n *\n * When `animationFrame` scheduler is used with delay, it will fall back to {@link asyncScheduler} scheduler\n * behaviour.\n *\n * Without delay, `animationFrame` scheduler can be used to create smooth browser animations.\n * It makes sure scheduled task will happen just before next browser content repaint,\n * thus performing animations as efficiently as possible.\n *\n * ## Example\n * Schedule div height animation\n * ```ts\n * // html:
\n * import { animationFrameScheduler } from 'rxjs';\n *\n * const div = document.querySelector('div');\n *\n * animationFrameScheduler.schedule(function(height) {\n * div.style.height = height + \"px\";\n *\n * this.schedule(height + 1); // `this` references currently executing Action,\n * // which we reschedule with new state\n * }, 0, 0);\n *\n * // You will see a div element growing in height\n * ```\n */\n\nexport const animationFrameScheduler = new AnimationFrameScheduler(AnimationFrameAction);\n\n/**\n * @deprecated Renamed to {@link animationFrameScheduler}. Will be removed in v8.\n */\nexport const animationFrame = animationFrameScheduler;\n", "import { Observable } from '../Observable';\nimport { SchedulerLike } from '../types';\n\n/**\n * A simple Observable that emits no items to the Observer and immediately\n * emits a complete notification.\n *\n * Just emits 'complete', and nothing else.\n *\n * ![](empty.png)\n *\n * A simple Observable that only emits the complete notification. It can be used\n * for composing with other Observables, such as in a {@link mergeMap}.\n *\n * ## Examples\n *\n * Log complete notification\n *\n * ```ts\n * import { EMPTY } from 'rxjs';\n *\n * EMPTY.subscribe({\n * next: () => console.log('Next'),\n * complete: () => console.log('Complete!')\n * });\n *\n * // Outputs\n * // Complete!\n * ```\n *\n * Emit the number 7, then complete\n *\n * ```ts\n * import { EMPTY, startWith } from 'rxjs';\n *\n * const result = EMPTY.pipe(startWith(7));\n * result.subscribe(x => console.log(x));\n *\n * // Outputs\n * // 7\n * ```\n *\n * Map and flatten only odd numbers to the sequence `'a'`, `'b'`, `'c'`\n *\n * ```ts\n * import { interval, mergeMap, of, EMPTY } from 'rxjs';\n *\n * const interval$ = interval(1000);\n * const result = interval$.pipe(\n * mergeMap(x => x % 2 === 1 ? of('a', 'b', 'c') : EMPTY),\n * );\n * result.subscribe(x => console.log(x));\n *\n * // Results in the following to the console:\n * // x is equal to the count on the interval, e.g. (0, 1, 2, 3, ...)\n * // x will occur every 1000ms\n * // if x % 2 is equal to 1, print a, b, c (each on its own)\n * // if x % 2 is not equal to 1, nothing will be output\n * ```\n *\n * @see {@link Observable}\n * @see {@link NEVER}\n * @see {@link of}\n * @see {@link throwError}\n */\nexport const EMPTY = new Observable((subscriber) => subscriber.complete());\n\n/**\n * @param scheduler A {@link SchedulerLike} to use for scheduling\n * the emission of the complete notification.\n * @deprecated Replaced with the {@link EMPTY} constant or {@link scheduled} (e.g. `scheduled([], scheduler)`). Will be removed in v8.\n */\nexport function empty(scheduler?: SchedulerLike) {\n return scheduler ? emptyScheduled(scheduler) : EMPTY;\n}\n\nfunction emptyScheduled(scheduler: SchedulerLike) {\n return new Observable((subscriber) => scheduler.schedule(() => subscriber.complete()));\n}\n", "import { SchedulerLike } from '../types';\nimport { isFunction } from './isFunction';\n\nexport function isScheduler(value: any): value is SchedulerLike {\n return value && isFunction(value.schedule);\n}\n", "import { SchedulerLike } from '../types';\nimport { isFunction } from './isFunction';\nimport { isScheduler } from './isScheduler';\n\nfunction last(arr: T[]): T | undefined {\n return arr[arr.length - 1];\n}\n\nexport function popResultSelector(args: any[]): ((...args: unknown[]) => unknown) | undefined {\n return isFunction(last(args)) ? args.pop() : undefined;\n}\n\nexport function popScheduler(args: any[]): SchedulerLike | undefined {\n return isScheduler(last(args)) ? args.pop() : undefined;\n}\n\nexport function popNumber(args: any[], defaultValue: number): number {\n return typeof last(args) === 'number' ? args.pop()! : defaultValue;\n}\n", "export const isArrayLike = ((x: any): x is ArrayLike => x && typeof x.length === 'number' && typeof x !== 'function');", "import { isFunction } from \"./isFunction\";\n\n/**\n * Tests to see if the object is \"thennable\".\n * @param value the object to test\n */\nexport function isPromise(value: any): value is PromiseLike {\n return isFunction(value?.then);\n}\n", "import { InteropObservable } from '../types';\nimport { observable as Symbol_observable } from '../symbol/observable';\nimport { isFunction } from './isFunction';\n\n/** Identifies an input as being Observable (but not necessary an Rx Observable) */\nexport function isInteropObservable(input: any): input is InteropObservable {\n return isFunction(input[Symbol_observable]);\n}\n", "import { isFunction } from './isFunction';\n\nexport function isAsyncIterable(obj: any): obj is AsyncIterable {\n return Symbol.asyncIterator && isFunction(obj?.[Symbol.asyncIterator]);\n}\n", "/**\n * Creates the TypeError to throw if an invalid object is passed to `from` or `scheduled`.\n * @param input The object that was passed.\n */\nexport function createInvalidObservableTypeError(input: any) {\n // TODO: We should create error codes that can be looked up, so this can be less verbose.\n return new TypeError(\n `You provided ${\n input !== null && typeof input === 'object' ? 'an invalid object' : `'${input}'`\n } where a stream was expected. You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.`\n );\n}\n", "export function getSymbolIterator(): symbol {\n if (typeof Symbol !== 'function' || !Symbol.iterator) {\n return '@@iterator' as any;\n }\n\n return Symbol.iterator;\n}\n\nexport const iterator = getSymbolIterator();\n", "import { iterator as Symbol_iterator } from '../symbol/iterator';\nimport { isFunction } from './isFunction';\n\n/** Identifies an input as being an Iterable */\nexport function isIterable(input: any): input is Iterable {\n return isFunction(input?.[Symbol_iterator]);\n}\n", "import { ReadableStreamLike } from '../types';\nimport { isFunction } from './isFunction';\n\nexport async function* readableStreamLikeToAsyncGenerator(readableStream: ReadableStreamLike): AsyncGenerator {\n const reader = readableStream.getReader();\n try {\n while (true) {\n const { value, done } = await reader.read();\n if (done) {\n return;\n }\n yield value!;\n }\n } finally {\n reader.releaseLock();\n }\n}\n\nexport function isReadableStreamLike(obj: any): obj is ReadableStreamLike {\n // We don't want to use instanceof checks because they would return\n // false for instances from another Realm, like an + +
+

Links and information required to join the event virtually will be delivered via e-mail.

+

When

+

The event will run all day on Saturday, April 22, 2023.

+

Register

+
+
+
Online
+
+ $0.00 +
+ +
    +
  • 📺   Watch the talks live online +
  • 📹   Review recordings afterwards +
+ +
+ RSVP +
+ +
+
+
In-Person
+
+ $350 +
+ +
    +
  • 🗣   Watch the talks live in real life +
  • 🤝   Network with builders and vendors in ML +
  • 👕   Get some conference swag +
  • 🥞   Eat food +
  • 🔜   Register by April 15th! +
+ + + +
+
+ +

Sponsors

+

We're currently looking for organizations that are interested +in sponsoring this event!

+

See this page +for details +and contact +sponsorships@fullstackdeeplearning.com +with inquiries.

+ + + + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + + + + + + + + + + + + +
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/conference/sponsors/index.html b/conference/sponsors/index.html new file mode 100644 index 00000000..af212b77 --- /dev/null +++ b/conference/sponsors/index.html @@ -0,0 +1,2182 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Info for Sponsors - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + + + + + + + + + + + + + +
+
+ + + + + + + + +

Info for Sponsors

+

We offer three tiers of sponsorship for the FSDL 2023 Conference:

+
    +
  1. +

    Vector Tier sponsors receive the following benefits: logo displayed on website and during virtual conference, verbal acknowledgement , and one registration. Vector Tier sponsorships are available for $500.

    +
  2. +
  3. +

    Matrix Tier sponsors receive all the benefits of the Vector Tier, plus: logo displayed in between talks, 8' by 10' area for booth setup, and an additional registration (for a total of two). Matrix Tier sponsorships are available for $1500.

    +
  4. +
  5. +

    Tensor Tier sponsors receive all the benefits of the Matrix Tier, plus: logo displayed on a banner in the registration area, access to an opt-in database of attendees who are interested in tooling or recruitment, and two additional registrations (for a toral of four). Tensor Tier sponsorships are available for $2500.

    +
  6. +
+

Contact +sponsorships@fullstackdeeplearning.com +if you're interested in sponsoring the conference!

+ + + + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/announcement/index.html b/course/2022/announcement/index.html new file mode 100644 index 00000000..9f65eade --- /dev/null +++ b/course/2022/announcement/index.html @@ -0,0 +1,2534 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Course Announcement - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

Full Stack Deep Learning 2022 Course Announcement

+
+

Info

+

Looking for the latest edition of the course? Click here.

+

Want to be among the first to hear about future iterations of the course? +Simply enter your email below, follow us on + Twitter, +or subscribe to our + YouTube channel.

+

+

+
+
+
+ + + +
+
+ + +
+ +
+
+
+

+ +
+

What you will learn

+

Our course incorporates feedback from thousands of learners over 5 years of +teaching production machine learning.

+

We've updated all materials with best practices as of 2022 and added some brand new topics:

+
    +
  • Formulating the problem and estimating project cost
  • +
  • Sourcing, cleaning, processing, labeling, synthesizing, and augmenting data
  • +
  • Picking the right framework and compute infrastructure
  • +
  • Troubleshooting training and ensuring reproducibility
  • +
  • Deploying the model at scale
  • +
  • ✨ Monitoring and continually improving the deployed model ✨
  • +
  • ✨ How ML teams work and how to manage ML projects ✨
  • +
  • ✨ Building on Large Language Models and other Foundation Models ✨
  • +
+

+ +

+ +

Labs

+

Through the first few weeks of the course, you will construct an end-to-end ML system, with best practices as of 2022.

+ +

Project

+

Once the labs are done, you'll begin working on your own portfolio project. +You'll create a working a working ML-powered application of your choice and share it with your fellow learners, +getting feedback from course staff.

+

Selected projects will get the opportunity to share their work with the broader FSDL community.

+

Who is this for

+

ML Researchers and Engineers, +MS students, +software engineers looking to get into ML, +data scientists looking to up their software engineering game, +and PMs on ML teams will all benefit from materials in our course.

+

You will get the most out of this course if you have:

+
    +
  • At least one year of experience programming in Python.
  • +
  • At least one deep learning course (at a university or online).
  • +
  • Experience with code versioning, Unix environments, and software engineering.
  • +
  • At least four hours a week to commit to learning, split across lectures, Q&A, labs, reading, and project work.
  • +
+

We review the fundamentals of deep learning (backprop, MLPs, CNNs, Transformers) in supplementary lectures released before the start of the course — but you should not expect to learn this material for the first time from these.

+

Testimonials

+ + +

Instructors

+
+

Photo of Charles Frye +

+
+

Charles Frye teaches people on the internet. He worked in education and growth at Weights & Biases after getting a PhD in Neuroscience at UC Berkeley. He now works as a consultant, including for Gantry. +

+
+
+
+

Photo of Sergey Karayev +

+
+

Sergey Karayev is Co-founder of Volition. He co-founded Gradescope after getting a PhD in Computer Vision at UC Berkeley. +

+
+
+
+

Photo of Josh Tobin +

+
+

Josh Tobin is Co-founder and CEO of Gantry. He worked as a Research Scientist at OpenAI and received a PhD in AI at UC Berkeley. +

+
+
+

Registration

+
+
+
On-Your-Own
+
+ $0.00 +
+ + +
    +
  • 📹   Lecture and lab videos +
+ + + +
+
+
Cohort-Based
+
+ $495 +
+
+
50% discount for students and academics
+
+ +
    +
  • 📹   Lecture and lab videos +
  • 🗣   Private Discord with instructors +
  • ❓   Weekly live online Q&A +
  • 💲   Compute credits +
  • 🎨   Team-based portfolio project with feedback +
  • 🏆   Certificate of completion +
  • 🔜   Starts August 8th, 2022 +
+ +
+
Registration Closed
+
+ +
+
+ +
+

Bummed that you missed registration for the 2022 course?

+

To make sure you hear about the next iteration, +enter your email below to sign up for our mailing list.

+

+ +

+
+
+ + + +
+
+
+
+

+
+

Cohort-Based Course FAQs

+

Read more about the 2022 synchronous cohort +here.

+

How long is the course? How much time do I need to devote to it?

+

The course is 10 weeks long, from August 8, 2022 to October 14, 2022. +During that time, you should expect to spend about four hours a week on the course. +In the last four weeks, during which you'll be working on the project, +devoting more time will result in a final project that you can be even more proud to put in your portfolio.

+

Which components of the course are synchronous? What days and times will synchronous events occur?

+

The Q&A sessions are synchronous. Their timing will be set based on student availability.

+

Lectures are pre-recorded, with the Q&A session serving as a live +"flipped classroom" +for discussing the lecture content.

+

Labs are available online, with a pre-recorded walkthrough video. +We encourage students to form study groups, via the Discord server, +for working on the lab material.

+

How do the projects work?

+

Teams of five will work for four weeks to build an ML-powered application +of their choosing: a chatbot, an image editor, a natural language speech interface, +or anything else you can imagine building with ML.

+

At the end, they will present their work to the rest of the class.

+

Project proposals and final projects will be reviewed by course staff. +We'll provide detailed feedback, but grading will be done based on completion. +The best projects will be featured by FSDL on YouTube, Twitter, and at our summits.

+
+

Questions about registration or the class that aren't answered here?

+

Tweet at us @full_stack_dl or email admin@fullstackdeeplearning.com.

+
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/cohort/index.html b/course/2022/cohort/index.html new file mode 100644 index 00000000..3f55abaa --- /dev/null +++ b/course/2022/cohort/index.html @@ -0,0 +1,2259 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Synchronous Cohort Option - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Synchronous Cohort Option

+

We organized the synchronous cohort for the 2022 course via Discord.

+

It's project time! How do I access my free GPUs?

+

Check out the instructions +here.

+

How do I know if I'm fully registered?

+

If you completed your registration for the course, +you should be able to access the #course-announcements channel +here.

+

Please email us at registration@fullstackdeeplearning.com +if you registered for the course but cannot access the Discord.

+

I've registered and I'm in the Discord. How do I follow along?

+

For a quick list of the most important links for cohort members, see +this Discord post. +You can add the private course calendar linked there +to your own so you never miss an event.

+

For a tour of the Discord, plus some tips and tricks, +check out the video below:

+ + +

How do I control my notifications in Discord?

+

To stay apprised of updates without getting overwhelmed, we suggest these notification settings: +notification settings

+

You can find them here in the Discord web client: +notification-menu

+

Only course staff can use the remaining notification stream, @mentions.

+

If you turn off @role/@everyone/@here, you won't see announcements about live events and changes of schedule.

+

If you want to further reduce distractions, turn off the notification dot.

+

You can also mute individual channels, leaving only #course-announcements off mute.

+

Only instructors can post in the #course-announcements channel.

+

If you furthermore hide muted channels, the Discord interface will only show that channel, but you'll miss out on the chance to discover discussions in other channels.

+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/index.html b/course/2022/index.html new file mode 100644 index 00000000..9f711dc9 --- /dev/null +++ b/course/2022/index.html @@ -0,0 +1,2539 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + FSDL 2022 - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + + + +
+
+ + + + + + + + +

Full Stack Deep Learning - 2022 Course

+
+

Course Completed

+

All the lecture and lab material is free forever. Just check out the links below.

+

To be among the first to hear about future iterations of the course, +simply enter your email below, follow us on + Twitter, +or subscribe to our + YouTube channel.

+

+ +

+
+
+ + + +
+
+
+
+

+
+

Schedule

+

We released lecture videos on Mondays at 6pm Pacific and lab videos on Wednesdays at 6pm Pacific on + YouTube.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
WeekLectureLabProject
2022.07.25Pre-Labs 1-3: CNNs, Transformers, and PyTorch+Lightning-
2022.08.08Lecture 1: Course Vision and When to Use MLLab Overview-
2022.08.15Lecture 2: Development Infrastructure & ToolingLab 4: Experiment Management-
2022.08.22Lecture 3: Troubleshooting & TestingLab 5: Troubleshooting & Testing-
2022.08.29Lecture 4: Data ManagementLab 6: Data AnnotationStart forming groups
2022.09.05Lecture 5: DeploymentLab 7: Web DeploymentGroup proposals due
2022.09.12Lecture 6: Continual LearningLab 8: Model MonitoringWork on project
2022.09.19Lecture 7: Foundation ModelsWork on project
2022.09.26Lecture 8: ML Teams and Project ManagementWork on project
2022.10.03Lecture 9: EthicsWork on project
2022.10.10Project PresentationsProject due
+

Detailed Contents

+

Pre-Labs 1-3: CNNs, Transformers, PyTorch Lightning

+

We review some prerequisites -- +the DNN architectures we'll be using and basic model training with PyTorch -- +and introduce PyTorch Lightning. +Published August 10, 2022.

+

Lecture 1: Course Vision and When to Use ML

+

We review the purpose of the course and consider when it's a good (or bad!) idea to use ML. +Published August 8, 2022.

+

Lab Overview

+

We walk through the entire architecture of the application we will be building, from soup to nuts. +Published July 25, 2022.

+

Lecture 2: Development Infrastructure & Tooling

+

We tour the landscape of infrastructure and tooling for developing deep learning models. +Published August 15, 2022.

+

Lab 4: Experiment Management

+

We run, track, and manage model development experiments with Weights & Biases. +Published August 17, 2022.

+

Lecture 3: Troubleshooting & Testing

+

We look at tools and practices for testing software in general +and ML models in particular. +Published August 22, 2022.

+

Lab 5: Troubleshooting & Testing

+

We try out some Python testing tools and dissect a PyTorch trace to +learn performance troubleshooting techniques. +Published August 24, 2022.

+

Lecture 4: Data Management

+

We look at sourcing, storing, exploring, processing, labeling, and versioning data for deep learning. +Published August 29, 2022.

+

Lab 6: Data Annotation

+

We spin up a data annotation server and learn +just how messy data really is. +Published August 31, 2022.

+

Lecture 5: Data Management

+

We do a lightning tour of all the ways models are deployed +and do a deep dive on running models as web services. +Published September 5, 2022.

+

Lab 7: Web Deployment

+

We create and deploy our ML-powered +text recognition application +with a simple web UI and a serverless model service. +Published September 7, 2022.

+

Lecture 6: Continual Learning

+

We consider what it takes to build a +continual learning system around +an ML-powered application. +Published September 12, 2022.

+

Lab 8: Model Monitoring

+

We add user feedback to our ML application +and review data logged by actual users of the +FSDL Text Recognizer. +Published September 14, 2022.

+

Lecture 7: Foundation Models

+

We look at how to build on GPT-3, CLIP, StableDiffusion, and other large models. +Published September 19, 2022.

+

Lecture 8: ML Teams and Project Management

+

We look at the structure of ML teams and projects, +including how to hire or get hired on an ML team +and how to build an ML-first organization. +Published September 26, 2022.

+

Lecture 9: Ethics

+

We consider ethical concerns around buiding technlogy, +building with machine learning, and building artificial intelligence. +Published October 3, 2022.

+

Teaching Assistants

+

This course was only possible with the support of our amazing TAs (in alphabetical order):

+
    +
  • Andrew Mendez is a Deep Learning Solutions Engineer at DeterminedAI, working on computer vision and NLP solutions for defense and autonomous vehicle companies. Previously Andrew worked as an ML Engineer at Clarifai and CACI.
  • +
  • Daniel Hen is a Senior Data Scientist at Digital Turbine, working on Ad Tech and mobile solutions, as well as Big Data problems. Working with Spark, ML algorithms such as XGBoost, Computer Vision, and constantly learning new technology.
  • +
  • James Le runs Data Relations and Partnerships at Superb AI, a data management platform for computer vision use cases. Outside work, he writes data-centric blog posts, hosts a data-focused podcast, and organizes in-person events for the data community.
  • +
  • Saurabh Bipin Chandra is a Senior ML Scientist at Turnitin.
  • +
  • Sayak Paul is as a Machine Learning Engineer at Carted on NLP and representation learning from HTML webpages. Besides work, he contributes to various open-source projects.
  • +
  • Vishnu Rachakonda is a Data Scientist at firsthand.
  • +
+ + + + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lab-0-overview/index.html b/course/2022/lab-0-overview/index.html new file mode 100644 index 00000000..c4dc714e --- /dev/null +++ b/course/2022/lab-0-overview/index.html @@ -0,0 +1,2427 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab Overview - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+ +
+ + + +
+
+ + + + + + + + +

Lab Overview

+
+ +
+ +
+

By Charles Frye. Published July 25, 2022.

+
+
+ +
+ +

What are these labs for?

+

In the lab portion of Full Stack Deep Learning 2022, +we will incrementally develop a complete codebase +to train a deep neural network to recognize characters in hand-written paragraphs +and deploy it inside a simple web application.

+

These labs act as an opportunity to work through the nitty-gritty details that come up +when implementing some of the recommendations given in the lectures in a concrete system. +It's also a chance for you to gain familiarity with some of the tools we recommend +in the lectures.

+

This lab reviews the overall architecture of the system.

+

Architecture of the Text Recognizer

+

Software architectures are inherently about trade-offs: +decisions that make for better scaling might make for worse security or +tools that encourage faster iteration might reduce transparency.

+

We design our architecture with agility and simplicity as the prime directives. +We choose simplicity in order to empower individuals to understand the "full stack" of the application, +from GPUs crunching tensors in model development up to serverless cloud functions acting on requests in production. +And we choose agility so that individual is able to quickly iterate on the application, +especially in response to user feedback.

+

Architecture Diagram

+

We put together a handy architecture diagram summarizing the application here:

+
+ +
+ +

For a guided tour of this architecture, watch the video at the top of the page or +click the badge below to open an interactive Jupyter notebook on Google Colab:

+
+ +
+ +

Running the labs

+

One-click setup on Colab

+

To make it as easy as possible to run the labs, +we've made them compatible with +Google Colab.

+

Wherever you see an "Open in Colab" badge, like the one below, +just click on it and you'll be dropped into a hosted notebook environment for the lab, +complete with free GPU. +The badge below opens the first main-track lab, +Lab 4 on experiment management.

+
+ +
+ +

You can read more +here.

+

Setup on your own Linux machine

+
+ +
+ +

If you have a Linux machine with an NVIDIA GPU and drivers, +either locally or in the cloud, +you can also run the labs there. +The video above and text instructions +here +should be enough to get you going.

+
+

Don't get stuck on setup!

+

Remember that Google Colab is always there as an option if you run into issues while setting up.

+

Rather than getting frustrated with some obnoxious library linking or driver issue that's irrelevant +to the material you are really trying to learn and getting stuck in an installation quagmire, +just run the labs on Colab so you can get back to learning about machine learning!

+
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lab-4-experiment-management/index.html b/course/2022/lab-4-experiment-management/index.html new file mode 100644 index 00000000..bbf75e1f --- /dev/null +++ b/course/2022/lab-4-experiment-management/index.html @@ -0,0 +1,2278 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 4: Experiment Management - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

Lab 4: Experiment Management

+
+ +
+ +
+

By Charles Frye. Published August 17, 2022.

+
+
+ +
+ +

In this lab, +we'll work through +an entire experiment management workflow +for model development, +using a tool called Weights & Biases.

+

Outline

+
    +
  • 00:00 Why do we need experiment management?
  • +
  • 02:24 Tracking experiments with TensorBoard
  • +
  • 04:16 Experiment management with Weights & Biases
  • +
  • 06:48 A guided tour of the W&B run interface
  • +
  • 12:12 Exploratory data analysis with W&B Tables
  • +
  • 14:00 Project management with W&B
  • +
  • 16:27 Artifact versioning with W&B
  • +
  • 18:52 Programmatic API access to W&B
  • +
  • 20:14 Collaboration tools in W&B
  • +
  • 25:00 Hyperparameter sweeps in W&B
  • +
  • 28:15 Overview of exercises
  • +
+
+

Wait, what happened to labs 1 through 3?

+

The first three labs review some pre-requisites for the course -- +DNN architectures and the basics of model training.

+

You can find them +here.

+

If you're already basically familiar with training neural networks +in any framework, you really only need to review +Lab 02a, +on using PyTorch Lightning.

+
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lab-5-troubleshooting-and-testing/index.html b/course/2022/lab-5-troubleshooting-and-testing/index.html new file mode 100644 index 00000000..4c8f6437 --- /dev/null +++ b/course/2022/lab-5-troubleshooting-and-testing/index.html @@ -0,0 +1,2266 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 5: Troubleshooting & Testing - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

Lab 5: Troubleshooting & Testing

+
+ +
+ +
+

By Charles Frye. Published August 24, 2022.

+
+
+ +
+ +

In this lab, +we'll check out the basic tools +required to write clean Python code +and +see how to write memorization tests +for training code in PyTorch Lightning. +Then we'll take a deep dive into +the trace of a PyTorch training step +and use it to debug performance issues +in GPU-accelerated code.

+

Outline

+
    +
  • 00:00 Overview
  • +
  • 00:51 Linting: pre-commit, black, flake8
  • +
  • 05:42 Testing: pytest, doctest, memorization testing
  • +
  • 11:15 Troubleshooting PyTorch performance
  • +
  • 16:13 A guided tour of a PyTorch trace
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lab-6-data-annotation/index.html b/course/2022/lab-6-data-annotation/index.html new file mode 100644 index 00000000..04db342a --- /dev/null +++ b/course/2022/lab-6-data-annotation/index.html @@ -0,0 +1,2265 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 6: Data Annotation - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

Lab 6: Data Annotation

+
+ +
+ +
+

By Charles Frye. Published August 31, 2022.

+
+
+ +
+ +

In this lab, +we'll see how raw data becomes useful data +via data annotation +and how structured data stored on disk +becomes neural network-ready +with preprocessing and PyTorch Datasets.

+

We'll also spin up a data annotation server using +Label Studio.

+

Outline

+
    +
  • 00:00 Overview
  • +
  • 00:36 Loading annotated data and synthesizing data
  • +
  • 02:39 Setting up a data annotation server with Label Studio
  • +
  • 06:54 Uploading data to Label Studio
  • +
  • 09:15 Building and using an annotation interface in Label Studio
  • +
  • 13:17 Exercises
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lab-7-web-deployment/index.html b/course/2022/lab-7-web-deployment/index.html new file mode 100644 index 00000000..3c8af727 --- /dev/null +++ b/course/2022/lab-7-web-deployment/index.html @@ -0,0 +1,2267 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 7: Web Deployment - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

Lab 7: Web Deployment

+
+ +
+ +
+

By Charles Frye. Published September 7, 2022.

+
+
+ +
+ +

In this lab, +we'll take the leap +from ML model +to ML-powered application +by packaging our text recognizer +into a portable TorchSript binary, +wrapping that binary up into a serverless cloud function, +and building a simple UI in Python with gradio.

+

Outline

+
    +
  • 00:00 Overview
  • +
  • 01:06 Compiling the model to TorchScript
  • +
  • 06:00 Why not deploy on GPUs?
  • +
  • 08:58 Building a GUI with gradio
  • +
  • 15:34 Spinning up a model service
  • +
  • 21:11 Creating a public URL with ngrok
  • +
  • 24:52 Writing a Dockerfile for our server
  • +
  • 30:06 Recap
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lab-8-model-monitoring/index.html b/course/2022/lab-8-model-monitoring/index.html new file mode 100644 index 00000000..4750c94a --- /dev/null +++ b/course/2022/lab-8-model-monitoring/index.html @@ -0,0 +1,2265 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 8: Model Monitoring - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

Lab 8: Model Monitoring

+
+ +
+ +
+

By Charles Frye. Published September 14, 2022.

+
+
+ +
+ +

In this lab, +we'll add flagging to our ML-powered application +so that users can give us feedback.

+

Then, we'll explore some data logged based on feedback from +actual users of the +FSDL Text Recognizer +to the model monitoring and continual learning platform +Gantry.

+

Outline

+
    +
  • 00:00 Basic user feedback with gradio
  • +
  • 04:51 Logging feedback to Gantry
  • +
  • 08:34 Checking for model toxicity with Gantry projections
  • +
  • 14:23 Detecting model bugs in the Gantry UI with distributions and filters
  • +
  • 19:01 Discovering surprising user data in the Gantry UI
  • +
  • 29:53 Outro
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/labs-1-3-cnns-transformers-pytorch-lightning/index.html b/course/2022/labs-1-3-cnns-transformers-pytorch-lightning/index.html new file mode 100644 index 00000000..bef7357a --- /dev/null +++ b/course/2022/labs-1-3-cnns-transformers-pytorch-lightning/index.html @@ -0,0 +1,2298 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Pre-Labs 1-3: CNNs, Transformers, PyTorch Lightning - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Pre-Labs 1-3: CNNs, Transformers, PyTorch Lightning

+
+ +
+ +
+

By Charles Frye. Published August 10, 2022.

+
+

This first set of "review" labs covers +deep learning fundamentals and +introduces two of the core libraries we will use for model training: +PyTorch and PyTorch Lightning.

+

These labs are optional -- it's possible to get most +of the value out of the main set of labs +without detailed knowledge of the material here.

+

But if you find yourself getting tripped up +on PyTorch or Lightning details +or on the model architecture, come back and review these labs.

+

The videos above give a quick high-level overview of each lab.

+

If you need a refresher or a deeper dive on any of the topics or libraries, +then work through the notebooks as well. +We recommend only completing the exercises for the labs +that are of most interest to you.

+

Click the badges below to access individual lab notebooks on Colab and videos on YouTube

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LabColabVideo
Lab 01: Deep Neural Networks in PyTorchopen-in-colabyt-logo
Lab 02a: PyTorch Lightningopen-in-colabyt-logo
Lab 02b: Training a CNN on Synthetic Handwriting Dataopen-in-colabyt-logo
Lab 03: Transformers and Paragraphsopen-in-colabyt-logo
+

Running the labs

+

One-click setup on Colab

+

To make it as easy as possible to run the labs, +we've made them compatible with +Google Colab.

+

Wherever you see an "Open in Colab" badge, like the one below, +just click on it and you'll be dropped into a hosted notebook environment for the lab, +complete with free GPU. +The badge below opens Lab 01, on PyTorch.

+
+ +
+ +

You can read more +here.

+

Setup on your own Linux machine

+
+ +
+ +

If you have a Linux machine with an NVIDIA GPU and drivers, +either locally or in the cloud, +you can also run the labs there. +The video above and text instructions +here +should be enough to get you going.

+
+

Don't get stuck on setup!

+

Remember that Google Colab is always there as an option if you run into issues while setting up.

+

Rather than getting frustrated with some obnoxious library linking or driver issue that's irrelevant +to the material you are really trying to learn and getting stuck in an installation quagmire, +just run the labs on Colab so you can get back to learning about machine learning!

+
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/index.html b/course/2022/lecture-1-course-vision-and-when-to-use-ml/index.html new file mode 100644 index 00000000..0d9c41d9 --- /dev/null +++ b/course/2022/lecture-1-course-vision-and-when-to-use-ml/index.html @@ -0,0 +1,3253 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 1: Course Vision and When to Use ML - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Lecture 1: Course Vision and When to Use ML

+
+ +
+ +

Lecture by Josh Tobin. +Notes by James Le and Vishnu Rachakonda.
+Published August 8, 2022. +Download slides.

+

1 - Course Vision

+

History of FSDL

+

Full Stack Deep Learning (FSDL) is the course and community for people +who are building products that are powered by machine learning (ML). +It's an exciting time to talk about ML-powered products because ML is +rapidly becoming a mainstream technology - as you can see in startup +funding, job postings, and continued investments of large companies.

+

FSDL was originally started in 2018 when the most exciting ML-powered +products were built by the biggest companies. However, the broader +narrative in the field was that very few companies could get value out +of this technology.

+

Now in 2022, there's a proliferation of powerful products that are +powered by ML. The narrative has shifted as well: There's +standardization that has emerged around the tech stack - with +transformers and NLP starting to seep their way into more use cases, as +well as practices around how to apply ML technologies in the world. One +of the biggest changes in the field in the past four years has been the +emergence of the term MLOps.

+

In addition to the +field being more mature and research continuing to progress, a big +reason for this rapid change is that the training of models is starting to become +commoditized.

+
    +
  • +

    With tools like HuggingFace, you can deploy a state-of-the-art NLP +or CV model in one or two lines of code.

    +
  • +
  • +

    AutoML is starting to work for a lot of applications.

    +
  • +
  • +

    Companies like OpenAI are starting to provide models as a service where you +don't even have to download open-source packages to use them. You +can make a network call to get predictions from a state-of-the-art +model.

    +
  • +
  • +

    Many libraries are starting to standardize around frameworks like Keras and PyTorch +Lightning.

    +
  • +
+

AI Progress

+

The history of ML is characterized by stratospheric rises and meteoric falls of the public +perception of the technology. These were driven by a few different AI +winters that happened over the history of the field - where the +technology didn't live up to its hype. If you project forward a few +years, what will happen to ML?

+

+

Source: 5 Things You Should Know About +AI +(Cambridge Consultants, May 2017)

+

Here are the major categories of possible outcomes and our guess about their likelihoods:

+
    +
  1. +

    A true AI winter, where people +become skeptical about AI as a technology. +We think this is less likely.

    +
  2. +
  3. +

    A slightly more likely outcome is that the overall luster of the +technology starts to wear off, but specific applications are +getting a ton of value out of it.

    +
  4. +
  5. +

    The upside outcome for the field is that AI continues to accelerate +rapidly and becomes pervasive and incredibly effective.

    +
  6. +
+

Our conjecture is that: The way we, as a field, avoid an AI winter is +by translating research progress into real-world products. That's how +we avoid repeating what has happened in the past.

+

ML-Powered Products Require a Different Process

+

Building ML-powered products requires a fundamentally different process +in many ways than developing ML models in an academic setting.

+

+

In academia, you build "flat-earth" ML - selecting a problem, +collecting data, cleaning and labeling the data, iterating on model +development until you have a model that performs well on the dataset +collected, evaluating that model, and writing a report at the end.

+

+

But ML-powered products require an outer loop where after you deploy +the model into production, you measure how that model performs when it +interacts with real users. Then, you use real-world data to +improve your model, setting up a data flywheel that enables +continual improvement.

+

This Course

+

+

This class is about the unique aspects you need to know beyond training +models to build great ML-powered products. Here are some concrete goals +for us:

+
    +
  1. +

    Teaching you generalist skills and an understanding of the +components of ML-powered products (and ML projects more +generally).

    +
  2. +
  3. +

    Teaching you enough MLOps to get things done.

    +
  4. +
  5. +

    Sharing best practices and explaining the motivation behind them.

    +
  6. +
  7. +

    Learning things that might help you with job interviews for ML engineering roles.

    +
  8. +
  9. +

    Forming a community to learn together and from each other.

    +
  10. +
+

We do NOT try to:

+
    +
  1. +

    Teach you ML or software engineering from scratch.

    +
  2. +
  3. +

    Cover the whole breadth of deep learning techniques.

    +
  4. +
  5. +

    Make you an expert in any single aspect of ML.

    +
  6. +
  7. +

    Do research in deep learning.

    +
  8. +
  9. +

    Cover the full spectrum of MLOps.

    +
  10. +
+

If you feel rusty on your pre-requisites but want to get started with +FSDL, here are our recommendations to get up to speed with the +fundamentals:

+ +

ML-Powered Products vs MLOps

+

MLOps, as a discipline, has emerged in just the last few years. It is +about practices for deploying, maintaining, and operating ML systems +that generate ML models in production. A lot of MLOps is about:

+
    +
  • +

    How do we put together an infrastructure that allows us to build +models in a repeatable and governable way?

    +
  • +
  • +

    How can we run ML systems in a potentially high-scale production +setting?

    +
  • +
  • +

    How can we collaborate on these systems as a team?

    +
  • +
+

+

ML-powered product building is a distinct but overlapping discipline. A lot of +what it takes to build a great ML-powered product goes beyond the +infrastructure side of ML systems. It focuses on how to fit ML into the +context of the product or the application that you're building.

+

Other topics in the scope of the ML product discipline include:

+
    +
  • +

    How do you understand how your users are interacting with your +model?

    +
  • +
  • +

    How do you build a team or an organization that can work together +effectively on ML systems?

    +
  • +
  • +

    How do you do product management in the context of ML?

    +
  • +
  • +

    What are the best practices for designing products that use ML as +part of them?

    +
  • +
+

This class focuses on teaching you end-to-end what it takes to get a +product out in the world that uses ML and will cover aspects of MLOps +that are most critical in order to do that.

+

Chapter Summary

+
    +
  1. +

    ML-powered products are going mainstream thanks to the +democratization of modeling.

    +
  2. +
  3. +

    However, building great ML-powered products requires a different +process from building models.

    +
  4. +
  5. +

    Full-Stack Deep Learning is here to help!

    +
  6. +
+

2 - When To Use Machine Learning

+

When to Use ML At All

+

ML projects have a higher failure rate than software projects in +general. One reason that's worth acknowledging is that for many +applications, ML is fundamentally still research. Therefore, we +shouldn't aim for 100% success.

+

Additionally, many ML projects are +doomed to fail even before they are undertaken due to a variety of +reasons:

+
    +
  1. +

    They are technically infeasible or poorly scoped.

    +
  2. +
  3. +

    They never make the leap to a production environment.

    +
  4. +
  5. +

    The broader organization is not all on the same page about what +would be considered success criteria for them.

    +
  6. +
  7. +

    They solve the problem that you set out to solve but do not solve a +big enough problem to be worth their complexity.

    +
  8. +
+

The bar for your ML projects should be that their value must outweigh +not just the cost of developing them but also the additional complexity +that these ML systems introduce to your software (as introduced in the +classic paper "The High-Interest Credit Card of Technical +Debt").

+

In brief, +ML systems erode the boundaries between other systems, rely on expensive +data dependencies, are commonly plagued by system design anti-patterns, +and are subject to the instability of the external world.

+

Before starting an ML project, ask yourself:

+
    +
  1. +

    Are you ready to use ML? More specifically, do you have a +product? Are you collecting data and storing it in a sane way? Do +you have the right people?

    +
  2. +
  3. +

    Do you really need ML to solve this problem? More specifically, +do you need to solve the problem at all? Have you tried using +rules or simple statistics to solve the problem?

    +
  4. +
  5. +

    Is it ethical to use ML to solve this problem? We have a +whole lecture about ethics!

    +
  6. +
+

How to Pick Problems to Solve with ML

+

Just like any other project prioritization, you want to look for use +cases that have high impact and low cost:

+
    +
  1. +

    High-impact problems are likely to be those that address friction in +your product, complex parts of your pipeline, places where cheap +prediction is valuable, and generally what other people in your +industry are doing.

    +
  2. +
  3. +

    Low-cost projects are those with available data, where bad +predictions are not too harmful.

    +
  4. +
+

+

High-Impact Projects

+

Here are some heuristics that you can use to find high-impact ML +projects:

+
    +
  1. +

    Find problems that ML takes from economically infeasible to feasible. +A good resource here is the book "Prediction Machines: +The Simple Economics of +AI." +The book's central thesis is that AI reduces the cost of +prediction, which is central to decision-making. Therefore, look +for projects where making prediction cheaper will have a huge impact.

    +
  2. +
  3. +

    Think about what your product needs. +This article from the ML team at Spotify +talks about the three principles for designing Discover Weekly, +one of Spotify's most powerful and popular ML-powered features.

    +
  4. +
  5. +

    Think about the types of problems that ML is particularly good at. +One common class of problem that is overlooked is +"Software 2.0", +as coined by Andrej Kaparthy. Essentially, if you have a part of your +system that is complex and manually defined, then that's +potentially a good candidate to be automated with ML.

    +
  6. +
  7. +

    Look at what other people in the industry are doing. +Generally, you can read papers and blog posts from both Big Tech and top +earlier-stage companies.

    +
  8. +
+

Low-Cost Projects

+

+

There are three main drivers for how much a project will cost:

+
    +
  1. +

    Data availability: How hard is it to acquire data? How expensive +is data labeling? How much data will be needed? How stable is the +data? What data security requirements do you have?

    +
  2. +
  3. +

    Accuracy requirement: How costly are wrong predictions? How +frequently does the system need to be right to be useful? What are +the ethical implications of your model making wrong predictions? +It is noteworthy that ML project costs tend to scale +super-linearly in the accuracy requirement.

    +
  4. +
  5. +

    Problem difficulty: Is the problem well-defined enough to be +solved with ML? Is there good published work on similar problems? +How much compute does it take to solve the problem? Generally, +it's hard to reason about what's feasible in ML.

    +
  6. +
+

What's Hard in ML?

+

+

Here are the three types of hard problems:

+
    +
  1. +

    Output is complex: The model predictions are ambiguous or in a +high-dimensional structure.

    +
  2. +
  3. +

    Reliability is required: ML systems tend to fail in unexpected +ways, so anywhere you need high precision or high robustness is +going to be more difficult to solve with ML.

    +
  4. +
  5. +

    Generalization is required: These problems tend to be more in +the research domain. They can deal with out-of-distribution data +or do tasks such as reasoning, planning, or understanding +causality.

    +
  6. +
+

ML Feasibility Assessment

+

This is a quick checklist you can use to assess the feasibility of your +ML projects:

+
    +
  1. +

    Make sure that you actually need ML.

    +
  2. +
  3. +

    Put in the work upfront to define success criteria with all of the +stakeholders.

    +
  4. +
  5. +

    Consider the ethics of using ML.

    +
  6. +
  7. +

    Do a literature review.

    +
  8. +
  9. +

    Try to rapidly build a labeled benchmark dataset.

    +
  10. +
  11. +

    Build a "minimum" viable model using manual rules or simple +heuristics.

    +
  12. +
  13. +

    Answer this question again: "Are you sure that you need ML at all?"

    +
  14. +
+

Not All ML Projects Should Be Planned The Same Way

+

Not all ML projects have the same characteristics; therefore, they +shouldn't be planned the same way. Understanding different archetypes of +ML projects can help select the right approach.

+

ML Product Archetypes

+

The three archetypes offered here are defined by how they interact with +real-world use cases:

+
    +
  1. +

    Software 2.0 use cases: Broadly speaking, this means taking +something that software or a product does in an automated fashion +today and augmenting its automation with machine learning. An +example of this would be improving code completion in the IDE +(like Github +Copilot).

    +
  2. +
  3. +

    Human-in-the-loop systems: Machine learning can be applied for +tasks where automation is not currently deployed - but where +humans could have their judgment or efficiency augmented. Simply +put, helping humans do their jobs better by complementing them +with ML-based tools. An example of this would be turning sketches +into slides, a process will usually involve humans approving the +output of a machine learning model that made the slides.

    +
  4. +
  5. +

    Autonomous systems: Systems that apply machine learning to +augment existing or implement new processes without human input. +An example of this would be full self-driving, where there is no +opportunity for a driver to intervene in the functioning of the +car.

    +
  6. +
+

For each archetype, some key considerations inform how you should go +about planning projects.

+

+
    +
  1. +

    In the case of Software 2.0 projects, you should focus more on +understanding how impactful the performance of the new model +is. Is the model truly much better? How can the performance +continue to increase across iterations?

    +
  2. +
  3. +

    In the case of human-in-the-loop systems, consider more the +context of the human user and what their needs might be. How +good does the system actually have to be to improve the life of a +human reviewing its output? In some cases, a model that does even +10% better with accuracy (nominally a small increase) might have +outsize impacts on human users in the loop.

    +
  4. +
  5. +

    For autonomous systems, focus heavily on the failure rate and its +consequences. When there is no opportunity for human +intervention, as is the case with autonomous systems, failures +need to be carefully monitored to ensure outsize harm doesn't +occur. Self-driving cars are an excellent example of an autonomous +system where failure rates are carefully monitored.

    +
  6. +
+

Data Flywheels

+

As you build a software 2.0 project, strongly consider the concept of +the data flywheel. For certain ML projects, as you improve your +model, your product will get better and more users will engage with the +product, thereby generating more data for the model to get even better. +It's a classic virtuous cycle and truly the gold standard for ML +projects.

+

+

As you consider implementing data flywheels, remember to know the answer +to these three questions:

+
    +
  1. +

    Do you have a data loop? To build a data flywheel, you crucially +need to be able to get labeled data from users in a scalable +fashion. This helps increase access to high-quality data and +define a data loop.

    +
  2. +
  3. +

    Can you turn more data into a better model? This somewhat falls +onto you as the modeling expert, but it may also not be the case +that more data leads to significantly better performance. Make +sure you can actually translate data scale into better model +performance.

    +
  4. +
  5. +

    Does better model performance lead to better product use? You +need to verify that improvements with models are actually tied to +users enjoying the product more and benefiting from it!

    +
  6. +
+

Impact and Feasibility of ML Product Archetypes

+

Let's visit our impact vs. feasibility matrix. Our three product +archetypes differ across the spectrum.

+

+

This is a pretty intuitive evaluation you can apply to all your ML +projects: If it's harder to build (like autonomous systems), it's +likely to have a greater impact! There are ways, however, to change +this matrix in the context of specific projects.

+
    +
  1. +

    For Software 2.0, data flywheels can magnify impact by allowing +models to get much better and increase customer delight over time.

    +
  2. +
  3. +

    For human-in-the-loop systems, you can increase feasibility by +leveraging good product design. Thoughtful design can help reduce +expectations and accuracy requirements. Alternatively, a "good +enough" mindset that prioritizes incremental delivery over time +can make such systems more feasible.

    +
  4. +
  5. +

    For autonomous systems, leveraging humans in the loop can make +development more feasible by adding guardrails and reducing the +potential impact of failures.

    +
  6. +
+

Just Get Started!

+

With all this discussion about archetypes and impact matrices, don't +forget the most important component of engineering: actually +building! Dive in and get started. Start solving problems and iterate +on solutions.

+

One common area practitioners trip up in is tool fetishization. As +MLOps and production ML have flourished, so too has the number of tools +and platforms that address various aspects of the ML process. You don't +need to be perfect with your tooling before driving value from machine +learning. Just because Google and Uber are doing things in a very +structured, at-scale way doesn't mean you need to as well!

+

In this course, we will primarily focus on how to set things up the +right way to do machine learning in production without overcomplicating +it. This is an ML products-focused class, not an MLOps class! Check out +this talk by Jacopo Tagliabue describing MLOps at Reasonable +Scale for a +great exposition of this mindset.

+

Chapter Summary

+
    +
  1. +

    ML adds complexity. Consider whether you really need it.

    +
  2. +
  3. +

    Make sure what you're working on is high impact, or else it might +get killed.

    +
  4. +
+

3 - Lifecycle

+

ML adds complexity to projects and isn't always a value driver. Once you +know, however, that it's the right addition to your project, what does +the actual lifecycle look like? What steps do we embark upon as we +execute?

+

In this course, the common running example we use is of a pose +estimation problem. We'll use this as a case study to demonstrate the +lifecycle and illustrate various points about ML-powered products.

+

+

Here's a graphic that visualizes the lifecycle of ML projects:

+

+

It provides a very helpful structure. Watch from 48:00 to 54:00 to dive +deeper into how this lifecycle occurs in the context of a real machine +learning problem around pose estimation that Josh worked on at OpenAI.

+

Let's comment on some specific nuances:

+
    +
  • +

    Machine learning projects tend to be very iterative. Each of +these phases can feed back into any of the phases that go before +it, as you learn more about the problem that you're working on.

    +
      +
    • +

      For example, you might realize that "Actually, it's way too +hard for us to get data in order to solve this problem!" or +"It's really difficult for us to label the pose of these +objects in 3D space".

      +
    • +
    • +

      A solution might actually be to go back a step in the lifecycle +and set up the problem differently. For example, what if it +were cheaper to annotate per pixel?

      +
    • +
    • +

      This could repeat itself multiple times as you progress through +a project. It's a normal and expected part of the machine +learning product development process.

      +
    • +
    +
  • +
  • +

    In addition to iteration during execution, there's also +cross-project "platform" work that matters! Hiring and +infrastructure development are crucial to the long-term health of +your project.

    +
  • +
  • +

    Going through this lifecycle and winning each step is what we'll +cover in this class!

    +
  • +
+

Lecture Summary

+

In summary, here's what we covered in this lecture:

+
    +
  1. +

    ML is NOT a cure-all. It's a complex technology that needs to be +used thoughtfully.

    +
  2. +
  3. +

    You DON'T need a perfect setup to get going. Start building and +iterate!

    +
  4. +
  5. +

    The lifecycle of machine learning is purposefully iterative and +circuitous. We'll learn how to master this process together!

    +
  6. +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image1.png b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image1.png new file mode 100644 index 00000000..e465b2c1 Binary files /dev/null and b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image1.png differ diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image10.png b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image10.png new file mode 100644 index 00000000..a71acfc2 Binary files /dev/null and b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image10.png differ diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image11.png b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image11.png new file mode 100644 index 00000000..ba524b40 Binary files /dev/null and b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image11.png differ diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image12.png b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image12.png new file mode 100644 index 00000000..7a83b638 Binary files /dev/null and b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image12.png differ diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image13.png b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image13.png new file mode 100644 index 00000000..2b24109a Binary files /dev/null and b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image13.png differ diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image2.png b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image2.png new file mode 100644 index 00000000..3676a9ac Binary files /dev/null and b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image2.png differ diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image3.png b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image3.png new file mode 100644 index 00000000..73a001f3 Binary files /dev/null and b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image3.png differ diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image4.png b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image4.png new file mode 100644 index 00000000..44227e7b Binary files /dev/null and b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image4.png differ diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image5.png b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image5.png new file mode 100644 index 00000000..b4c816d5 Binary files /dev/null and b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image5.png differ diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image6.png b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image6.png new file mode 100644 index 00000000..db9e2bcc Binary files /dev/null and b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image6.png differ diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image7.png b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image7.png new file mode 100644 index 00000000..ee744a8c Binary files /dev/null and b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image7.png differ diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image8.png b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image8.png new file mode 100644 index 00000000..b12b7fd7 Binary files /dev/null and b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image8.png differ diff --git a/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image9.png b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image9.png new file mode 100644 index 00000000..e68f83c7 Binary files /dev/null and b/course/2022/lecture-1-course-vision-and-when-to-use-ml/media/image9.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/index.html b/course/2022/lecture-2-development-infrastructure-and-tooling/index.html new file mode 100644 index 00000000..3c265dd8 --- /dev/null +++ b/course/2022/lecture-2-development-infrastructure-and-tooling/index.html @@ -0,0 +1,3159 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 2: Development Infrastructure & Tooling - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Lecture 2: Development Infrastructure & Tooling

+
+ +
+ +

Lecture by Sergey Karayev. +Notes by James Le and Vishnu Rachakonda.
+Published August 15, 2022. +Download slides.

+

1 - Introduction

+

The dream of ML development is that given a project spec and some +sample data, you get a continually improving prediction system deployed +at scale.

+

The reality is starkly different:

+
    +
  • +

    You have to collect, aggregate, process, clean, label, and version +the data.

    +
  • +
  • +

    You have to find the model architecture and their pre-trained +weights and then write and debug the model code.

    +
  • +
  • +

    You run training experiments and review the results, which will be +fed back into the process of trying out new architectures and +debugging more code.

    +
  • +
  • +

    You can now deploy the model.

    +
  • +
  • +

    After model deployment, you have to monitor model predictions and +close the data flywheel loop. Basically, your users generate fresh +data for you, which needs to be added to the training set.

    +
  • +
+

+

This reality has roughly three components: data, development, and +deployment. The tooling infrastructure landscape for them is large, so +we'll have three lectures to cover it all. This lecture focuses on the +development component.

+

2 - Software Engineering

+

+

Language

+

For your choice of programming language, Python is the clear winner +in scientific and data computing because of all the libraries that have +been developed. There have been some contenders like Julia and C/C++, +but Python has really won out.

+

Editors

+

To write Python code, you need an editor. You have many options, +such as Vim, Emacs, Jupyter Notebook/Lab, VS Code, PyCharm, etc.

+
    +
  • +

    We recommend VS Code +because of its nice features such as built-in git version control, +documentation peeking, remote projects opening, linters and type +hints to catch bugs, etc.

    +
  • +
  • +

    Many practitioners develop in Jupyter +Notebooks, which is great as +the "first draft" of a data science project. You have to put in +little thought before you start coding and seeing the immediate +output. However, notebooks have a variety of problems: primitive +editor, out-of-order execution artifacts, and challenges to +version and test them. A counterpoint to these problems is the +nbdev package that lets +you write and test code all in one notebook environment.

    +
  • +
  • +

    We recommend you use VS Code with built-in support for +notebooks - where you can write code in modules imported into +notebooks. It also enables awesome debugging.

    +
  • +
+

If you want to build something more interactive, +Streamlit is an excellent choice. +It lets you decorate Python code, get interactive applets, and publish +them on the web to share with the world.

+

+

For setting up the Python environment, we recommend you see how we did +it in the +lab.

+

3 - Deep Learning Frameworks

+

+

Deep learning is not a lot of code with a matrix math library like +Numpy. But when you have to deploy your code onto CUDA for GPU-powered +deep learning, you want to consider deep learning frameworks as you +might be writing weird layer types, optimizers, data interfaces, etc.

+

Frameworks

+

There are various frameworks, such as PyTorch, TensorFlow, and Jax. They +are all similar in that you first define your model by running Python +code and then collect an optimized execution graph for different +deployment patterns (CPU, GPU, TPU, mobile).

+
    +
  1. +

    We prefer PyTorch because it is absolutely +dominant +by measures such as the number of models, the number of papers, +and the number of competition winners. For instance, about 77% +of 2021 ML competition winners used +PyTorch.

    +
  2. +
  3. +

    With TensorFlow, you have TensorFlow.js (that lets you run deep +learning models in your browser) and Keras (an unmatched developer +experience for easy model development).

    +
  4. +
  5. +

    Jax is a meta-framework for deep learning.

    +
  6. +
+

+

PyTorch has excellent developer +experience and is production-ready and even faster with TorchScript. +There is a great distributed training ecosystem. There are libraries for +vision, audio, etc. There are also mobile deployment targets.

+

PyTorch Lightning +provides a nice structure for organizing your training code, optimizer +code, evaluation code, data loaders, etc. With that structure, you can +run your code on any hardware. There are nice features such as +performance and bottleneck profiler, model checkpointing, 16-bit +precision, and distributed training libraries.

+

Another possibility is FastAI +software, which is developed +alongside the fast.ai course. It provides many advanced tricks such as +data augmentations, better initializations, learning rate schedulers, +etc. It has a modular structure with low-level API, mid-level API, +high-level API, and specific applications. The main problem with FastAI +is that its code style is quite different from mainstream Python.

+

At FSDL, we prefer PyTorch because of its strong ecosystem, but +TensorFlow is still +perfectly good. If you have a specific reason to prefer it, you are +still going to have a good time.

+

Jax is a more recent +project from Google that is not specific to deep learning. It provides +general vectorization, auto-differentiation, and compilation to GPU/TPU +code. For deep learning, there are separate frameworks like +Flax and +Haiku. You should +only use Jax for a specific need.

+

Meta-Frameworks and Model Zoos

+

Most of the time, you will start with at least a model architecture that +someone has developed or published. You will use a specific architecture +(trained on specific data with pre-trained weights) on a model hub.

+
    +
  • +

    ONNX is an open standard for +saving deep learning models and lets you convert from one type of +format to another. It can work well but can also run into some +edge cases.

    +
  • +
  • +

    HuggingFace has become an +absolutely stellar repository of models. It started with NLP tasks +but has then expanded into all kinds of tasks (audio +classification, image classification, object detection, etc.). +There are 60,000 pre-trained models for all these tasks. There is +a Transformers library that works with PyTorch, TensorFlow, and +Jax. There are 7,500 datasets uploaded by people. There's also a +community aspect to it with a Q&A forum.

    +
  • +
  • +

    TIMM +is a collection of state-of-the-art computer vision models and +related code that looks cool.

    +
  • +
+

4 - Distributed Training

+

+

Let's say we have multiple machines represented by little squares above +(with multiple GPUs in each machine). You are sending batches of data to +be processed by a model with parameters. The data batch can fit on a +single GPU or not. The model parameters can fit on a single GPU or not.

+

The best case is that both your data batch and model parameters fit on a +single GPU. That's called trivial parallelism. You can either launch +more independent experiments on other GPUs/machines or increase the +batch size until it no longer fits on one GPU.

+

Data Parallelism

+

If your model still fits on a single GPU, but your data no longer does, +you have to try out data parallelism - which lets you distribute a +single batch of data across GPUs and average gradients that are computed +by the model across GPUs. A lot of model development work is cross-GPU, +so you want to ensure that GPUs have fast interconnects.

+

If you are using a server card, expect a linear +speedup +in training time. If you are using a consumer card, expect a sublinear +speedup +instead.

+

Data parallelism is implemented in PyTorch with the robust +DistributedDataParallel +library. +Horovod is another +3rd-party library option. PyTorch Lightning makes it dead simple to use +either of these two libraries - where speedup seems to be the +same.

+

A more advanced scenario is that you can't even fit your model on a +single GPU. You have to spread the model over multiple GPUs. There are +three solutions to this.

+

Sharded Data-Parallelism

+

Sharded data parallelism starts with the question: What exactly takes up +GPU memory?

+
    +
  • +

    The model parameters include the floats that make up our model +layers.

    +
  • +
  • +

    The gradients are needed to do back-propagation.

    +
  • +
  • +

    The optimizer states include statistics about the gradients

    +
  • +
  • +

    Finally, you have to send a batch of data for model development.

    +
  • +
+

+

Sharding is a concept from databases where if you have one source of +data, you actually break it into shards of data that live across your +distributed system. Microsoft implemented an approach called +ZeRO that shards +the optimizer states, the gradients, and the model parameters. This +results in an insane order of magnitude reduction in memory use, which +means your batch size can be 10x bigger. You should watch the video +in this +article +to see how model parameters are passed around GPUs as computation +proceeds.

+

Sharded data-parallelism is implemented by Microsoft's +DeepSpeed +library and Facebook's +FairScale +library, as well as natively by PyTorch. In PyTorch, it's called +Fully-Sharded +DataParallel. +With PyTorch Lightning, you can try it for a massive memory reduction +without changing the model code.

+

This same ZeRO principle can also be applied to a single GPU. You can +train a 13B-parameter model on a single V100 (32GB) GPU. Fairscale +implements this (called +CPU-offloading).

+

Pipelined Model-Parallelism

+

Model parallelism means that you can put each layer of your model on +each GPU. It is trivial to implement natively but results in only one +GPU being active at a time. Libraries like DeepSpeed and FairScale make +it better by pipelining computation so that the GPUs are fully utilized. +You need to tune the amount of pipelining on the batch size to the exact +degree of how you will split up the model on the GPU.

+

Tensor-Parallelism

+

Tensor parallelism is another approach, which observes that there is +nothing special about matrix multiplication that requires the whole +matrix to be on one GPU. You can distribute the matrix over multiple +GPUs. NVIDIA published the Megatron-LM +repo, which does +this for the Transformer model.

+

You can actually use all of the three techniques mentioned above if you +really want to scale a huge model (like a GPT-3 sized language model). +Read this article on the technology behind BLOOM +training +for a taste.

+

+

In conclusion:

+
    +
  • +

    If your model and data fit on one GPU, that's awesome.

    +
  • +
  • +

    If they do not, and you want to speed up training, try +DistributedDataParallel.

    +
  • +
  • +

    If the model still doesn't fit, try ZeRO-3 or Full-Sharded Data +Parallel.

    +
  • +
+

For more resources to speed up model training, look at this list +compiled by DeepSpeed, +MosaicML, and +FFCV.

+

5 - Compute

+

+

Compute is the next essential ingredient to developing machine +learning models and products.

+

The compute-intensiveness of models has grown tremendously over the last +ten years, as the below charts from +OpenAI and +HuggingFace +show.

+

+

Recent developments, including models like +GPT-3, have +accelerated this trend. These models are extremely large and require a +large number of petaflops to train.

+

GPUs

+

To effectively train deep learning models, GPUs are required. +NVIDIA has been the superior choice for GPU vendors, though Google has +introduced TPUs (Tensor Processing Units) that are effective but are +only available via Google Cloud. There are three primary considerations +when choosing GPUs:

+
    +
  1. +

    How much data fits on the GPU?

    +
  2. +
  3. +

    How fast can the GPU crunch through data? To evaluate this, is your +data 16-bit or 32-bit? The latter is more resource intensive.

    +
  4. +
  5. +

    How fast can you communicate between the CPU and the GPU and between +GPUs?

    +
  6. +
+

Looking at recent NVIDIA GPUs, it becomes clear that a new +high-performing architecture is introduced every few years. There's a +difference between these chips, which are licensed for personal use as +opposed to corporate use; businesses should only use server +cards.

+

+

Two key factors in evaluating GPUs are RAM and Tensor TFlops. +The more RAM, the better the GPU contains large models and datasets. +Tensor TFlops are special tensor cores that NVIDIA includes specifically +for deep learning operations and can handle more intensive +mixed-precision operations. A tip: leveraging 16-bit training can +effectively double your RAM capacity!

+

While these theoretical benchmarks are useful, how do GPUs perform +practically? Lambda Labs offers the best benchmarks +here. Their results +show that the most recent server-grade NVIDIA GPU (A100) is more than +2.5 times faster than the classic V100 GPU. RTX chips also outperform +the V100. AIME is also another source of GPU +benchmarks.

+

Cloud services such as Microsoft Azure, Google Cloud Platform, and +Amazon Web Services are the default place to buy access to GPUs. Startup +cloud providers like +Paperspace, +CoreWeave, and Lambda +Labs also offer such services.

+

TPUs

+

Let's briefly discuss TPUs. There are four generations of TPUs, and the +most recent v4 is the fastest possible accelerator for deep learning. V4 +TPUs are not generally available yet, but TPUs generally excel at +scaling to larger and model sizes. The below charts compare TPUs to +the fastest A100 NVIDIA chip.

+

+

It can be overwhelming to compare the cost of cloud access to GPUs, so +we made a tool that solves this +problem! +Feel free to contribute to our repository of Cloud GPU cost +metrics. +The tool has all kinds of nifty features like enabling filters for only +the most recent chip models, etc.

+

If we combine the cost metrics with performance +metrics, +we find that the most expensive per hour chips are not the most +expensive per experiment! Case in point: running the same Transformers +experiment on 4 V100s costs $1750 over 72 hours, whereas the same +experiment on 4 A100s costs $250 over only 8 hours. Think carefully +about cost and performance based on the model you're trying to train.

+

Some helpful heuristics here are:

+
    +
  1. +

    Use the most expensive per-hour GPU in the least expensive cloud.

    +
  2. +
  3. +

    Startups (e.g., Paperspace) tend to be cheaper than major cloud +providers.

    +
  4. +
+

On-Prem vs. Cloud

+

For on-prem use cases, you can build your own pretty easily or opt +for a pre-built computer from a company like NVIDIA. You can build a +good, quiet PC with 128 GB RAM and 2 RTX 3090s for about $7000 and set +it up in a day. Going beyond this can start to get far more expensive +and complicated. Lambda Labs offers a $60,000 machine with 8 A100s +(super fast!). Tim Dettmers offers a great (slightly outdated) +perspective on building a machine +here.

+

Some tips on on-prem vs. cloud use:

+
    +
  • +

    It can be useful to have your own GPU machine to shift your mindset +from minimizing cost to maximizing utility.

    +
  • +
  • +

    To truly scale-out experiments, you should probably just use the +most expensive machines in the least expensive cloud.

    +
  • +
  • +

    TPUs are worth experimenting with for large-scale training, given +their performance.

    +
  • +
  • +

    Lambda Labs is a sponsor, and we highly encourage looking at them +for on-prem and cloud GPU use!

    +
  • +
+

6 - Resource Management

+

+

Now that we've talked about raw compute, let's talk about options for +how to manage our compute resources. Let's say we want to manage a +set of experiments. Broadly speaking, we'll need hardware in the form of +GPUs, software requirements (e.g., PyTorch version), and data to train +on.

+

Solutions

+

Leveraging best practices for specifying dependencies (e.g., Poetry, +conda, pip-tools) makes the process of spinning up such experiments +quick and easy on a single machine.

+

If, however, you have a cluster of machines to run experiments on, +SLURM is +the tried and true solution for workload management that is still widely +used.

+

For more portability, Docker is +a way to package up an entire dependency stack into a lighter-than-a-VM +package. Kubernetes is the most +popular way to run many Docker containers on top of a cluster. The OSS +Kubeflow project helps manage +ML projects that rely on Kubernetes.

+

These projects are useful, but they may not be the easiest or best +choice. They're great if you already have a cluster up and running, but +how do you actually set up a cluster or compute platform?

+

Before proceeding, FSDL prefers open source and/or transparently priced +products. We discuss tools that fall into these categories, not SaaS +with opaque pricing.

+

Tools

+

For practitioners all in on AWS, AWS +Sagemaker offers a +convenient end-to-end solution for building machine learning models, +from labeling data to deploying models. Sagemaker has a ton of +AWS-specific configuration, which can be a turnoff, but it brings a lot +of easy-to-use old school algorithms for training and allows you to BYO +algorithms as well. They're also increasing support for PyTorch, though +the markup for PyTorch is about 15-20% more expensive.

+

Anyscale is a company created +by the makers of the Berkeley OSS project +Ray. Anyscale +recently launched Ray +Train, +which they claim is faster than Sagemaker with a similar value +proposition. Anyscale makes it really easy to provision a compute +cluster, but it's considerably more expensive than alternatives.

+

Grid.ai is created by the PyTorch +Lightning creators. Grid allows you to specify what compute parameters +to use easily with "grid run" followed by the types of compute and +options you want. You can use their instances or AWS under the hood. +Grid has an uncertain future, as its future compatibility with Lightning +(given their rebrand) has not been clarified.

+

There are several non-ML options for spinning up compute too! Writing +your own scripts, using various libraries, or even Kubernetes are all +options. This route is harder.

+

Determined.AI is an OSS solution +for managing on-prem and cloud clusters. They offer cluster management, +distributed training, and more. It's pretty easy to use and is in active +development.

+

With all this said, there is still room to improve the ease of +experience for launching training on many cloud providers.

+

7 - Experiment and Model Management

+

+

In contrast to compute, experiment management is quite close to being +solved. Experiment management refers to tools and processes that help +us keep track of code, model parameters, and data sets that are iterated +on during the model development lifecycle. Such tools are essential to +effective model development. There are several solutions here:

+
    +
  • +

    TensorBoard: +A non-exclusive Google solution effective at one-off experiment +tracking. It is difficult to manage many experiments.

    +
  • +
  • +

    MLflow: A non-exclusive +Databricks project that includes model packaging and more, in +addition to experiment management. It must be self-hosted.

    +
  • +
  • +

    Weights and Biases: An +easy-to-use solution that is free for personal and academic projects! Logging +starts simply with an "experiment config" command.

    +
  • +
  • +

    Other options include Neptune +AI, Comet +ML, and Determined +AI, all of which have solid +experiment tracking options.

    +
  • +
+

Many of these platforms also offer intelligent hyperparameter +optimization, which allows us to control the cost of searching for the +right parameters for a model. For example, Weights and Biases has a +product called Sweeps that +helps with hyperparameter optimization. It's best to have it as part of +your regular ML training tool; there's no need for a dedicated tool.

+

8 - "All-In-One"

+

+

There are machine learning infrastructure solutions that offer +everything--training, experiment tracking, scaling out, deployment, +etc. These "all-in-one" platforms simplify things but don't come cheap! +Examples include Gradient by +Paperspace, Domino +Data Lab, AWS +Sagemaker, etc.

+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image1.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image1.png new file mode 100644 index 00000000..07e883b2 Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image1.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image10.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image10.png new file mode 100644 index 00000000..34038689 Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image10.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image11.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image11.png new file mode 100644 index 00000000..9f23e953 Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image11.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image12.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image12.png new file mode 100644 index 00000000..4bfa1aac Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image12.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image13.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image13.png new file mode 100644 index 00000000..1f61ae44 Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image13.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image14.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image14.png new file mode 100644 index 00000000..1640f1d2 Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image14.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image15.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image15.png new file mode 100644 index 00000000..52aa4c01 Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image15.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image2.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image2.png new file mode 100644 index 00000000..cee99d1e Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image2.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image3.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image3.png new file mode 100644 index 00000000..14352db9 Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image3.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image4.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image4.png new file mode 100644 index 00000000..3d3e8081 Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image4.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image5.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image5.png new file mode 100644 index 00000000..1baf5a8f Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image5.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image6.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image6.png new file mode 100644 index 00000000..da971db5 Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image6.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image7.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image7.png new file mode 100644 index 00000000..b4eee0de Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image7.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image8.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image8.png new file mode 100644 index 00000000..298222e4 Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image8.png differ diff --git a/course/2022/lecture-2-development-infrastructure-and-tooling/media/image9.png b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image9.png new file mode 100644 index 00000000..bda0f03b Binary files /dev/null and b/course/2022/lecture-2-development-infrastructure-and-tooling/media/image9.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/index.html b/course/2022/lecture-3-troubleshooting-and-testing/index.html new file mode 100644 index 00000000..26cbeff0 --- /dev/null +++ b/course/2022/lecture-3-troubleshooting-and-testing/index.html @@ -0,0 +1,3203 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 3: Troubleshooting & Testing - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Lecture 3: Troubleshooting & Testing

+
+ +
+ +

Lecture by Charles Frye.
+Notes by James Le and Vishnu Rachakonda.
+Published August 22, 2022. +Download slides.

+

1 - Testing Software

+
    +
  1. +

    The general approach is that tests will help us ship faster with +fewer bugs, but they won't catch all of our bugs.

    +
  2. +
  3. +

    That means we will use testing tools but won't try to achieve 100% +coverage.

    +
  4. +
  5. +

    Similarly, we will use linting tools to improve the development +experience but leave escape valves rather than pedantically +following our style guides.

    +
  6. +
  7. +

    Finally, we'll discuss tools for automating these workflows.

    +
  8. +
+

1.1 - Tests Help Us Ship Faster. They Don't Catch All Bugs

+

+

Tests are code we write that are designed to fail intelligibly when +our other code has bugs. These tests can help catch some bugs before +they are merged into the main product, but they can't catch all bugs. +The main reason is that test suites are not certificates of correctness. +In some formal systems, tests can be proof of code correctness. But we +are writing in Python (a loosely goosey language), so all bets are off +in terms of code correctness.

+

Nelson Elhage +framed test suites more like classifiers. The classification problem is: +does this commit have a bug, or is it okay? The classifier output is +whether the tests pass or fail. We can then treat test suites as a +"prediction" of whether there is a bug, which suggests a different way +of designing our test suites.

+

When designing classifiers, we need to trade off detection and false +alarms. If we try to catch all possible bugs, we can inadvertently +introduce false alarms. The classic signature of a false alarm is a +failed test - followed by a commit that fixes the test rather than the +code.

+

To avoid introducing too many false alarms, it's useful to ask yourself +two questions before adding a test:

+
    +
  1. +

    Which real bugs will this test catch?

    +
  2. +
  3. +

    Which false alarms will this test raise?

    +
  4. +
+

If you can think of more examples for the second question than the first +one, maybe you should reconsider whether you need this test.

+

One caveat is that: in some settings, correctness is important. +Examples include medical diagnostics/intervention, self-driving +vehicles, and banking/finance. A pattern immediately arises here: If you +are operating in a high-stakes situation where errors have consequences +for people's lives and livelihoods, even if it's not regulated yet, it +might be regulated soon. These are examples of low-feasibility, +high-impact ML projects discussed in the first lecture.

+

+

1.2 - Use Testing Tools, But Don't Chase Coverage

+
    +
  • +

    Pytest is the standard +tool for testing Python code. It has a Pythonic implementation and +powerful features such as creating separate suites, sharing +resources across tests, and running parametrized variations of +tests.

    +
  • +
  • +

    Pure text docs can't be checked for correctness automatically, so +they are hard to maintain or trust. Python has a nice module, +[doctests], +for checking code in the documentation and preventing rot.

    +
  • +
  • +

    Notebooks help connect rich media (charts, images, and web pages) +with code execution. A cheap and dirty solution to test notebooks +is adding some asserts and using nbformat to run the +notebooks.

    +
  • +
+

+

Once you start adding different types of tests and your codebase grows, +you will want coverage tools for recording which code is checked or +"covered" by tests. Typically, this is done in lines of code, but some +tools can be more fine-grained. We recommend +Codecov, which generates nice +visualizations you can use to drill down and get a high-level overview +of the current state of your testing. Codecov helps you understand your +tests and can be incorporated into your testing. You can say you want to +reject commits not only where tests fail, but also where test coverage +goes down below a certain threshold.

+

However, we recommend against that. Personal experience, interviews, and +published research suggest that only a small fraction of the tests you +write will generate most of your value. The right tactic, +engineering-wise, is to expand the limited engineering effort we have on +the highest-impact tests and ensure that those are super high quality. +If you set a coverage target, you will instead write tests in order to +meet that coverage target (regardless of their quality). You end up +spending more effort to write tests and deal with their low quality.

+

+

1.3 - Use Linting Tools, But Leave Escape Valves

+

Clean code is of uniform and standard style.

+
    +
  1. +

    Uniform style helps avoid spending engineering time on arguments +over style in pull requests and code review. It also helps improve +the utility of our version control by cutting down on noisy +components of diffs and reducing their size. Both benefits make it +easier for humans to visually parse the diffs in our version +control system and make it easier to build automation around them.

    +
  2. +
  3. +

    Standard style makes it easier to accept contributions for an +open-source repository and onboard new team members for a +closed-source system.

    +
  4. +
+

+

One aspect of consistent style is consistent code formatting (with +things like whitespace). The standard tool for that in Python is +[the] [black] Python +formatter. It's a very +opinionated tool with a fairly narrow scope in terms of style. It +focuses on things that can be fully automated and can be nicely +integrated into your editor and automated workflows.

+

For non-automatable aspects of style (like missing docstrings), we +recommend [flake8]. It comes +with many extensions and plugins such as docstring completeness, type +hinting, security, and common bugs.

+

ML codebases often have both Python code and shell scripts in them. +Shell scripts are powerful, but they also have a lot of sharp edges. +shellcheck knows all the +weird behaviors of bash that often cause errors and issues that aren't +immediately obvious. It also provides explanations for why it's raising +a warning or an error. It's very fast to run and can be easily +incorporated into your editor.

+

+

One caveat to this is: pedantic enforcement of style is obnoxious. +To avoid frustration with code style and linting, we recommend:

+
    +
  1. +

    Filtering rules down to the minimal style that achieves the goals we +set out (sticking with standards, avoiding arguments, keeping +version control history clean, etc.)

    +
  2. +
  3. +

    Having an "opt-in" application of rules and gradually growing +coverage over time - which is especially important for existing +codebases (which may have thousands of lines of code that we need +to be fixed).

    +
  4. +
+

1.4 - Always Be Automating

+

To make the best use of testing and linting practices, you want to +automate these tasks and connect to your cloud version control system +(VCS). Connecting to the VCS state reduces friction when trying to +reproduce or understand errors. Furthermore, running things outside of +developer environments means that you can run tests automatically in +parallel to other development work.

+

Popular, open-source repositories are the best place to learn about +automation best practices. For instance, the PyTorch Github library has +tons of automated workflows built into the repo - such as workflows that +automatically run on every push and pull.

+

+

The tool that PyTorch uses (and that we recommend) is GitHub +Actions, which ties +automation directly to VCS. It is powerful, flexible, performant, and +easy to use. It gets great documentation, can be used with a YAML file, +and is embraced by the open-source community. There are other options +such as pre-commit.ci, +CircleCI, and +Jenkins; but GitHub Actions +seems to have won the hearts and minds in the open-source community in +the last few years.

+

To keep your version control history as clean as possible, you want to +be able to run tests and linters locally before committing. We recommend +pre-commit +to enforce hygiene checks. You can use it to run formatting, linting, +etc. on every commit and keep the total runtime to a few seconds. +pre-commit is easy to run locally and easy to automate with GitHub +Actions.

+

Automation to ensure the quality and integrity of our software is a +productivity enhancer. That's broader than just CI/CD. Automation +helps you avoid context switching, surfaces issues early, is a force +multiplier for small teams, and is better documented by default.

+

One caveat is that: automation requires really knowing your tools. +Knowing Docker well enough to use it is not the same as knowing Docker +well enough to automate it. Bad automation, like bad tests, takes more +time than it saves. Organizationally, that makes automation a good task +for senior engineers who have knowledge of these tools, have ownership +over code, and can make these decisions around automation.

+

Summary

+
    +
  1. +

    Automate tasks with GitHub Actions to reduce friction.

    +
  2. +
  3. +

    Use the standard Python toolkit for testing and cleaning your +projects.

    +
  4. +
  5. +

    Choose testing and linting practices with the 80/20 principle, +shipping velocity, and usability/developer experience in mind.

    +
  6. +
+

2 - Testing ML Systems

+
    +
  1. +

    Testing ML is hard, but not impossible.

    +
  2. +
  3. +

    We should stick with the low-hanging fruit to start.

    +
  4. +
  5. +

    Test your code in production, but don't release bad code.

    +
  6. +
+

2.1 - Testing ML Is Hard, But Not Impossible

+

Software engineering is where many testing practices have been +developed. In software engineering, we compile source code into +programs. In machine learning, training compiles data into a model. +These components are harder to test:

+
    +
  1. +

    Data is heavier and more inscrutable than source code.

    +
  2. +
  3. +

    Training is more complex and less well-defined.

    +
  4. +
  5. +

    Models have worse tools for debugging and inspection than compiled +programs.

    +
  6. +
+

In this section, we will focus primarily on "smoke" tests. These tests +are easy to implement and still effective. They are among the 20% of +tests that get us 80% of the value.

+

2.2 - Use Expectation Testing on Data

+

We test our data by checking basic properties. We express our +expectations about the data, which might be things like there are no +nulls in this column or the completion date is after the start date. +With expectation testing, you will start small with only a few +properties and grow them slowly. You only want to test things that are +worth raising alarms and sending notifications to others.

+

+

We recommend +[great_expectations] for +data testing. It automatically generates documentation and quality +reports for your data, in addition to built-in logging and alerting +designed for expectation testing. To get started, check out this +MadeWithML tutorial on +great_expectations.

+

+

To move forward, you want to stay as close to the data as possible:

+
    +
  1. +

    A common pattern is that there's a benchmark dataset with +annotations (in academia) or an external annotation team (in the +industry). A lot of the detailed information about that data can +be extracted by simply looking at it.

    +
  2. +
  3. +

    One way for data to get internalized into the organization is that +at the start of the project, model developers annotate data ad-hoc +(especially if you don't have the budget for an external +annotation team).

    +
  4. +
  5. +

    However, if the model developers at the start of the project move on +and more developers get onboarded, that knowledge is diluted. A +better solution is an internal annotation team that has a regular +information flow with the model developers is a better solution.

    +
  6. +
  7. +

    The best practice (recommended by Shreya +Shankar) +is to have a regular on-call rotation where model developers +annotate data themselves. Ideally, these are fresh data so that +all members of the team who are developing models know about the +data and build intuition/expertise in the data.

    +
  8. +
+

2.3 - Use Memorization Testing on Training

+

Memorization is the simplest form of learning. Deep neural networks +are very good at memorizing data, so checking whether your model can +memorize a very small fraction of the full data set is a great smoke +test for training. If a model can't memorize, then something is clearly +very wrong!

+

Only really gross issues with training will show up with this test. For +example, your gradients may not be calculated correctly, you have a +numerical issue, or your labels have been shuffled; serious issues like +these. Subtle bugs in your model or your data are not going to show up. +A way to catch smaller bugs is to include the length of run time in your +test coverage. It's a good way to detect if smaller issues are making it +harder for your model to learn. If the number of epochs it takes to +reach an expected performance suddenly goes up, it may be due to a +training bug. PyTorch Lightning has an "overfit_batches" feature that +can help with this.

+

Make sure to tune memorization tests to run quickly, so you can +regularly run them. If they are under 10 minutes or some short +threshold, they can be run every PR or code change to better catch +breaking changes. A couple of ideas for speeding up these tests are +below:

+

+

Overall, these ideas lead to memorization tests that implement model +training on different time scale and allow you to mock out scenarios.

+

A solid, if expensive idea for testing training is to rerun old +training jobs with new code. It's not something that can be run +frequently, but doing so can yield lessons about what unexpected changes +might have happened in your training pipeline. The main drawback is the +potential expense of running these tests. CI platforms like +CircleCI charge a great deal for +GPUs, while others like Github Actions don't offer access to the +relevant machines easily.

+

The best option for testing training is to regularly run training with +new data that's coming in from production. This is still expensive, +but it is directly related to improvements in model development, not +just testing for breakages. Setting this up requires a data flywheel +similar to what we talked about in Lecture 1. Further tooling needed to +achieve will be discussed down the line.

+

2.4 - Adapt Regression Testing for Models

+

Models are effectively functions. They have inputs and produce +outputs like any other function in code. So, why not test them like +functions with regression testing? For specific inputs, we can check to +see whether the model consistently returns the same outputs. This is +best done with simpler models like classification models. It's harder to +maintain such tests with more complex models. However, even in a more +complex model scenario, regression testing can be useful for comparing +changes from training to production.

+

+

A more sophisticated approach to testing for ML models is to use loss +values and model metrics to build documented test suites out of your +data. Consider this similar to the test-driven +development +(TDD) code writing paradigm. The test that is written before your code +in TDD is akin to your model's loss performance; both represent the gap +between where your code needs to be and where it is. Over time, as we +improve the loss metric, our model is getting closer to passing "the +test" we've imposed on it. The gradient descent we use to improve the +model can be considered a TDD approach to machine learning models!

+

+

While gradient descent is somewhat like TDD, it's not exactly the same +because simply reviewing metrics doesn't tell us how to resolve model +failures (the way traditional software tests do).

+

To fill in this gap, start by looking at the data points that have +the highest loss. Flag +them for a test suite composed of "hard" examples. Doing this provides +two advantages: it helps find where the model can be improved, and it +can also help find errors in the data itself (i.e. poor labels).

+

As you examine these failures, you can aggregate types of failures into +named suites. For example in a self-driving car use case, you could have +a "night time" suite and a "reflection" suite. Building these test +suites can be considered the machine learning version of regression +testing, where you take bugs that you've observed in production and +add them to your test suite to make sure that they don't come up again.

+

+

The method can be quite manual, but there are some options for speeding +it up. Partnering with the annotation team at your company can help make +developing these tests a lot faster. Another approach is to use a method +called Domino that +uses foundation models to find errors. Additionally, for testing NLP +models, use the +CheckList approach.

+

2.5 - Test in Production, But Don't YOLO

+

It's crucial to test in true production settings. This is especially +true for machine learning models, because data is an important component +of both the production and the development environments. It's difficult +to ensure that both are very close to one another.

+

The best way to solve the training and production difference is to +test in production.

+

Testing in production isn't sufficient on its own. Rather, testing in +production allows us to develop tooling and infrastructure that allows +us to resolve production errors quickly (which are often quite +expensive). It reduces pressure on other kinds of testing, but does not +replace them.

+

+

We will cover in detail the tooling needed for production monitoring and +continual learning of ML systems in a future lecture.

+

2.6 - ML Test Score

+

So far, we have discussed writing "smoke" tests for ML: expectation +tests for data, memorization tests for training, and regression tests +for models.

+

As your code base and team mature, adopt a more full-fledged approach +to testing ML systems like the approach identified in the ML Test +Score paper. The +ML Test Score is a rubric that evolved out of machine learning efforts +at Google. It's a strict rubric for ML test quality that covers data, +models, training, infrastructure, and production monitoring. It overlaps +with, but goes beyond some of the recommendations we've offered.

+

+

It's rather expensive, but worth it for high stakes use cases that need +to be really well-engineered! To be really clear, this rubric is +really strict. Even our Text Recognizer system we've designed so far +misses a few categories. Use the ML Test Score as inspiration to develop +the right testing approach that works for your team's resources and +needs.

+

+

3 - Troubleshooting Models

+

Tests help us figure out something is wrong, but troubleshooting is +required to actually fix broken ML systems. Models often require the +most troubleshooting, and in this section we'll cover a three step +approach to troubleshooting them.

+
    +
  1. +

    "Make it run" by avoiding common errors.

    +
  2. +
  3. +

    "Make it fast" by profiling and removing bottlenecks.

    +
  4. +
  5. +

    "Make it right" by scaling model/data and sticking with proven +architectures.

    +
  6. +
+

3.1 - Make It Run

+

This is the easiest step for models; only a small portion of bugs cause +the kind of loud failures that prevent a model from running at all. +Watch out for these bugs in advance and save yourself the trouble of +models that don't run.

+

The first type of bugs that prevent models from running at all are +shape errors. When the shape of the tensors don't match for the +operations run on them, models can't be trained or run. Prevent these +errors by keeping notes on the expected size of tensors, annotate the +sizes in the code, and even step through your model code with a debugger +to check tensor size as you go.

+

+

The second type of bugs is out of memory errors. This occurs when +you try to push a tensor to a GPU that is too large to fit. PyTorch +Lightning has good tools to prevent this. Make sure you're using the +lowest precision your training can tolerate; a good default is 16 bit +precision. Another common reason for this is trying to run a model on +too much data or too large a batch size. Use the autoscale batch size +feature in PyTorch Lightning to pick the right size batch. You can use +gradient accumulation if these batch sizes get too small. If neither of +these options work, you can look into manual techniques like tensor +parallelism and gradient checkpoints.

+

Numerical errors also cause machine learning failures. This is when +NaNs or infinite values show up in tensors. These issues most commonly +appear first in the gradient and then cascade through the model. PyTorch +Lightning has a good tool for tracking and logging gradient norms. A +good tip to check whether these issues are caused by precision issues is +to switch to Python 64 bit floats and see if that causes these issues to +go away. Normalization layers tend to cause these issues, generally +speaking. So watch out for how you do normalization!

+

3.2 - Make It Fast

+

+

Once you can run a model, you'll want it to run fast. This can be tricky +because the performance of DNN training code is very counterintuitive. +For example, transformers can actually spend more time in the MLP layer +than the attention layer. Similarly, trivial components like loading +data can soak up performance.

+

To solve these issues, the primary solution is to roll up your sleeves +and profile your code. You can often find pretty easy Python changes +that yield big results. Read these two tutorials by +Charles +and Horace for more +details.

+

3.3 - Make It Right

+

After you make it run fast, make the model right. Unlike traditional +software, machine learning models never are truly perfect. Production +performance is never perfect. As such, it might be more appropriate to +say "make it as right as needed".

+

Knowing this, making the model run and run fast allows us to make the +model right through applying scale. To achieve performance benefits, +scaling a model or its data are generally fruitful and achievable +routes. It's a lot easier to scale a fast model. Research from OpenAI +and other institutions +is showing that benefits from scale can be rigorously measured and +predicted across compute budget, dataset size, and parameter count.

+

+

If you can't afford to scale yourself, consider finetuning a model +trained at scale for your task.

+

So far, all of the advice given has been model and task-agnostic. +Anything more detailed has to be specific to the model and the relevant +task. Stick close to working architectures and hyperparameters from +places like HuggingFace, and try not to reinvent the wheel!

+

4 - Resources

+

Here are some helpful resources that discuss this topic.

+

Tweeters

+
    +
  1. +

    Julia Evans

    +
  2. +
  3. +

    Charity Majors

    +
  4. +
  5. +

    Nelson Elhage

    +
  6. +
  7. +

    kipply

    +
  8. +
  9. +

    Horace He

    +
  10. +
  11. +

    Andrej Karpathy

    +
  12. +
  13. +

    Chip Huyen

    +
  14. +
  15. +

    Jeremy Howard

    +
  16. +
  17. +

    Ross Wightman

    +
  18. +
+

Templates

+
    +
  1. +

    Lightning Hydra +Template

    +
  2. +
  3. +

    NN Template

    +
  4. +
  5. +

    Generic Deep Learning Project +Template

    +
  6. +
+

Texts

+
    +
  1. +

    Reliable ML Systems +talk

    +
  2. +
  3. +

    "ML Test Score" +paper

    +
  4. +
  5. +

    "Attack of the Cosmic +Rays!"

    +
  6. +
  7. +

    "Computers can be +understood"

    +
  8. +
  9. +

    "Systems that defy detailed +understanding"

    +
  10. +
  11. +

    Testing section from MadeWithML course on +MLOps

    +
  12. +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image1.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image1.png new file mode 100644 index 00000000..fc117596 Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image1.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image10.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image10.png new file mode 100644 index 00000000..3441ca32 Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image10.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image11.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image11.png new file mode 100644 index 00000000..e0964ec3 Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image11.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image12.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image12.png new file mode 100644 index 00000000..0539b56e Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image12.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image13.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image13.png new file mode 100644 index 00000000..c7ec6a26 Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image13.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image14.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image14.png new file mode 100644 index 00000000..859b53f2 Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image14.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image15.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image15.png new file mode 100644 index 00000000..9a1b0742 Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image15.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image16.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image16.png new file mode 100644 index 00000000..25d30d7c Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image16.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image17.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image17.png new file mode 100644 index 00000000..65e051e5 Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image17.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image18.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image18.png new file mode 100644 index 00000000..5b3e3115 Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image18.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image19.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image19.png new file mode 100644 index 00000000..ccb8583b Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image19.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image2.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image2.png new file mode 100644 index 00000000..a224f28f Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image2.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image3.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image3.png new file mode 100644 index 00000000..20ac8a10 Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image3.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image4.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image4.png new file mode 100644 index 00000000..9fdd71f9 Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image4.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image5.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image5.png new file mode 100644 index 00000000..2242748b Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image5.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image6.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image6.png new file mode 100644 index 00000000..8f6a87cd Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image6.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image7.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image7.png new file mode 100644 index 00000000..008cdf79 Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image7.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image8.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image8.png new file mode 100644 index 00000000..4a219b5a Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image8.png differ diff --git a/course/2022/lecture-3-troubleshooting-and-testing/media/image9.png b/course/2022/lecture-3-troubleshooting-and-testing/media/image9.png new file mode 100644 index 00000000..318e9549 Binary files /dev/null and b/course/2022/lecture-3-troubleshooting-and-testing/media/image9.png differ diff --git a/course/2022/lecture-4-data-management/index.html b/course/2022/lecture-4-data-management/index.html new file mode 100644 index 00000000..78c230bc --- /dev/null +++ b/course/2022/lecture-4-data-management/index.html @@ -0,0 +1,2865 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 4: Data Management - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Lecture 4: Data Management

+
+ +
+ +

Lecture by Sergey Karayev.
+Notes by James Le and Vishnu Rachakonda.
+Published August 29, 2022. +Download slides.

+

1 - Introduction

+

One thing people don't quite get as they enter the field of ML is how +much of it deals with data - putting together datasets, exploring the +data, wrangling the data, etc. The key points of this lecture are:

+
    +
  1. +

    Spend 10x as much time exploring the data as you would like to.

    +
  2. +
  3. +

    Fixing, adding, and augmenting the data is usually the best way to +improve performance.

    +
  4. +
  5. +

    Keep it all simple!

    +
  6. +
+

2 - Data Sources

+

+

There are many possibilities for the sources of data. You might have +images, text files, logs, or database records. In deep learning, you +need to get that data into a local filesystem disk next to a GPU. How +you send data from the sources to training is different for each +project.

+
    +
  • +

    With images, you can simply download them from S3.

    +
  • +
  • +

    With text files, you need to process them in some distributed way, +analyze the data, select a subset, and put that on a local +machine.

    +
  • +
  • +

    With logs and database records, you can use a data lake to aggregate +and process the data.

    +
  • +
+

+

The basics will be the same - a filesystem, object storage, and +databases.

+

Filesystem

+

The filesystem is a fundamental abstraction. Its fundamental unit is +a file - which can be text or binary, is not versioned, and is easily +overwritten. The filesystem is usually on a disk connected to your +machine - physically connected on-prem, attached in the cloud, or even +distributed.

+

The first thing to know about discs is that their speed and bandwidth +range - from hard discs to solid-state discs. There are two orders of +magnitude differences between the slowest (SATA SSD) and the fastest +(NVMe SSD) discs. Below are some latency numbers you should know, with +the human-scale numbers in parentheses:

+

+

What formats should the data be stored on the local disc?

+
    +
  • +

    If you work with binary data like images and audio, just use the +standard formats like JPEG or MP3 that it comes in.

    +
  • +
  • +

    If you work with metadata (like labels), tabular data, or text data, +then compressed JSON or text files are just fine. Alternatively, +Parquet is a table format that is fast, compact, and widely used.

    +
  • +
+

Object Storage

+

The object storage is an API over the filesystem. Its fundamental +unit is an object, usually in a binary format (an image, a sound file, a +text file, etc.). We can build versioning or redundancy into the object +storage service. It is not as fast as the local filesystem, but it can be +fast enough within the cloud.

+

Databases

+

Databases are persistent, fast, and scalable storage and retrieval +of structured data systems. A helpful mental model for this is: all the +data that the databases hold is actually in the computer\'s RAM, but the +database software ensures that if the computer gets turned off, +everything is safely persisted to disk. If too much data is in the RAM, +it scales out to disk in a performant way.

+

You should not store binary data in the database but the object-store +URLs instead. Postgres is +the right choice most of the time. It is an open-source database that +supports unstructured JSON and queries over that JSON. +SQLite is also perfectly good +for small projects.

+

Most coding projects that deal with collections of objects that +reference each other will eventually implement a crappy database. Using +a database from the beginning with likely save you time. In fact, most +MLOps tools are databases at their core (e.g., +W&B is a database of experiments, +HuggingFace Hub is a +database of models, and Label +Studio is a database of labels).

+

+

Data warehouses are stores for online analytical processing (OLAP), +as opposed to databases being the data stores for online transaction +processing (OLTP). You get data into the data warehouse through a +process called ETL (Extract-Transform-Load): Given a number of data +sources, you extract the data, transform it into a uniform schema, and +load it into the data warehouse. From the warehouse, you can run +business intelligence queries. The difference between OLAP and OLTP is +that: OLAPs are column-oriented, while OLTPs are row-oriented.

+

+

Data lakes are unstructured aggregations of data from multiple +sources. The main difference between them and data warehouses is that +data lakes use ELT (Extract-Load-Transform) process: dumping all the +data in and transforming them for specific needs later.

+

The big trend is unifying both data lake and data warehouse, so that +structured data and unstructured data can live together. The two big +platforms for this are +Snowflake and +Databricks. If you are +really into this stuff, "Designing Data-Intensive +Applications" is a great book +that walks through it from first principles.

+

3 - Data Exploration

+

+

To explore the data, you must speak its language, mostly SQL and, +increasingly, DataFrame. SQL is the standard interface for +structured data, which has existed for decades. Pandas is the main +DataFrame in the Python ecosystem that lets you do SQL-like things. Our +advice is to become fluent in both to interact with both transactional +databases and analytical warehouses and lakes.

+

Pandas is the workhorse of +Python data science. You can try DASK +DataFrame to +parallelize Pandas operations over cores and +RAPIDS to do Pandas operations on +GPUs.

+

4 - Data Processing

+

+

Talking about data processing, it's useful to have a motivational +example. Let's say we have to train a photo popularity predictor every +night. For each photo, the training data must include:

+
    +
  1. +

    Metadata (such as posting time, title, and location) that sits in +the database.

    +
  2. +
  3. +

    Some features of the user (such as how many times they logged in +today) that are needed to be computed from logs.

    +
  4. +
  5. +

    Outputs of photo classifiers (such as content and style) that are +needed to run the classifiers.

    +
  6. +
+

Our ultimate task is to train the photo predictor model, but we need to +output data from the database, compute the logs, and run classifiers to +output their predictions. As a result, we have task dependencies. +Some tasks can't start until others are finished, so finishing a task +should kick off its dependencies.

+

Ideally, dependencies are not always files but also programs and +databases. We should be able to spread this work over many machines and +execute many dependency graphs all at once.

+

+
    +
  • +

    Airflow is a standard +scheduler for Python, where it's possible to specify the DAG +(directed acyclic graph) of tasks using Python code. The operator +in that graph can be SQL operations or Python functions.

    +
  • +
  • +

    To distribute these jobs, the workflow manager has a queue for the +tasks and manages the workers that pull from them. It will restart +jobs if they fail and ping you when the jobs are finished.

    +
  • +
  • +

    Prefect and +Dagster are contenders to +improve and replace Airflow in the long run.

    +
  • +
+

The primary advice here is not to over-engineer things. You can get +machines with many CPU cores and a lot of RAM nowadays. For example, +UNIX has powerful parallelism, streaming, and highly optimized tools.

+

5 - Feature Store

+

+

Let's say your data processing generates artifacts you need for +training. How do you make sure that, in production, the trained model +sees the same processing taking place (which happened during training)? +How do you avoid recomputation during retraining?

+

Feature stores are a solution to this (that you may not need!).

+ +

6 - Datasets

+

+

What about datasets specifically made for machine learning?

+

HuggingFace +Datasets is a great +source of machine learning-ready data. There are 8000+ datasets covering +a wide variety of tasks, like computer vision, NLP, etc. The Github-Code +dataset on HuggingFace is a good example of how these datasets are +well-suited for ML applications. Github-Code can be streamed, is in the +modern Apache Parquet format, and doesn't require you to download 1TB+ +of data in order to properly work with it. Another sample dataset is +RedCaps, which consists of 12M image-text pairs from Reddit.

+

+

Another interesting dataset solution for machine learning is +Activeloop. This tool is +particularly well equipped to work with data and explore samples without +needing to download it.

+

7 - Data Labeling

+

+

No Labeling Required

+

The first thing to talk about when it comes to labeling data +is...maybe we don\'t have to label data? There are a couple of +options here we will cover.

+

Self-supervised learning is a very important idea that allows you to +avoid painstakingly labeling all of your data. You can use parts of your +data to label other parts of your data. This is very common in NLP right +now. This is further covered in the foundation model lecture. The long +and short of it is that models can have elements of their data masked +(e.g., the end of a sentence can be omitted), and models can use earlier +parts of the data to predict the masked parts (e.g., I can learn from +the beginning of the sentence and predict the end). This can even be +used across modalities (e.g., computer vision and text), as OpenAI +CLIP demonstrates.

+

+

Image data augmentation is an almost compulsory technique to adopt, +especially for vision tasks. Frameworks like +torchvision help with +this. In data augmentation, samples are modified (e.g., brightened) +without actually changing their core "meaning." Interestingly, +augmentation can actually replace labels. +SimCLR +is a model that demonstrates this - where its learning objective is to +maximize agreement between augmented views of the same image and +minimize agreement between different images.

+

For other forms of data, there are a couple of augmentation tricks that +can be applied. You can delete some cells in tabular data to simulate +missing data. In text, there aren't established techniques, but ideas +include changing the order of words or deleting words. In speech, you +could change the speed, insert pauses, etc.

+

Synthetic data is an underrated idea. You can synthesize data based +on your knowledge of the label. For example, you can create +receipts +if your need is to learn how to recognize receipts from images. This can +get very sophisticated and deep, so tread carefully.

+

You can also get creative and ask your users to label data for you. +Google Photos, as any user of the app knows, regularly gets users to +label images about where people in photos are the same or different.

+

+

This is an example of the data flywheel. Improving the data allows the +user to improve the model, which in turn makes their product experience +better.

+

Labeling Solutions

+

These are all great options for avoiding labeling data. However, +you'll usually have to label some data to get started.

+

Labeling has standard annotation features, like bounding boxes, that +help capture information properly. Training annotators properly is more +important than the particular kind of annotation. Standardizing how +annotators approach a complex, opinable task is crucial. Labeling +guidelines can help capture the exact right label from an annotator. +Quality assurance is key to ensuring annotation and labeling are +happening properly.

+

There are a few options for sourcing labor for annotations:

+
    +
  1. +

    Full-service data labeling vendors offer end-to-end labeling +solutions.

    +
  2. +
  3. +

    You can hire and train annotators yourself.

    +
  4. +
  5. +

    You can crowdsource annotation on a platform like Mechanical Turk.

    +
  6. +
+

Full-service companies offer a great solution that abstracts the need +to build software, manage labor, and perform quality checks. It makes +sense to use one. Before settling on one, make sure to dedicate time to +vet several. Additionally, label some gold standard data yourself to +understand the data yourself and to evaluate contenders. Take calls with +several contenders, ask for work samples on your data, and compare them +to your own labeling performance.

+
    +
  • +

    Scale AI is the dominant data +labeling solution. It offers an API that allows you to spin up +tasks.

    +
  • +
  • +

    Additional contenders include +Labelbox and +Supervisely.

    +
  • +
  • +

    LabelStudio is an open-source +solution for performing annotation yourself, with a companion +enterprise version. It has a great set of features that allow you +to design your interface and even plug-in models for active +learning!

    +
  • +
  • +

    Diffgram is a competitor to +Label Studio.

    +
  • +
  • +

    Recent offerings, like +Aquarium and +Scale Nucleus, have +started to help concentrate labeling efforts on parts of the +dataset that are most troublesome for models.

    +
  • +
  • +

    Snorkel is a dataset management +and labeling platform that uses weak supervision, which is a +similar concept. You can leverage composable rules (e.g., all +sentences that have the term "amazing" are positive sentiments) +that allow you to quickly label data faster than if you were to +treat every data point the same.

    +
  • +
+

In conclusion, try to avoid labeling using techniques like +self-supervised learning. If you can't, use labeling software and +eventually outsource the work to the right vendor. If you can't afford +vendors, consider hiring part-time work rather than crowdsourcing the +work to ensure quality.

+

8 - Data Versioning

+

+

Data versioning comes with a spectrum of approaches:

+
    +
  1. +

    Level 0 is bad. In this case, data just lives on some file system. +In these cases, the issue arises because the models are +unversioned since their data is unversioned. Models are part code, +part data. This will lead to the consequence of being unable to +get back to a previous level of performance if need be.

    +
  2. +
  3. +

    You can prevent this event with Level 1, where you snapshot your +data each time you train. This somewhat works but is far from +ideal.

    +
  4. +
  5. +

    In Level 2, data is versioned like code, as a commingled asset with +versioned code. You can use a system like +git-lfs that allows +you to store large data assets alongside code. This works really +well!

    +
  6. +
  7. +

    Level 3 involves specialized solutions for working with large data +files, but this may not be needed unless you have a very specific +need (i.e., uniquely large or compliance-heavy files).

    +
  8. +
+

+

DVC is a great tool for this. DVC +helps upload your data asset to a remote storage location every time you +commit changes to the data file or trigger a commit; it functions like a +fancier git-lfs. It adds features like lineage for data and model +artifacts, allowing you to recreate pipelines.

+

Several techniques are associated with privacy-controlled data, like +federated +learning, +differential privacy, and learning on encrypted data. These techniques +are still in research, so they aren't quite ready for an FSDL +recommendation.

+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lecture-4-data-management/media/image1.png b/course/2022/lecture-4-data-management/media/image1.png new file mode 100644 index 00000000..a1c5b8ab Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image1.png differ diff --git a/course/2022/lecture-4-data-management/media/image10.png b/course/2022/lecture-4-data-management/media/image10.png new file mode 100644 index 00000000..55bf7d16 Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image10.png differ diff --git a/course/2022/lecture-4-data-management/media/image11.png b/course/2022/lecture-4-data-management/media/image11.png new file mode 100644 index 00000000..84a1dd7a Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image11.png differ diff --git a/course/2022/lecture-4-data-management/media/image12.png b/course/2022/lecture-4-data-management/media/image12.png new file mode 100644 index 00000000..2469276f Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image12.png differ diff --git a/course/2022/lecture-4-data-management/media/image13.png b/course/2022/lecture-4-data-management/media/image13.png new file mode 100644 index 00000000..94424cb0 Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image13.png differ diff --git a/course/2022/lecture-4-data-management/media/image14.png b/course/2022/lecture-4-data-management/media/image14.png new file mode 100644 index 00000000..858f0a45 Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image14.png differ diff --git a/course/2022/lecture-4-data-management/media/image15.png b/course/2022/lecture-4-data-management/media/image15.png new file mode 100644 index 00000000..0ab3ffc5 Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image15.png differ diff --git a/course/2022/lecture-4-data-management/media/image16.png b/course/2022/lecture-4-data-management/media/image16.png new file mode 100644 index 00000000..088e709b Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image16.png differ diff --git a/course/2022/lecture-4-data-management/media/image2.png b/course/2022/lecture-4-data-management/media/image2.png new file mode 100644 index 00000000..439fbba3 Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image2.png differ diff --git a/course/2022/lecture-4-data-management/media/image3.png b/course/2022/lecture-4-data-management/media/image3.png new file mode 100644 index 00000000..d4e35be3 Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image3.png differ diff --git a/course/2022/lecture-4-data-management/media/image4.png b/course/2022/lecture-4-data-management/media/image4.png new file mode 100644 index 00000000..790a52de Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image4.png differ diff --git a/course/2022/lecture-4-data-management/media/image5.png b/course/2022/lecture-4-data-management/media/image5.png new file mode 100644 index 00000000..b8070072 Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image5.png differ diff --git a/course/2022/lecture-4-data-management/media/image6.png b/course/2022/lecture-4-data-management/media/image6.png new file mode 100644 index 00000000..ddb1005c Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image6.png differ diff --git a/course/2022/lecture-4-data-management/media/image7.png b/course/2022/lecture-4-data-management/media/image7.png new file mode 100644 index 00000000..a9fc907d Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image7.png differ diff --git a/course/2022/lecture-4-data-management/media/image8.png b/course/2022/lecture-4-data-management/media/image8.png new file mode 100644 index 00000000..24324432 Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image8.png differ diff --git a/course/2022/lecture-4-data-management/media/image9.png b/course/2022/lecture-4-data-management/media/image9.png new file mode 100644 index 00000000..389ea253 Binary files /dev/null and b/course/2022/lecture-4-data-management/media/image9.png differ diff --git a/course/2022/lecture-5-deployment/index.html b/course/2022/lecture-5-deployment/index.html new file mode 100644 index 00000000..01f73cbd --- /dev/null +++ b/course/2022/lecture-5-deployment/index.html @@ -0,0 +1,3563 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 5: Deployment - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Lecture 5: Deployment

+
+ +
+ +

Lecture by Josh Tobin.
+Notes by James Le and Vishnu Rachakonda.
+Published September 5, 2022. +Download slides.

+

Introduction

+

+

Deploying models is a critical part of making your models good, to begin +with. When you only evaluate the model offline, it's easy to miss the +more subtle flaws that the model has, where it doesn't actually solve +the problem that your users need it to solve. Oftentimes, when we deploy +a model for the first time, only then do we really see whether that +model is actually doing a good job or not. Unfortunately, for many data +scientists and ML engineers, model deployment is an afterthought +relative to other techniques we have covered.

+

Much like other parts of the ML lifecycle, we'll focus on deploying a +minimum viable model as early as possible, which entails keeping it +simple and adding complexity later. Here is the process that this +lecture covers:

+
    +
  • +

    Build a prototype

    +
  • +
  • +

    Separate your model and UI

    +
  • +
  • +

    Learn the tricks to scale

    +
  • +
  • +

    Consider moving your model to the edge when you really need to go +fast

    +
  • +
+

1 - Build a Prototype To Interact With

+

There are many great tools for building model prototypes. +HuggingFace has some tools +built into its playground. They have also recently acquired a startup +called Gradio, which makes it easy +to wrap a small UI around the model. +Streamlit is another good option +with a bit more flexibility.

+

+

Here are some best practices for prototype deployment:

+
    +
  1. +

    Have a basic UI: The goal at this stage is to play around with +the model and collect feedback from other folks. Gradio and +Streamlit are your friends here - often as easy as adding a couple +of lines of code to create a simple interface for the model.

    +
  2. +
  3. +

    Put it behind a web URL: An URL is easier to share. Furthermore, +you will start thinking about the tradeoffs you'll be making when +dealing with more complex deployment schemes. There are cloud +versions of Streamlit +and HuggingFace for this.

    +
  4. +
  5. +

    Do not stress it too much: You should not take more than a day +to build a prototype.

    +
  6. +
+

A model prototype won't be your end solution to deploy. Firstly, a +prototype has limited frontend flexibility, so eventually, you want to +be able to build a fully custom UI for the model. Secondly, a prototype +does not scale to many concurrent requests. Once you start having users, +you'll hit the scaling limits quickly.

+

+

Above is an abstract diagram of how your application might look. The +client is your user's device that interacts with your application. +This device can be a browser, a vehicle, or a mobile phone. This device +calls over a network to a server. The server talks to a database +(where data is stored), used to power the application.

+

+

There are different ways of structuring your application to fit an ML +model inside. The prototype approach mentioned in the beginning fits +into the model-in-service approach - where your hosted web server +has a packaged version of the model sitting inside it. This pattern has +pros and cons.

+

The biggest pro is that if you are doing something complex, you get to +reuse your existing infrastructure. It does not require you as a model +developer to set up new things from scratch.

+

However, there is a number of pronounced cons:

+
    +
  1. +

    Your web server may be written in a different language, so +getting your model into that language can be difficult.

    +
  2. +
  3. +

    Models may change more frequently than server code (especially +early in the lifecycle of building your model). If you have a +well-established application and a nascent model, you do not want +to redeploy the entire application every time that you make an +update to the model (sometimes multiple updates per day).

    +
  4. +
  5. +

    If you have a large model to run inference on, you'll have to load +that model on your web server. Large models can eat into the +resources for your web server. That might affect the user +experience for people using that web server, even if they are not +interacting with the model.

    +
  6. +
  7. +

    Server hardware is generally not optimized for ML workloads. In +particular, you rarely will have a GPU on these devices.

    +
  8. +
  9. +

    Your model and application may have different scaling +properties, so you might want to be able to scale them +differently.

    +
  10. +
+

2 - Separate Your Model From Your UI

+

2.1 - Batch Prediction

+

+

The first pattern to pull your model from your UI is called batch +prediction. You get new data in and run your model on each data point. +Then, you save the results of each model inference into a database. This +can work well under some circumstances. For example, if there are not a +lot of potential inputs to the model, you can re-run your model on some +frequency (every hour, every day, or every week). You can have +reasonably fresh predictions to return to those users that are stored in +your database. Examples of these problems include the early stages of +building recommender systems and internal-facing tools like marketing +automation.

+

To run models on a schedule, you can leverage the data processing and +workflow tools mentioned in our previous lecture on data management. You +need to re-run data processing, load the model, run predictions, and +store those predictions in your database. This is exactly a Directed +Acyclic Graph workflow of data operations that tools like +Dagster, +Airflow, or +Prefect are designed to solve. +It's worth noting that there are also tools like +Metaflow that are designed more +for ML or data science use cases that might be potentially even an +easier way to get started.

+

Let's visit the pros and cons of this batch prediction pattern. Starting +with the pros:

+
    +
  1. +

    Batch prediction is simple to implement since it reuses existing +batch processing tools that you may already be using for training +your model.

    +
  2. +
  3. +

    It scales very easily because databases have been engineered for +decades for such a purpose.

    +
  4. +
  5. +

    Even though it looks like a simple pattern, it has been used in +production by large-scale production systems for years. This is +a tried-and-true pattern you can run and be confident that it'll +work well.

    +
  6. +
  7. +

    It is fast to retrieve the prediction since the database is +designed for the end application to interact with.

    +
  8. +
+

Switching to the cons:

+
    +
  1. +

    Batch prediction doesn't scale to complex input types. For +instance, if the universe of inputs is too large to enumerate +every single time you need to update your predictions, this won't +work.

    +
  2. +
  3. +

    Users won't be getting the most up-to-date predictions from your +model. If the feature that goes into your model changes every +hour, minute, or subsecond, but you only run your batch prediction +job every day, the predictions your users see might be slightly +stale.

    +
  4. +
  5. +

    Models frequently become "stale." If your batch jobs fail for +some reason, it can be hard to detect these problems.

    +
  6. +
+

2.2 - Model-as-Service

+

The second pattern is called model-as-service: we run the model +online as its own service. The service is going to interact with the +backend or the client itself by making requests to the model service and +receiving responses back.

+

+

The pros of this pattern are:

+
    +
  1. +

    Dependability - model bugs are less likely to crash the web +application.

    +
  2. +
  3. +

    Scalability - you can choose optimal hardware for the model and +scale it appropriately.

    +
  4. +
  5. +

    Flexibility - you can easily reuse a model across multiple +applications.

    +
  6. +
+

The cons of this pattern are:

+
    +
  1. +

    Since this is a separate service, you add a network call when your +server or client interacts with the model. That can add +latency to your application.

    +
  2. +
  3. +

    It also adds infrastructural complexity because you are on the +hook for hosting and managing a separate service.

    +
  4. +
+

Even with these cons, the model-as-service pattern is still a sweet +spot for most ML-powered products since you really need to be able to +scale independently of the application in most complex use cases. We'll +walk through the basic components of building your model service - +including REST APIs, dependency management, performance optimization, +horizontal scaling, rollout, and managed options.

+

REST APIs

+

Rest APIs serve predictions in response to canonically-formatted +HTTP requests. There are other alternative protocols to interact with a +service that you host on your infrastructures, such as +GRPC (used in TensorFlow Serving) and +GraphQL (common in web development +but not terribly relevant to model services).

+

+

Unfortunately, there is currently no standard for formatting requests +and responses for REST API calls.

+
    +
  1. +

    Google Cloud expects a +batch of inputs structured as a list called "instances" (with keys +and values).

    +
  2. +
  3. +

    Azure expects a +list of things called "data", where the data structure itself +depends on what your model architecture is.

    +
  4. +
  5. +

    AWS Sagemaker +expects instances that are formatted differently than they are in +Google Cloud.

    +
  6. +
+

Our aspiration for the future is to move toward a standard interface +for making REST API calls for ML services. Since the types of data +that you might send to these services are constrained, we should be able +to develop a standard as an industry.

+

Dependency Management

+

Model predictions depend on code, model weights, and +dependencies. In order for your model to make a correct prediction, +all of these dependencies need to be present on your web server. +Unfortunately, dependencies are a notorious cause of trouble as it is +hard to ensure consistency between your development environment and your +server. It is also hard to update since even changing a TensorFlow +version can change your model.

+

At a high level, there are two strategies for managing dependencies:

+
    +
  1. +

    Constrain the dependencies for your model by saving your model +in an agnostic format that can be run anywhere.

    +
  2. +
  3. +

    Use containers to constrain the entire inference program.

    +
  4. +
+

+
Constraining Model Dependencies
+

The primary way to constrain the dependencies of just your model is +through a library called ONNX - the +Open Neural Network Exchange. The goal of ONNX is to be an +interoperability standard for ML models. The promise is that you can +define a neural network in any language and run it consistently +anywhere. The reality is that since the underlying libraries used to +build these models change quickly, there are often bugs in the +translation layer, which creates even more problems to solve for you. +Additionally, ONNX doesn't deal with non-library code such as feature +transformations.

+
Containers
+

To understand how to manage dependencies with containers, we need to +understand the differences between Docker and Virtual +Machines, +how Docker images are built via Docker files and constructed via layers, +the ecosystem around Docker, and specific wrappers around Docker that +you can use for ML.

+

+

In a virtual machine, you package up the entire operating system +(OS) as well as the libraries and applications that are built on top of +that OS. A virtual machine tends to be very heavyweight because the OS +itself has a lot of code and is expensive to run. A container such +as Docker removes that need by packaging the libraries and applications +together. A Docker engine that runs on top of your OS knows how to +virtualize the OS and run the libraries/applications.

+

By virtue of being lightweight, Docker is used differently than how +Virtual Machines were used. A common pattern is to spin up a new +Docker container +for every discrete task. For example, a web application might have four +containers: a web server, a database, a job queue, and a worker. These +containers are run together as part of an orchestration system.

+

+

Docker containers are created from Docker +files. +Each Docker file runs a sequence of steps to define the environment +where you will run your code. Docker also allows you to build, store, +and pull Docker containers from a Docker Hub that is hosted on some +other servers or your cloud. You can experiment with a code environment +that is on your local machine but will be identical to the environment +you deploy on your server.

+

Docker is separated into three different +components:

+
    +
  1. +

    The client is where you'll be running on your laptop to build an +image from a Dockerfile that you define locally using some +commands.

    +
  2. +
  3. +

    These commands are executed by a Docker Host, which can run on +either your laptop or your server (with more storage or more +performance).

    +
  4. +
  5. +

    That Docker Host talks to a registry - which is where all the +containers you might want to access are stored.

    +
  6. +
+

+

With this separation of concerns, you are not limited by the amount of +compute and storage you have on your laptop to build, pull, and run +Docker images. You are also not limited by what you have access to on +your Docker Host to decide which images to run.

+

In fact, there is a powerful ecosystem of Docker images that are +available on different public Docker Hubs. You can easily find these +images, modify them, and contribute them back to the Hubs. It's easy to +store private images in the same place as well. Because of this +community and the lightweight nature of Docker, it has become +incredibly popular in recent +years +and is ubiquitous at this point.

+

There is a bit of a learning curve to Docker. For ML, there are a few +open-source packages designed to simplify this: +Cog, +BentoML, and +Truss. They are built by +different model hosting providers that are designed to work well with +their model hosting service but also just package your model and all of +its dependencies in a standard Docker container format.

+

+

These packages have two primary components: The first one is a +standard way of defining your prediction service. The second one is a +YAML file that defines the other dependencies and package versions that +will go into the Docker container running on your laptop or remotely.

+

If you want to have the advantages of using Docker for making your ML +models reproducible but do not want to go through the learning curve of +learning Docker, it's worth checking out these three libraries.

+

Performance Optimization

+
+

What about performance monitoring?

+

In this section, we focus on ways to improve the performance of your +models, but we spend less time on how exactly that performance is monitored, +which is a challenge in its own right.

+

Luckily, one of the +student projects for the 2022 cohort, +Full Stack Stable Diffusion, +took up that challenge and combined +NVIDIA's Triton Inference Server, +the Prometheus monitoring tool, +and +the Grafana analytics dashboarding tool +to monitor a robust, scalable, and observable deployment of Stable Diffusion models.

+

Check out the repo on GitHub +here +if you want to see a worked example of a fully-monitored DL-powered application.

+
+

To make model inference on your machine more efficient, we need to +discuss GPU, concurrency, model distillation, quantization, caching, +batching, sharing the GPU, and libraries that automate these tasks for +you.

+
GPU or no GPU?
+

There are some advantages to hosting your model on a GPU:

+
    +
  1. +

    It's probably the same hardware you train your model on, to begin +with. That can eliminate any lost-in-translation issues.

    +
  2. +
  3. +

    As your model gets big and your techniques get advanced, your +traffic gets large. GPUs provide high throughput to deal with +that.

    +
  4. +
+

However, GPUs introduce a lot of complexity:

+
    +
  1. +

    They are more complex to set up.

    +
  2. +
  3. +

    They are more expensive.

    +
  4. +
+

As a result, just because your model is trained on a GPU does not mean +that you need to actually host it on a GPU in order for it to work. In +the early version of your model, hosting it on a CPU should suffice. In +fact, it's possible to get high throughput from CPU inference at a low +cost by using some other techniques.

+
Concurrency
+

With concurrency, multiple copies of the model run in parallel on +different CPUs or cores on a single host machine. To do this, you need +to be careful about thread tuning. There's a great Roblox +presentation +on how they scaled BERT to serve a billion daily requests, just using +CPUs.

+
Model Distillation
+

With model distillation, once you have a large model that you've +trained, you can train a smaller model that imitates the behavior of +your larger one. This entails taking the knowledge that your larger +model learned and compressing that knowledge into a much smaller model +that you may not have trained to the same degree of performance from +scratch. There are several model distillation techniques pointed out in +this blog +post. +They can be finicky to do by yourself and are infrequently used in +practice. An exception is distilled versions of popular models (such as +DistilBERT).

+
Quantization
+

With quantization, you execute some or potentially all of the +operations in your model in a lower fidelity representation of the +numbers that you are doing the math. These representations can be 16-bit +floating point numbers or 8-bit integers. This introduces some tradeoffs +with accuracy, but it's worth making these tradeoffs because the +accuracy you lose is limited relative to the performance you gain.

+

The recommended path is to use built-in quantization methods in +PyTorch +and TensorFlow. More specifically, HuggingFace +Optimum is a good +choice if you have already been using HuggingFace's pre-trained models. +You can also run quantization-aware training, which often results in +higher accuracy.

+

+
Caching
+

With caching, you realize that for some ML models, some inputs are +more common than others. Instead of always calling the model every time +a user makes a request, let's store the common requests in a cache. +Then, let's check that cache before running an expensive operation. +Caching techniques can get fancy, but the basic way of doing this is to +use functools library in +Python.

+

+
Batching
+

With batching, you take advantage of the fact that ML models often +achieve a higher throughput when doing prediction in parallel, +especially in a GPU. To accomplish this, you need to gather predictions +until you have a batch, run those predictions, and return them to your +user. You want to tune the batch size that deals optimally with the +latency-throughput tradeoff. You also need to have a way to shortcut the +process if latency becomes too long. Batching is complicated to +implement, so you probably do not want to implement this yourself.

+
Sharing the GPU
+

Your model may not take up all of the GPU memory with your inference +batch size. Why don't you run multiple models on the same GPU? This +is a place where you want to use a model serving solution that supports +GPU sharing out of the box.

+
Libraries
+

There are offerings from TensorFlow, PyTorch, and third-party tools from +NVIDIA and Anyscale. NVIDIA's choice is probably the most powerful but +can be difficult to get started with. Starting with Anyscale's Ray +Serve may +be an easier way to get started.

+

+

Horizontal Scaling

+

If you're going to scale up to a large number of users interacting with +your model, it's not going to be enough to get the most efficiency out +of one server. At some point, you'll need to scale horizontally to have +traffic going to multiple copies of your model running on different +servers. This is called horizontal scaling. This technique involves +taking traffic that would usually go to a single machine and splits +across multiple machines.

+

Each machine has a copy of the service, and a tool called a load +balancer distributes traffic to each machine. In practice, there are two +ways to do this: with either container orchestration (e.g. +Kubernetes) or serverless (e.g. AWS Lambda).

+
Container Orchestration
+

In container orchestration, we use +Kubernetes to help manage +containerized applications (in Docker containers, for example) and run +them across machines.

+

+

Kubernetes is quite interesting, but it's probably overkilled to learn +too much about it if your only goal is to deploy machine learning +models. There are a number of frameworks that make it easiest to deploy +ML models with Kubernetes, including +Kubeflow, +Seldon, etc.

+
Serverless
+

If Kubernetes isn't the path for you (e.g. you don't want to have to +worry about infrastructure at all), serverless is another option for +deploying models. In this paradigm, app code and dependencies are +packaged into .zip files or Docker containers with a single entry point +function, which is a single function (e.g. model.predict()) that will +be run repeatedly. This package is then deployed to a service like AWS +Lambda, which almost +totally manages the infrastructure required to run the code based on the +input. Scaling to thousands of requests and across multiple machines is +taken care of by these services. In return, you pay for the compute time +that you consume.

+

Since model services tend to run discretely and not continuously (like a +web server), serverless is a great fit for machine learning deployment.

+

+

Start with serverless! It's well worth the time saved in managing +infrastructure and dealing with associated challenges. There are still +some problems you should be aware of though.

+
    +
  1. +

    First, the size of the actual deployment package that can be sent to +a serverless service tends to be limited, which makes large models +impossible to run.

    +
  2. +
  3. +

    Second, there is also a cold start problem. If there is no traffic +being sent to the service in question, the service will "wind +down" to zero compute use, at which point it takes time to start +again. This lag in starting up upon the first request to the +serverless service is known as the "cold start" time. This can +take seconds or even minutes.

    +
  4. +
  5. +

    Third, it can be hard to actually build solid software engineering +concepts, like pipelines, with serverless. Pipelines enable rapid +iteration, while serverless offerings often do not have the tools +to support rapid, automated changes to code of the kind pipelines +are designed to do.

    +
  6. +
  7. +

    Fourth, state management and deployment tooling are related +challenges here.

    +
  8. +
  9. +

    Finally, most serverless functions are CPU only and have limited +execution time. If you need GPUs for inference, serverless might +not be for you quite yet. There are, however, new offerings like +Banana and +Pipeline that are +seeking to solve this problem of serverless GPU inference!

    +
  10. +
+

Model Rollouts

+

If serving is how you turn a model into something that can respond to +requests, rollouts are how you manage and update these services. To be +able to make updates effectively, you should be able to do the +following:

+
    +
  1. +

    Roll out gradually: You may want to incrementally send traffic +to a new model rather than the entirety.

    +
  2. +
  3. +

    Roll back instantly: You may want to immediately pull back a +model that is performing poorly.

    +
  4. +
  5. +

    Split traffic between versions: You may want to test differences +between models and therefore send some traffic to each.

    +
  6. +
  7. +

    Deploy pipelines of models: Finally, you may want to have entire +pipeline flows that ensure the delivery of a model.

    +
  8. +
+

Building these capabilities is a reasonably challenging infrastructure +problem that is beyond the scope of this course. In short, managed +services are a good option for this that we'll now discuss!

+

Managed Options

+

All of the major cloud providers offer their managed service options for +model deployment. There are a number of startups offering solutions as +well, like BentoML or Banana.

+

+

The most popular managed service is AWS +Sagemaker. Working with +Sagemaker is easier if your model is already in a common format like a +Huggingface class or a SciKit-Learn model. Sagemaker has convenient +wrappers for such scenarios. Sagemaker once had a reputation for being a +difficult service to work with, but this is much less the case for the +clear-cut use case of model inference. Sagemaker, however, does have +real drawbacks around ease of use for custom models and around cost. In +fact, Sagemaker instances tend to be 50-100% more expensive than EC2.

+

2.3 - Takeaways

+

To summarize this section, remember the following:

+
    +
  1. +

    You probably don't need GPU inference, which is hard to access and +maintain. Scaling CPUs horizontally or using serverless can +compensate.

    +
  2. +
  3. +

    Serverless is probably the way to go!

    +
  4. +
  5. +

    Sagemaker is a great way to get started for the AWS user, but it can +get quite expensive.

    +
  6. +
  7. +

    Don't try to do your own GPU inference; use existing tools like +TFServing or Triton to save time.

    +
  8. +
  9. +

    Watch out for new startups focused on GPU inference.

    +
  10. +
+

3 - Move to the Edge?

+

Let's now consider the case of moving models out of web service and all +the way to the "edge", or wholly on-device. Some reasons you may need to +consider this include a lack of reliable internet access for users or +strict data security requirements.

+

If such hard and fast requirements aren't in place, you'll need to take +into account the tradeoff between accuracy and latency and how this can +affect the end-user experience. Put simply, if you have exhausted all +options to reduce model prediction time (a component of latency), +consider edge deployment.

+

+

Edge deployment adds considerable complexity, so it should be considered +carefully before being selected as an option. In edge prediction, model +weights are directly loaded on our client device after being sent via a +server (shown above), and the model is loaded and interacted with +directly on the device.

+

This approach has compelling pros and cons:

+
    +
  1. +

    Some pros to particularly call out are the latency advantages that +come without the need for a network and the ability to scale for +"free," or the simple fact that you don't need to worry about the +challenges of running a web service if all inference is done +locally.

    +
  2. +
  3. +

    Some specific cons to call out are the often limited hardware and +software resources available to run machine learning models on +edge, as well as the challenge of updating models since users +control this process more than you do as the model author.

    +
  4. +
+

3.1 - Frameworks

+

Picking the right framework to do edge deployment depends both on how +you train your model and what the target device you want to deploy it on +is.

+
    +
  • +

    TensorRT: If +you're deploying to NVIDIA, this is the choice to go with.

    +
  • +
  • +

    MLKit and +CoreML: +For phone-based deployment on either Android or iPhone, go +with MLKit for the former and CoreML for the latter.

    +
  • +
  • +

    PyTorch Mobile: For +compatibility with both iOS and Android, use PyTorch Mobile.

    +
  • +
  • +

    TFLite: A great +choice for using TensorFlow in a variety of settings, not just on +a phone or a common device.

    +
  • +
  • +

    TensorFlow JS: +The preferred framework for deploying machine learning in the +browser.

    +
  • +
  • +

    Apache TVM: A library +agnostic, target device agnostic option. This is the choice for +anyone trying to deploy to as diverse a number of settings as +possible.

    +
  • +
+

Keep paying attention to this space! There are a lot of startups like +MLIR, +OctoML, +TinyML, and +Modular that are aiming to +solve some of these problems.

+

3.2 - Efficiency

+

No software can help run edge-deployed models that are simply too large; +model efficiency is important for edge deployment! We previously +discussed quantization and distillation as options for model efficiency. +However, there are also network architectures specifically designed to +work better in edge settings like +MobileNets. +MobileNets replace the more expensive computations typical of server-run +models with simpler computations and achieve acceptable performance +oftentimes.

+

+

MobileNets are a great tool for model deployments and are a great case +study in model efficiency. Another similarly great case study is +DistillBERT.

+

+

3.3 - Mindsets

+

As we wrap up this lecture, keep in mind the following mindsets as you +consider edge deployment:

+
    +
  1. +

    Start with the edge requirement, not the architecture choice. +It's easy to pick a high-performing model architecture, only to +then find it impossible to run on the edge device. Avoid this +scenario at all costs! Tricks like quantization can account for up +to 10x improvement, but not much more.

    +
  2. +
  3. +

    Once you have a model that works on the edge, you can iterate +locally without too much additional re-deployment. In this case, +make sure to add metrics around the model size and edge +performance to your experiment tracking.

    +
  4. +
  5. +

    Treat tuning the model as an additional risk and test +accordingly. With the immaturity of edge deployment frameworks, +it's crucial to be especially careful when testing your model on +the exact hardware you'll be deploying on.

    +
  6. +
  7. +

    Make sure to have fallbacks! Models are finicky and prone to +unpredictable behavior. In edge cases, it's especially important +to have easily available fallback options for models that aren't +working.

    +
  8. +
+

3.4 - Conclusion

+

To summarize this section:

+
    +
  1. +

    Web deployment is easier, so use edge deployment only if you need +to.

    +
  2. +
  3. +

    Choose your framework to match the available hardware and +corresponding mobile frameworks, or try Apache TVM to be more +flexible.

    +
  4. +
  5. +

    Start considering hardware constraints at the beginning of the +project and choose architectures accordingly.

    +
  6. +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lecture-5-deployment/media/image1.png b/course/2022/lecture-5-deployment/media/image1.png new file mode 100644 index 00000000..4a1dab01 Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image1.png differ diff --git a/course/2022/lecture-5-deployment/media/image10.png b/course/2022/lecture-5-deployment/media/image10.png new file mode 100644 index 00000000..c6b2c222 Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image10.png differ diff --git a/course/2022/lecture-5-deployment/media/image11.png b/course/2022/lecture-5-deployment/media/image11.png new file mode 100644 index 00000000..efb14f0a Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image11.png differ diff --git a/course/2022/lecture-5-deployment/media/image12.png b/course/2022/lecture-5-deployment/media/image12.png new file mode 100644 index 00000000..2d091793 Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image12.png differ diff --git a/course/2022/lecture-5-deployment/media/image13.png b/course/2022/lecture-5-deployment/media/image13.png new file mode 100644 index 00000000..1f0d7efd Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image13.png differ diff --git a/course/2022/lecture-5-deployment/media/image14.png b/course/2022/lecture-5-deployment/media/image14.png new file mode 100644 index 00000000..bb1187d0 Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image14.png differ diff --git a/course/2022/lecture-5-deployment/media/image15.png b/course/2022/lecture-5-deployment/media/image15.png new file mode 100644 index 00000000..df17c478 Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image15.png differ diff --git a/course/2022/lecture-5-deployment/media/image16.png b/course/2022/lecture-5-deployment/media/image16.png new file mode 100644 index 00000000..673346fb Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image16.png differ diff --git a/course/2022/lecture-5-deployment/media/image17.png b/course/2022/lecture-5-deployment/media/image17.png new file mode 100644 index 00000000..fcf69281 Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image17.png differ diff --git a/course/2022/lecture-5-deployment/media/image18.png b/course/2022/lecture-5-deployment/media/image18.png new file mode 100644 index 00000000..519ce8c6 Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image18.png differ diff --git a/course/2022/lecture-5-deployment/media/image19.png b/course/2022/lecture-5-deployment/media/image19.png new file mode 100644 index 00000000..2836f80e Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image19.png differ diff --git a/course/2022/lecture-5-deployment/media/image2.png b/course/2022/lecture-5-deployment/media/image2.png new file mode 100644 index 00000000..a2c60d08 Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image2.png differ diff --git a/course/2022/lecture-5-deployment/media/image20.png b/course/2022/lecture-5-deployment/media/image20.png new file mode 100644 index 00000000..a48b882d Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image20.png differ diff --git a/course/2022/lecture-5-deployment/media/image21.png b/course/2022/lecture-5-deployment/media/image21.png new file mode 100644 index 00000000..8e931c5b Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image21.png differ diff --git a/course/2022/lecture-5-deployment/media/image3.png b/course/2022/lecture-5-deployment/media/image3.png new file mode 100644 index 00000000..4ad9a7d6 Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image3.png differ diff --git a/course/2022/lecture-5-deployment/media/image4.png b/course/2022/lecture-5-deployment/media/image4.png new file mode 100644 index 00000000..e5b38dd0 Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image4.png differ diff --git a/course/2022/lecture-5-deployment/media/image5.png b/course/2022/lecture-5-deployment/media/image5.png new file mode 100644 index 00000000..2b99d04b Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image5.png differ diff --git a/course/2022/lecture-5-deployment/media/image6.png b/course/2022/lecture-5-deployment/media/image6.png new file mode 100644 index 00000000..ad2a96a1 Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image6.png differ diff --git a/course/2022/lecture-5-deployment/media/image7.png b/course/2022/lecture-5-deployment/media/image7.png new file mode 100644 index 00000000..e6ab04b5 Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image7.png differ diff --git a/course/2022/lecture-5-deployment/media/image8.png b/course/2022/lecture-5-deployment/media/image8.png new file mode 100644 index 00000000..63c1383b Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image8.png differ diff --git a/course/2022/lecture-5-deployment/media/image9.png b/course/2022/lecture-5-deployment/media/image9.png new file mode 100644 index 00000000..90037a9a Binary files /dev/null and b/course/2022/lecture-5-deployment/media/image9.png differ diff --git a/course/2022/lecture-6-continual-learning/index.html b/course/2022/lecture-6-continual-learning/index.html new file mode 100644 index 00000000..1c3b569e --- /dev/null +++ b/course/2022/lecture-6-continual-learning/index.html @@ -0,0 +1,3474 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 6: Continual Learning - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Lecture 6: Continual Learning

+
+ +
+ +

Lecture by Josh Tobin. +Notes by James Le and Vishnu Rachakonda.
+Published September 12, 2022. +Download slides.

+

1 - Overview

+

The core justification for continual learning is that, unlike in +academia, we never deal with static data distributions in the real +world. The implication is that: if you want to use ML in production +and build ML-powered products, you need to think about your goal of +building a continual learning system, not just a static model.

+

Recalling the data flywheel that we've described in this class before: +as you get more users, those users bring more data. You can use the data +to make a better model. A better model helps you attract even more users +and build a better model over time. Andrej Karpathy described the most +optimistic version of it as "Operation +Vacation" - +if we make our continual learning system good enough, it'll get better +on its own over time, and ML engineers can just go on vacation.

+

+

The reality is quite different. Initially, we gather, clean, and label +some data. We train a model on that data. Then we evaluate the model and +loop back to training the model to improve it based on our evaluations. +Finally, we get a minimum viable model and deploy it into production.

+

+

The problem begins after we deploy the model: we generally don't have a +great way of measuring how our models are actually performing in +production. Often, we just spot-check some predictions to see if they +are doing what they are supposed to do. If it seems to work, then it's +great. We move on to work on other things.

+

+

Unfortunately, the ML engineer is probably not the one who discovers the +problems, to begin with. Some business user or product manager gets +complaints from users about a dipping metric, which leads to an +investigation. This already costs the company money because the product +and business teams must investigate the problem.

+

+

Eventually, they point back to the ML engineer and the model he is +responsible for. At this point, we are stuck on doing ad-hoc analyses +because we don't know what caused the model failure. Eventually, we can +run a bunch of SQL queries and paste together some Jupyter notebooks to +figure out what the problem is. If we are lucky, we can run an A/B test. +If the test looks good, we'll deploy it into production. Then, we are +back to where we started - not getting ongoing feedback about how the +model is doing in production.

+

The upshot is that continual learning is the least well-understood +part of the production ML lifecycle. Very few companies are doing this +in production today. This lecture focuses on how to improve different +steps of the continual learning process, pointers to learn about each +step, and recommendations for doing it pragmatically and adopting it +gradually.

+

2 - How to Think About Continual Learning

+

Our opinionated view about continual learning is training a sequence +of models that can adapt to a continuous stream of data that comes into +production. You can think about continual learning as an outer loop in +your training process. On one end of the loop is your application, which +consists of a model and some other code that users interact with that +application by submitting requests, getting predictions back, and +submitting feedback about how well the model did at providing that +prediction.

+

The continual learning loop starts with logging, which is how we get +all the data into the loop. Then we have data curation, triggers +for the retraining process, dataset formation to pick the data to +retrain on, the training process itself, and offline testing to +validate whether the retrained model is good enough to go into +production. After the model is deployed, we have online testing, and +that brings the next version of the model into production, where we can +start the loop all over.

+

Each of these stages passes the output to the next step. Output is +defined by a set of rules. These rules combine to form our retraining +strategy. Let's discuss what the retraining strategy looks like for +each stage:

+

+

At the logging stage, the key question answered by the retraining +strategy is what data should we store? At the end of this stage, we +have an "infinite stream" of potentially unlabeled data coming from +production and can be used for downstream analysis.

+

+

At the curation stage, the key rules we need to define are what +data from that infinite stream will we prioritize for labeling and +potential retraining? At the end of this stage, we have a reservoir of +candidate training points that have labels and are fully ready to be fed +back into a training process.

+

+

At the retraining trigger stage, the key question is when should +we retrain? The output of this stage is a signal to kick off a +retraining job.

+

+

At the dataset formation stage, the key rules we need to define are +from this entire reservoir of data, what specific subset of that data +are we using to train on for this particular training job? The output +of this stage is a view into that reservoir or training data that +specifies the exact data points to be used for the training job.

+

+

At the offline testing stage, the key rule we need to define is +what "good enough" looks like for all stakeholders. The output of +this stage is equivalent to a "pull request" report card for your model +with a clear sign-off process. Once you are signed off, the new model +will roll out into production.

+

+

Finally, at the deployment and online testing stage, the key rule to +define is how do we know if this deployment was successful? The +output of this stage is a signal to roll this model out fully to all of +your users.

+

In an idealized world, from an ML engineer's perspective, once the model +is deployed, the first version of the model is to not retrain the model +directly. Instead, we want the model to sit on top of the retraining +strategy and try to improve that strategy over time. Rather than +training models daily, we look at metrics about how well the strategy is +working and how well it's solving the task of improving our model over +time in response to changes in the world. The input that we provide is +by tuning the strategy to do a better job of solving that task.

+

For most ML engineers, our jobs don't feel like that at a high level. +Our retraining strategy is just retraining models whenever we feel +like it. We can get good results from ad-hoc retraining, but when you +start getting consistent results and no one is actively working on the +model day to day anymore, then it's worth starting to add some +automation. Alternatively, if you find yourself needing to retrain the +model more than once a week (or even more frequently than that) to deal +with changing results in the real world, then it's worth investing in +automation just to save yourself.

+

3 - Periodic Retraining

+

The first baseline retraining strategy that you should consider after +you move on from ad-hoc is just periodic retraining:

+
    +
  1. +

    At the logging stage, we simply log everything.

    +
  2. +
  3. +

    At the curation stage, we sample uniformly at random from the data +that we've logged up until we get the maximum number of data +points that we are able to handle. Then we label them using some +automated tools.

    +
  4. +
  5. +

    Our retraining trigger will just be periodic.

    +
  6. +
  7. +

    We train once a week, but we do it on the last month's data, for +example.

    +
  8. +
  9. +

    Then we compute the test set accuracy after each training, set a +threshold on that, or more likely manual review the results each +time, and spot-check some of the predictions.

    +
  10. +
  11. +

    When we deploy the model, we do spot evaluations of that deployed +model on a few individual predictions to make sure things look +healthy.

    +
  12. +
+

+

Periodic retraining won't work in every circumstance. There are several +failure modes:

+
    +
  1. +

    The first category is that you have more data than you can log or +label. If you have a high volume of data, you might need to be +more careful about what data to sample and enrich, particularly if +that data comes from a long-tail distribution - where you have +edge cases that your model needs to perform well on, but those +edge cases might not be caught by just doing standard uniform +sampling. Or if that data is expensive to label like in a +human-in-the-loop scenario - where you need custom labeling +rules or labeling is a part of the product. In either of those +cases, you need to be more careful about what subset of your data +you log and enrich to be used down the road.

    +
  2. +
  3. +

    The second category has to do with managing the cost of +retraining. If your model is expensive to retrain, retraining it +periodically is not going to be the most cost-efficient way to go, +especially if you do it on a rolling window of data every single +time. You will leave a lot of performance on the table by not +retraining more frequently. You can partially solve this by +increasing the retraining frequency, but this will increase the +costs even further.

    +
  4. +
  5. +

    The final failure mode is situations where you have a high cost of +bad predictions. Every time you retrain your model, it +introduces risk, which comes from the fact that the data you're +training the model on might be bad in some way. It might be +corrupted, might have been attacked by an adversary, or might not +be representative anymore of all the cases that your model needs +to perform well on. The more frequently you retrain and the more +sensitive you are to model failures, the more thoughtful you need +to be about careful model evaluation such that you are not unduly +taking on too much risk from frequent retraining.

    +
  6. +
+

4 - Iterating On Your Retraining Strategy

+

The main takeaway from this section is that we will use monitoring and +observability to determine what changes we want to make to our +retraining strategy.

+
    +
  1. +

    We'll do that by monitoring just the metrics that actually that +matter and using all other metrics for debugging.

    +
  2. +
  3. +

    When we debug an issue with our model, that will lead to potentially +retraining our model. But more broadly than that, we can think of +it as a change to the retraining strategy - changing our +retraining triggers, our offline tests, our sampling strategies, +the metrics for observability, etc.

    +
  4. +
  5. +

    As we get more confident in our monitoring, we can introduce more +automation to our system.

    +
  6. +
+

There are no real standards or best practices on model monitoring yet. +The main principles we'll follow are: (1) We'll focus on monitoring what +matters and what breaks empirically; and (2) We'll compute other signals +too but use them for observability and debugging.

+

+

What does it mean to monitor a model in production? We think about it +as: You have some metric to assess the model quality (i.e, accuracy) and +a time series of how that metric changes over time. The question you try +to answer is: Is this bad or okay? Do you need to pay attention to +this degradation or not?

+

The questions we'll need to answer are:

+
    +
  1. +

    What metrics should we be looking at when we are monitoring?

    +
  2. +
  3. +

    How can we tell if those metrics are bad and warrant an +intervention?

    +
  4. +
  5. +

    What are the tools that help us with this process?

    +
  6. +
+

What Metrics to Monitor

+

Choosing the right metric to monitor is probably the most important part +of this process. Below you can find different types of metrics ranked in +order of how valuable they are.

+

+

Outcomes and Feedback From Users

+

The most valuable one to look at is outcome data or feedback from your +users. Unfortunately, there are no one-size-fits-all ways to do this +because it depends a lot on the specifics of the product you are +building. This is more of a product management question of how to design +your product in a way that you can capture feedback from your users as +part of the product experience.

+

Model Performance Metrics

+

The next most valuable signal to look at is model performance +metrics. These are offline metrics such as accuracy. This is less +useful than user feedback because of loss mismatch. A common experience +many ML practitioners have is that improving model performance leads to +the same or worse outcome. There's very little excuse for not doing +this. To some degree, you can label some production data each day by +setting up an on-call rotation or throwing a labeling party. These +practices will give you some sense of how your model performance trends +over time.

+

+

Proxy Metrics

+

The next best thing to look at is proxy metrics, which are +correlated with bad model performance. These are mostly domain-specific. +For example, if you are building text generation with a language model, +two examples would be repetitive and toxic outputs. If you are building +a recommendation system, an example would be the share of personalized +responses. Edge cases can be good proxy metrics. If there are +certain problems you know that you have with your model, if those +increase in prevalence, that might mean your model is not doing very +well.

+

There's an academic direction that aims at being able to take any metric +you care about and approximate it on previously unseen data. How well do +we think our model is doing on this new data? Which would make these +proxy metrics a lot more practically useful? There are a number of +different approaches here: from training an auxiliary model to predict +how well your main model might do on this offline data, to using +heuristics and human-in-the-loop methods.

+

+

An unfortunate result from this literature is that it's not possible to +have a single method you use in all circumstances to approximate how +your model is doing on out-of-distribution data. Let's say you are +looking at the input data to predict how the model will perform on those +input points. Then the label distribution changes. As a result, you +won't be able to take into account that change in your approximate +metric.

+

Data Quality

+

The next signal to look at is data quality. Data quality +testing is a set +of rules you apply to measure the quality of your data. This deals with +questions such as: How well does a piece of information reflect reality? +Does it fulfill your expectations of what's comprehensive? Is your +information available when you need it? Some common examples include +checking whether the data has the right schema, the data is in the +expected range, and the number of records is not anomalous.

+

+

This is useful because data problems tend to be the most common issue +with ML models in practice. In a Google +report +which covered 15 years of different pipeline outages with a particular +ML model, most of the outages that happened with that model were +distributed systems problems, commonly data problems.

+

Distribution Drift

+
Why Measure Distribution Drift?
+

Your model's performance is only guaranteed on data sampled from the +same distribution as it was trained on. This can have a huge impact in +practice. A recent example includes changes in model behavior during the +pandemic. A bug in the retraining pipeline caused the recommendations +not to be updated for new users, leading to millions of dollars in +revenue lost.

+
Types of Distribution Drift
+

Distribution drift manifests itself in different ways in the wild:

+
    +
  1. +

    Instantaneous drift happens when a model is deployed in a new +domain, a bug is introduced in the pre-processing pipeline, or a +big external shift like COVID occurs.

    +
  2. +
  3. +

    Gradual drift happens when users\' preferences change or new +concepts get introduced to the corpus over time.

    +
  4. +
  5. +

    Periodic drift happens when users' preferences are seasonal or +people in different time zones use your model differently.

    +
  6. +
  7. +

    Temporary drift happens when a malicious user attacks your +model, a new user tries your product and churns, or someone uses +your product in an unintended way.

    +
  8. +
+
How to Measure It?
+

How to tell if your distribution is drifted?

+
    +
  1. +

    Your first select a window of "good" data to serve as a +reference. To select that reference, you can use a fixed window +of production data you believe to be healthy. Some +papers advocate +for using a sliding window of production data. In practice, most +of the time you probably should use your validation data as the +reference.

    +
  2. +
  3. +

    Once you have that reference data, you select a new window of +production data to measure your distribution distance on. This +is not a super principled approach and tends to be +problem-dependent. A pragmatic solution is to pick one or several +window sizes with a reasonable amount of data and slide them.

    +
  4. +
  5. +

    Finally, once you have your reference window and production window, +you compare the windows using a distribution distance metric.

    +
  6. +
+
What Metrics To Use?
+

Let's start by considering the one-dimensional case, where you have a +particular feature that is one-dimensional and can compute a density of +that feature on your reference/production windows. You want some metric +that approximates the distance between these two distributions.

+

+

There are a few options here:

+
    +
  1. +

    The commonly recommended ones are the KL divergence and the KS test. +But they are actually bad choices.

    +
  2. +
  3. +

    Sometimes-better options would be (1) infinity norm or 1-norm of the +diff between probabilities for each category, and (2) +Earth-mover's distance (a more statistically principled approach).

    +
  4. +
+

Check out this Gantry blog +post +to learn more about why the commonly recommended metrics are not so good +and the other ones are better.

+
Dealing with High-Dimensional Data
+

In the real world for most models, we have potentially many input +features or even unstructured data that is very high-dimensional. How do +we deal with detecting distribution drift in those cases?

+
    +
  1. +

    You can measure drift on all of the features independently: If +you have a lot of features, you will hit the multiple hypothesis +testing +problem. +Furthermore, this doesn't capture cross-correlation.

    +
  2. +
  3. +

    You can measure drift on only the important features: Generally +speaking, it's a lot more useful to measure drift on the outputs +of the model than the inputs. You can also rank the importance +of your input +features +and measure drift on the most important ones.

    +
  4. +
  5. +

    You can look at metrics that natively compute or approximate the +distribution distance between high-dimensional distributions: +The two that are worth checking out are maximum mean +discrepancy +and approximate Earth-mover's +distance. The +caveat here is that they are pretty hard to interpret.

    +
  6. +
+

+

A more principled way to measure distribution drift for high-dimensional +inputs to the model is to use projections. The idea of a projection +is that:

+
    +
  1. +

    You first take some high-dimensional input to the model and run that +through a function.

    +
  2. +
  3. +

    Each data point your model makes a prediction on gets tagged by this +projection function. The goal of this projection function is to +reduce the dimensionality of that input.

    +
  4. +
  5. +

    Once you've reduced the dimensionality, you can do drift detection +on that lower-dimensional representation of the high-dimensional +data.

    +
  6. +
+

This approach works for any kind of data, no matter what the +dimensionality is or what the data type is. It's also highly flexible. +There are different types of projections that can be useful: +analytical projections (e.g., mean pixel value, length of sentence, +or any other function), random projections (e.g., linear), and +statistical projections (e.g., autoencoder or other density models, +T-SNE).

+
Cons of Looking at Distribution Drift
+

+

Models are designed to be robust to some degree of distribution +drift. The figure on the left above shows a toy example to demonstrate +this point. We have a classifier that's trained to predict two classes. +We've induced a synthetic distribution shift to shift the red points on +the top left to bottom. These two distributions are extremely different, +but the model performs equally well on the training data and the +production data. In other words, knowing the distribution shift doesn't +tell you how the model has reacted to that shift.

+

The figure on the right is a research project that used data generated +from a physics simulator to solve problems on real-world robots. The +training data was highly out of distribution (low-fidelity, random +images). However, by training on this set of training data, the model +was able to generalize to real-world scenarios on the test data.

+

Beyond the theoretical limitations of measuring distribution drift, this +is just hard to do in practice. You have to window size correctly. You +have to keep all this data around. You need to choose metrics. You need +to define projections to make your data lower-dimensional.

+

System Metrics

+

The last thing to consider looking at is your standard system +metrics such as CPU utilization, GPU memory usage, etc. These don't +tell you anything about how your model is actually performing, but they +can tell you when something is going wrong.

+

Practical Recommendations

+

We also want to look at how hard it is to compute the aforementioned +stages in practice. As seen below, the Y-axis shows the value of +each signal and the X-axis shows the feasibility of measuring each +signal.

+
    +
  1. +

    Measuring outcomes or feedback has pretty wide variability in terms +of how feasible it is to do, as it depends on how your product is +set up.

    +
  2. +
  3. +

    Measuring model performance tends to be the least feasible thing to +do because it involves collecting some labels.

    +
  4. +
  5. +

    Proxy metrics are easier to compute because they don't involve +labels.

    +
  6. +
  7. +

    System metrics and data quality metrics are highly feasible because +you have off-the-shelf tools for them.

    +
  8. +
+

+

Here are our practical recommendations:

+
    +
  1. +

    Basic data quality checks are zero-regret, especially if you are +retraining your model.

    +
  2. +
  3. +

    Get some way to measure feedback, model performance, or proxy +metrics, even if it's hacky or not scalable.

    +
  4. +
  5. +

    If your model produces low-dimensional outputs, monitoring those for +distribution shifts is also a good idea.

    +
  6. +
  7. +

    As you evolve your system, practice the observability mindset.

    +
  8. +
+

While you can think of monitoring as measuring the known unknowns (e.g., +setting alerts on a few key metrics), observability is measuring +unknown +unknowns +(e.g., having the power to ask arbitrary questions about your system +when it breaks). An observability mindset means two implications:

+
    +
  1. +

    You should keep around the context or raw data that makes up the +metrics that you are computing since you want to be able to drill +all the way down to potentially the data points themselves that +make up the degraded metric.

    +
  2. +
  3. +

    You can go crazy with measurement by defining a lot of different +metrics. You shouldn't necessarily set alerts on each of those +since you don't want too many alerts. Drift is a great example +since it is useful for debugging but less so for monitoring.

    +
  4. +
+

Finally, it's important to go beyond aggregate metrics. If your +model is 99% accurate in aggregate but only 50% accurate for your most +important user, is it still "good"? The way to deal with this is by +flagging important subgroups or cohorts of data and alerting on +important metrics across them. Some examples are categories you don't +want to be biased against, "important" categories of users, and +categories you might expect to perform differently on (languages, +regions, etc.).

+

How To Tell If Those Metrics are "Bad"

+

We don't recommend statistical tests (e.g., KS-Test) because they try to +return a p-value for the likelihood that the data distributions are not +the same. When you have a lot of data, you will get very small p-values +for small shifts. This is not what we actually care about since models +are robust to a small number of distribution shifts.

+

Better options than statistical tests include fixed rules, specific +ranges, predicted ranges, and unsupervised detection of new patterns. +This article on dynamic data +testing +has the details.

+

+

Tools for Monitoring

+

The first category is system monitoring tools, a premature category +with different companies in it +(Datadog, +Honeycomb, New +Relic, Amazon +CloudWatch, etc.). +They help you detect problems with any software system, not just ML +models. They provide functionality for setting alarms when things go +wrong. Most cloud providers have decent monitoring solutions, but if you +want something better, you can look at monitoring-specific tools to +monitor anything.

+

This raises the question of whether we should just use these system +monitoring tools to monitor ML metrics as well. This blog +post +explains that it's feasible but highly painful due to many technical +reasons. Thus, it's better to use ML-specific tools.

+

Two popular open-source monitoring tools are +EvidentlyAI and +whylogs.

+
    +
  • +

    Both are similar in that you provide them with samples of data and +they produce a nice report that tells you where their distribution +shifts are.

    +
  • +
  • +

    The big limitation of both is that they don't solve the data +infrastructure and the scale problem. You still need to be able to +get all that data into a place where you can analyze it with these +tools.

    +
  • +
  • +

    The main difference between them is that whylogs is more focused on +gathering data from the edge by aggregating the data into +statistical profiles at inference time. You don't need to +transport all the data from your inference devices back to your +cloud.

    +
  • +
+

+

Lastly, there are a bunch of different SaaS vendors for ML monitoring +and observability: Gantry, +Aporia, +Superwise, +Arize, +Fiddler, +Arthur, etc.

+

5 - Retraining Strategy

+

We’ve talked about monitoring and observability, which allow you to identify issues with your continual learning system. Now, we’ll talk about how we will fix the various stages of the continual learning process based on what we learn from monitoring and observability.

+

Logging

+

The first stage of the continual learning loop is logging. As a reminder, the goal of logging is to get data from your model to a place where you can analyze it. The key question to answer here is: “what data should I actually log?

+

For most of us, the best answer is just to log all of the data. Storage is cheap. It's better to have data than not to have it. There are, however, some situations where you can't do that. For example, if you have too much traffic going through your model to the point where it's too expensive to log all of it, or if you have data privacy concerns, or if you're running your model at the edge, you simply may not be able to log all your data.

+

In these situations, there are two approaches that you can take. The first approach is profiling. With profiling, rather than sending all the data back to your cloud and then using that to monitor, you instead compute statistical profiles of your data on the edge that describe the data distribution that you're seeing. This is great from a data security perspective because it doesn't require you to send all the data back home. It minimizes your storage cost. Finally, you don't miss things that happen in the tails, which is an issue for the next approach. That'll describe the place to use. This approach is best used for security-critical applications. Computing statistical profiles is a pretty interesting topic in computer science and data summarization that is worth checking out if you’re interested in this approach.

+

alt_text

+

The other approach is sampling. With sampling, you'll just take certain data points and send those back to your monitoring and logging system. The advantage of sampling is that it has minimal impact on your inference resources. You don't have to actually spend the computational budget to compute profiles. You also get to have access to the raw data for debugging and retraining, albeit a smaller amount. This is the approach we recommend for any other kind of application.

+

Curation

+

The next step in the continual learning loop is curation. The goal of curation is to take the infinite stream of production data, which is potentially unlabeled, and turn it into a finite reservoir of enriched data suitable for training. Here, we must answer, “what data should be enriched?

+

You could sample and enrich data randomly, but that may not prove helpful to your model. Importantly, you miss rare classes or events. A better approach can be to perform stratified subsampling, wherein you sample specific proportions of individuals from various subpopulations (e.g. race). The most advanced strategy for picking data to enrich is to curate data points that are somehow interesting for the purpose of improving your model.

+

There are a few different ways of doing this: user-driven curation loops via feedback loops, manual curation via error analysis, and automatic curation via active learning.

+

User-driven curation is a great approach that is easy to implement, assuming you have a clear way of gathering user feedback. If your user churns, clicks thumbs down, or performs some other similar activity on the model’s output, you have an easy way of understanding data that could be enriched for future training jobs.

+

alt_text

+

If you don't have user feedback, or if you need even more ways of gathering interesting data from your system, the second most effective way is by doing manual error analysis. In this approach, we look at the errors that our model is making, reason about the different types of failure modes that we're seeing, and try to write functions or rules that help capture these error modes. We'll use those functions to gather more data that might represent those error cases. Some examples of these function-based approaches are similarity-based curation, which uses nearest neighbors, and projection-based curation, wherein we train a new function or model to recognize key data points.

+

The last way to curate data is to do so automatically using a class of algorithms called active learning. The way active learning works is that, given a large amount of unlabeled data, we will try to determine which data points would improve model performance the most (if you were to label those data points next and train on them). These algorithms define a sampling strategy, rank all of your unlabeled examples using a scoring function that defines the sampling strategy, and mark the data points with the highest scores for future labeling.

+

There are a number of different scoring function approaches that are shown below.

+
    +
  1. Most uncertain: sample low-confidence and high-entropy predictions or predictions that an ensemble disagrees on.
  2. +
  3. Highest predicted loss: train a separate model that predicts loss on unlabeled points, then sample the highest predicted loss.
  4. +
  5. Most different from labels: train a model to distinguish labeled and unlabeled data, then sample the easiest to distinguish.
  6. +
  7. Most representative: choose points such that no data is too far away from anything we sampled.
  8. +
  9. Big impact on training: choose points such that the expected gradient is large or points where the model changes its mind the most about its prediction during training.
  10. +
+

Uncertainty scoring tends to be the most commonly used method since it is simple and easy to implement.

+

You might have noticed that there's a lot of similarity between some of the ways that we do data curation and the way that we do monitoring. That's no coincidence--monitoring and data curation are two sides of the same coin! They're both interested in solving the problem of finding data points where the model may not be performing well or where we're uncertain about how the model is performing on those data points.

+

alt_text

+

Some examples of people practically applying data curation are OpenAI’s DALL-E 2, which uses active learning and manual curation, Tesla, which uses feedback loops and manual curation, and Cruise, which uses feedback loops.

+

Some tools that help with data curation are Scale Nucleus, Aquarium, and Gantry.

+

To summarize then, here are our final set of recommendations for applying data curation.

+
    +
  1. Random sampling is a fine starting point. If you want to avoid bias or have rare classes, do stratified sampling instead.
  2. +
  3. If you have a feedback loop, then user-driven curation is a no-brainer. If not, confidence-based active learning is easy to implement.
  4. +
  5. As your model performance increases, you’ll have to look harder for challenging training points. Manual techniques are unavoidable and should be embraced. Know your data!
  6. +
+

Retraining Triggers

+

After we've curated our infinite stream of unlabeled data down to a reservoir of labeled data that's ready to potentially train on, the next thing that we'll need to decide is “what trigger are we gonna use to retrain?

+

The main takeaway here is that moving to automated retraining is not always necessary. In many cases, just manually retraining is good enough. It can save you time and lead to better model performance. It's worth understanding when it makes sense to actually make the harder move to automated retraining.

+

The main prerequisite for moving to automated retraining is being able to reproduce model performance when retraining in a fairly automated fashion. If you're able to do that and you are not really working on the model actively, it's probably worth implementing some automated retraining. As a rule of thumb, if you’re retraining the model more than once a month, automated retraining may make sense.

+

When it's time to move to automated training, the main recommendation is to just keep it simple and retrain periodically, e.g. once a week. The main question though is, how do you pick the right training schedule? The recommendation here is to:

+
    +
  1. Apply measurement to figure out a reasonable retraining schedule.
  2. +
  3. Plot your model performance and degradation over time.
  4. +
  5. Compare how retraining the model at various intervals would have resulted in improvements to its performance.
  6. +
+

As seen below, the area between the curves represents the opportunity cost, so always remember to balance the upside of retraining with the operational costs of retraining.

+

alt_text

+

This is a great area for future academic research! More specifically, we can look at ways to automate determining the optimal retraining strategy based on performance decay, sensitivity to performance, operational costs, and retraining costs.

+

An additional option for retraining, rather than time-based intervals, is performance triggers (e.g. retrain when the model accuracy dips below 90%). This helps react more quickly to unexpected changes and is more cost-optimal, but requires very good instrumentation to process these signals along with operational complexity.

+

An idea that probably won't be relevant but is worth thinking about is online learning. In this paradigm, you train on every single data point as it comes in. It's not very commonly used in practice.

+

A version of this idea that is used fairly frequently in practice is online adaptation. This method operates not at the level of retraining the whole model itself but rather on the level of adapting the policy that sits on top of the model. What is a policy you ask? A policy is the set of rules that takes the raw prediction that the model made, like the score or the raw output of the model, and turns it into the output the user sees. In online adaptation, we use algorithms like multi-armed bandits to tune these policies. If your data changes very frequently, it is worth looking into this method.

+

Dataset Formation

+

Imagine we've fired off a trigger to start a new training job. The next question we need to answer is, among all of the labeled data in our reservoir of data, what specific data points should we train on for this particular new training job?

+

We have four options here. Most of the time in deep learning, we'll just use the first option and train on all the data that we have available to us. Remember to keep your data version controlled and your curation rules consistent.

+

alt_text

+

If you have too much data to do that, you can use recency as a heuristic for a second option and train on only a sliding window of the most recent data (if recency is important) or sample a smaller portion (if recency isn’t). In the latter case, compare the aggregate statistics between the old and new windows to ensure there aren’t any bugs. It’s also important in both cases to compare the old and new datasets as they may not be related in straightforward ways.

+

alt_text

+

A useful third option is online batch selection, which can be used when recency doesn’t quite matter. In this method, we leverage label-aware selection functions to choose which items in mini-batches to train on.

+

alt_text

+

A more difficult fourth option that isn’t quite recommended is continual fine-tuning. Rather than retraining from scratch every single time, you train your existing model on just new data. The reason why you might wanna do this primarily is because it's much more cost-effective. The paper below shares some findings from GrubHub, where they found a 45x cost improvement by doing this technique relative to sliding windows.

+

alt_text

+

The big challenge here is that unless you're very careful, it's easy for the model to forget what it learned in the past. The upshot is that you need to have mature evaluation practices to be very careful that your model is performing well on all the types of data that it needs to perform well on.

+

Offline Testing

+

After the previous steps, we now have a new candidate model that we think is ready to go into production. The next step is to test that model. The goal of this stage is to produce a report that our team can sign off on that answers the question of whether this new model is good enough or whether it's better than the old model. The key question here is, “what should go into that report?

+

This is a place where there's not a whole lot of standardization, but the recommendation we have here is to compare your current model with the previous version of the model on all of the metrics that you care about, all of the subsets of data that you've flagged are important, and all the edge cases you’ve defined. Remember to adjust the comparison to account for any sampling bias.

+

Below is a sample comparison report. Note how the validation set is broken out into concrete subgroups. Note also how there are specific validation sets assigned to common error cases.

+

alt_text

+

In continual learning, evaluation sets are dynamically refined just as much as training sets are. Here are some guidelines for how to manage evaluation sets in a continual learning system:

+
    +
  1. As you curate new data, add some of it to your evaluation sets. For example, if you change how you do sampling, add that newly sampled data to your evaluation set. Or if you encounter a new edge case, create a test case for it.
  2. +
  3. Corollary 1: you should version control your evaluation sets as well.
  4. +
  5. Corollary 2: if your data changes quickly, always hold out the most recent data for evaluation.
  6. +
+

Once you have the testing basics in place, a more advanced option that you can look into here is expectation testing. Expectation tests work by taking pairs of examples where you know the relationship between the two. These tests help a lot with understanding the generalizability of models.

+

alt_text

+

Just like how data curation is highly analogous to monitoring, so is offline testing. We want to observe our metrics, not just in aggregate but also across all of our important subsets of data and across all of our edge cases. One difference between these two is that you will have different metrics available in offline testing and online testing. For example, you’re much more likely to have labels offline. Online, you’re much more likely to have feedback. We look forward to more research that can predict online metrics from offline ones.

+

Online Testing

+

Much of this we covered in the last lecture, so we’ll keep it brief! Use shadow mode and A/B tests, roll out models gradually, and roll back models if you see issues during rollout.

+

6 - The Continual Improvement Workflow

+

To tie it all together, we’ll conclude with an example. Monitoring and continual learning are two sides of the same coin. We should be using the signals that we monitor to very directly change our retraining strategy. This section describes the future state that comes as a result of investing in the steps laid out previously.

+

Start with a place to store and version your strategy. The components of your continual learning strategy should include the following:

+
    +
  • Inputs, predictions, user feedback, and labels.
  • +
  • Metric definitions for monitoring, observability, and offline testing.
  • +
  • Projection definitions for monitoring and manual data curation.
  • +
  • Subgroups and cohorts of interest for monitoring and offline testing.
  • +
  • Data curation logic.
  • +
  • Datasets for training and evaluation.
  • +
  • Model comparison reports.
  • +
+

Walk through this example to understand how changes to the retraining strategy occur as issues surface in our machine learning system.

+

alt_text

+

7 - Takeaways

+

To summarize, continual learning is a nascent, poorly understood topic that is worth continuing to pay attention to. Watch this space! In this lecture, we focused on all the steps and techniques that allow you to use retraining effectively. As MLEs, leverage monitoring to strategically improve your model. Always start simple, and get better!

+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lecture-6-continual-learning/media/image1.png b/course/2022/lecture-6-continual-learning/media/image1.png new file mode 100644 index 00000000..20cfa905 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image1.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image10.png b/course/2022/lecture-6-continual-learning/media/image10.png new file mode 100644 index 00000000..b06421e3 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image10.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image11.png b/course/2022/lecture-6-continual-learning/media/image11.png new file mode 100644 index 00000000..43c23aa3 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image11.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image12.png b/course/2022/lecture-6-continual-learning/media/image12.png new file mode 100644 index 00000000..be18dd18 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image12.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image13.png b/course/2022/lecture-6-continual-learning/media/image13.png new file mode 100644 index 00000000..0016972e Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image13.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image14.png b/course/2022/lecture-6-continual-learning/media/image14.png new file mode 100644 index 00000000..9de7cbfe Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image14.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image15.png b/course/2022/lecture-6-continual-learning/media/image15.png new file mode 100644 index 00000000..9e2be1ac Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image15.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image16.png b/course/2022/lecture-6-continual-learning/media/image16.png new file mode 100644 index 00000000..ba4fa618 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image16.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image17.png b/course/2022/lecture-6-continual-learning/media/image17.png new file mode 100644 index 00000000..ba1adff5 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image17.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image18.png b/course/2022/lecture-6-continual-learning/media/image18.png new file mode 100644 index 00000000..0257b922 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image18.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image19.png b/course/2022/lecture-6-continual-learning/media/image19.png new file mode 100644 index 00000000..98bc3ca6 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image19.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image2.png b/course/2022/lecture-6-continual-learning/media/image2.png new file mode 100644 index 00000000..104f3ace Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image2.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image20.png b/course/2022/lecture-6-continual-learning/media/image20.png new file mode 100644 index 00000000..68d33e6f Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image20.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image21.png b/course/2022/lecture-6-continual-learning/media/image21.png new file mode 100644 index 00000000..08d27308 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image21.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image22.png b/course/2022/lecture-6-continual-learning/media/image22.png new file mode 100644 index 00000000..1e75f535 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image22.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image23.png b/course/2022/lecture-6-continual-learning/media/image23.png new file mode 100644 index 00000000..d6c9a7e8 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image23.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image24.png b/course/2022/lecture-6-continual-learning/media/image24.png new file mode 100644 index 00000000..8a948c9e Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image24.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image25.png b/course/2022/lecture-6-continual-learning/media/image25.png new file mode 100644 index 00000000..16276375 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image25.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image26.png b/course/2022/lecture-6-continual-learning/media/image26.png new file mode 100644 index 00000000..b3f681e4 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image26.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image27.png b/course/2022/lecture-6-continual-learning/media/image27.png new file mode 100644 index 00000000..caa64e56 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image27.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image28.png b/course/2022/lecture-6-continual-learning/media/image28.png new file mode 100644 index 00000000..d7cb52c9 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image28.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image29.png b/course/2022/lecture-6-continual-learning/media/image29.png new file mode 100644 index 00000000..072410b9 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image29.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image3.png b/course/2022/lecture-6-continual-learning/media/image3.png new file mode 100644 index 00000000..88d261fa Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image3.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image30.png b/course/2022/lecture-6-continual-learning/media/image30.png new file mode 100644 index 00000000..d90e7dbf Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image30.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image31.png b/course/2022/lecture-6-continual-learning/media/image31.png new file mode 100644 index 00000000..e76767a2 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image31.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image32.png b/course/2022/lecture-6-continual-learning/media/image32.png new file mode 100644 index 00000000..7b0fdded Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image32.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image4.png b/course/2022/lecture-6-continual-learning/media/image4.png new file mode 100644 index 00000000..7aa5bbce Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image4.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image5.png b/course/2022/lecture-6-continual-learning/media/image5.png new file mode 100644 index 00000000..5482e369 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image5.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image6.png b/course/2022/lecture-6-continual-learning/media/image6.png new file mode 100644 index 00000000..f421e2fa Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image6.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image7.png b/course/2022/lecture-6-continual-learning/media/image7.png new file mode 100644 index 00000000..3cf0b500 Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image7.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image8.png b/course/2022/lecture-6-continual-learning/media/image8.png new file mode 100644 index 00000000..17c3d3bb Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image8.png differ diff --git a/course/2022/lecture-6-continual-learning/media/image9.png b/course/2022/lecture-6-continual-learning/media/image9.png new file mode 100644 index 00000000..8fd2252e Binary files /dev/null and b/course/2022/lecture-6-continual-learning/media/image9.png differ diff --git a/course/2022/lecture-7-foundation-models/index.html b/course/2022/lecture-7-foundation-models/index.html new file mode 100644 index 00000000..6febc369 --- /dev/null +++ b/course/2022/lecture-7-foundation-models/index.html @@ -0,0 +1,2641 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 7: Foundation Models - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Lecture 7: Foundation Models

+
+ +
+ +

Lecture by Sergey Karayev. +Notes by James Le and Vishnu Rachakonda.
+Published September 19, 2022. +Download slides.

+

Foundation models are very large models trained on very large datasets that +can be used for multiple downstream tasks.

+

We’ll talk about fine-tuning, Transformers, large language models, prompt engineering, other applications of large models, and vision and text-based models like CLIP and image generation.

+

alt_text

+

1 - Fine-Tuning

+

Traditional ML uses a lot of data and a large model, which takes a long time. But if you have a small amount of data, you can use transfer learning to benefit from the training on a lot of data. You basically use the same model that you have pre-trained, add a few layers, and unlock some weights.

+

We have been doing this in computer vision since 2014. Usually, you train a model on ImageNet, keep most of the layers, and replace the top three or so layers with newly learned weights. Model Zoos are full of these models like AlexNet, ResNet, etc. in both TensorFlow and PyTorch.

+

In NLP, pre-training was initially limited only to the first step: word embeddings. The input to a language model is words. One way you can encode them to be a vector (instead of a word) is one-hot encoding. Given a large matrix of words, you can make an embedding matrix and embed each word into a real-valued vector space. This new matrix is down to the dimension on the order of a thousand magnitude. Maybe those dimensions correspond to some semantic notion.

+

alt_text

+

Word2Vec trained a model like this in 2013. It looked at which words frequently co-occur together. The learning objective was to maximize cosine similarity between their embeddings. It could do cool demos of vector math on these embeddings. For example, when you embed the words “king,” “man,” and “woman,” you can do vector math to get a vector that is close to the word “queen” in this embedding space.

+

It’s useful to see more context to embed words correctly because words can play different roles in the sentence (depending on their context). If you do this, you’ll improve accuracy on all downstream tasks. In 2018, a number of models such as ELMO and ULMFit published pre-trained LSTM-based models that set state-of-the-art results on most NLP tasks.

+

But if you look at the model zoos today, you won’t see any LSTMs. You’ll only see Transformers everywhere. What are they?

+

2 - Transformers

+

Transformers come from a paper called “Attention Is All You Need” in 2017, which introduced a groundbreaking architecture that sets state-of-the-art results on translation first and a bunch of NLP tasks later.

+

alt_text

+

It has a decoder and an encoder. For simplicity, let’s take a look at the encoder. The interesting components here are self-attention, positional encoding, and layer normalization.

+

Self-Attention

+

alt_text

+

Basic self-attention follows: Given an input sequence of vectors x of size t, we will produce an output sequence of tensors of size t. Each tensor is a weighted sum of the input sequence. The weight here is just a dot product of the input vectors. All we have to do is to make that weighted vector sum to 1. We can represent it visually, as seen below. The input is a sentence in English, while the output is a translation in French.

+

alt_text

+

So far, there are no learned weights and no sequence order. Let’s learn some weights! If we look at the input vectors, we use them in three ways: as queries to compare two other input vectors, as keys to compare them to input vectors and produce the corresponding output vector, and as values to sum up all the input vectors and produce the output vector. +* We can process each input vector with three different matrices to fulfill these roles of query, key, and value. We will have three weighted matrices, and everything else remains the same. If we learn these matrices, we learn attention. +* It’s called multi-head attention *because we learn different sets of weighted matrices simultaneously, but we implement them as just a single matrix.

+

So far, we have learned the query, key, and value. Now we need to introduce some notion of order to the sequence by encoding each vector with its position. This is called positional encoding.

+

Positional Encoding

+

alt_text

+

Let’s say we have an input sequence of words

+

]* The first step is to embed the words into a dense, real-valued word embedding. This part can be learned. +* However, there is no order to that embedding. Thus, we will add another embedding that only encodes the position. +* In brief, the first embedding encodes only the content, while the second embedding encodes only the position. If you add them, you now have information about both the content and the position.

+

Layer Normalization

+

alt_text

+

Neural network layers work best when the input vectors have uniform mean and standard deviation in each dimension. As activations flow through the network, the means and standard deviations get blown out by the weight matrices. Layer normalization is a hack to re-normalize every activation to where we want them between each layer.

+

That’s it! All the amazing results you’ll see from now on are just increasingly large Transformers with dozens of layers, dozens of heads within each layer, large embedding dimensions, etc. The fundamentals are the same. It’s just the Transformer model.

+

Anthropic has been publishing great work lately to investigate why Transformers work so well. Check out these publications:

+
    +
  1. A Mathematical Framework for Transformer Circuits
  2. +
  3. In-Context Learning and Induction Heads
  4. +
  5. Toy Models of Superposition
  6. +
+

3 - Large Language Models

+

Models

+

GPT and GPT-2 came out in 2018 and 2019, respectively. The name means “generative pre-trained Transformers.” They are decoder-only models and use masked self-attention. This means: At a poi that at the output sequence, you can only attend to two input sequence vectors that came before that point in the sequence.

+

alt_text

+

These models were trained on 8 million web pages. The largest model has 1.5 billion parameters. The task that GPT-2 was trained on is predicting the next word in all of this text on the web. They found that it works increasingly well with an increasing number of parameters.

+

alt_text

+

BERT came out around the same time as Bidirectional Encoder Representations for Transformers. It is encoder-only and does not do attention masking. It has 110 million parameters. During training, BERT masks out random words in a sequence and has to predict whatever the masked word is.

+

alt_text

+

T5 (Text-to-Text Transformer) came out in 2020. The input and output are both text strings, so you can specify the task that the model supposes to be doing. T5 has an encoder-decoder architecture. It was trained on the C4 dataset (Colossal Clean Crawled Corpus), which is 100x larger than Wikipedia. It has around 10 billion parameters. You can download the open-sourced model and run it on your machine.

+

GPT-3 was one of the state-of-the-art models in 2020. It was 100x larger than GPT/GPT-2 with 175 billion parameters. Because of its size, GPT-3 exhibits unprecedented capabilities of few-shot and zero-shot learning. As seen in the graph below, the more examples you give the model, the better its performance is. The larger the model is, the better its performance is. If a larger model was trained, it would be even better.

+

alt_text

+

OpenAI also released Instruct-GPT earlier this year. It had humans rank different GPT-3 outputs and used reinforcement learning to fine-tune the model. Instruct-GPT was much better at following instructions. OpenAI has put this model, titled ‘text-davinci-002,’ in their API. It is unclear how big the model is. It could be ~10x smaller than GPT-3.

+

alt_text

+

DeepMind released RETRO (Retrieval-Enhanced Transformers) in 2021. Instead of learning language and memorizing facts in the model’s parameters, why don’t we just learn the language in parameters and retrieve facts from a large database of internal text? To implement RETRO, they encode a bunch of sentences with BERT and store them in a huge database with more than 1 trillion tokens. At inference time, they fetch matching sentences and attend to them. This is a powerful idea because RETRO is connected to an always updated database of facts.

+

alt_text

+

DeepMind released another model called Chinchilla in 2022, which observed the scaling laws of language models. They trained over 400 language models from 70 million to 16 billion parameters on 5 billion to 500 billion tokens. They then derived formulas for optimal model and training set size, given a fixed compute budget. They found that most large language models are “undertrained,” meaning they haven’t seen enough data.

+

alt_text

+

To prove this, they trained a large model called Gopher with 280 billion parameters and 300 billion tokens. With Chincilla, they reduced the number of parameters to 70 billion and used four times as much data (1.4 trillion tokens). Chinchilla not only matched Gopher’s performance but actually exceeded it. Check out this LessWrong post if you want to read about people’s opinions on it.

+

Vendors

+

OpenAI offers four model sizes: Ada, Babbage, Curie, and Davinci. Each has a different price and different capabilities. Most of the impressive GPT-3 results on the Internet came from Davinci. These correspond to 350M, 1.3B, 6.7B, and 175B parameters. You can also fine-tune models for an extra cost. The quota you get when you sign up is pretty small, but you can raise it over time. You have to apply for review before going into production.

+

There are some alternatives to OpenAI:

+
    +
  1. Cohere AI has similar models for similar prices.
  2. +
  3. AI21 also has some large models.
  4. +
  5. There are also open-source large language models, such as Eleuther GPT-NeoX (20B parameters), Facebook OPT-175B (175B parameters), and BLOOM from BigScience (176B parameters). If you want to use one of these open-source models but do not have to be responsible for deploying it, you can use HuggingFace’s inference API.
  6. +
+

4 - Prompt Engineering

+

GPT-3 and other large language models are mostly alien technologies. It’s unclear how they exactly work. People are finding out how they work by playing with them. We will cover some notable examples below. Note that if you play around with them long enough, you are likely to discover something new.

+

GPT-3 is surprisingly bad at reversing words due to tokenization: It doesn’t see letters and words as humans do. Instead, it sees “tokens,” which are chunks of characters. Furthermore, it gets confused with long-ish sequences. Finally, it has trouble merging characters. For it to work, you have to teach GPT-3 the algorithm to use to get around its limitations. Take a look at this example from Peter Welinder.

+

alt_text

+

Another crazy prompt engineering is “Let’s Think Step By Step.” This comes from a paper called “Large Language Models are Zero-Shot Reasoners.” Simply adding “Let’s Think Step By Step” into the prompt increases the accuracy of GPT-3 on one math problem dataset from 17% to 78% and another math problem dataset from 10% to 40%.

+

alt_text

+

Another unintuitive thing is that the context length of GPT is long. You can give it a long instruction and it can return the desired output. This example shows how GPT can output a CSV file and write the Python code as stated. You can also use formatting tricks to reduce the training cost, as you can do multiple tasks per call. Take a look at this example for inspiration.

+

We have to be careful since our models might get pwnage or possessed. User input in the prompt may instruct the model to do something naughty. This input can even reveal your prompt to prompt injection attacks and possess your AI. This actually works in GPT-3-powered production apps.

+

alt_text

+

Further work is needed before putting GPT-3-powered apps into production. There are some tools for prompt engineering such as PromptSource and OpenPrompt, but we definitely need better tools.

+

5 - Other Applications

+

Code

+

alt_text

+

One notable application of large foundation models is code generation. With a 40- billion-parameter Transformer model pre-trained on all the Github code it could find, DeepMind Alphacode was able to achieve an above-average score on the Codeforce competition. To do this, they used a model to generate a large set of potential solutions and another model to winnow down the options by actually executing them.

+

The general idea to highlight from this is filtering the outputs of a model. You can have a separate model that does filtering, or you can have some kind of verification + validation process. This can really significantly boost accuracy. OpenAI demonstrates impressive results on different math word problems, as seen below.

+

alt_text

+

Code generation has moved into products of late, like Github Copilot. We highly recommend trying it out! Another option for a similar tool is replit’s new tool for coding.

+

We’re just getting started with the applications of foundation models to the programming workflow. In fact, things are about to start getting really wild. A recent paper showed that a large language model that generated its own synthetic puzzles to learn to code could improve significantly. Models are teaching themselves to get better!

+

alt_text

+

Playing around with systems like GPT-3 and their ability to generate code can feel quite remarkable! Check out some fun experiments Sergey ran (here and here).

+

alt_text

+ +

Semantic search is another interesting application area. If you have texts like words, sentences, paragraphs, or even whole documents, you can embed that text with large language models to get vectors. If you have queries in sentences or paragraphs, you can also embed them in the same way. With this function, you can generate embeddings and easily find semantic overlap by examining the cosine similarity between embedding vectors.

+

alt_text

+

Implementing this semantic search is hard. Computations on large, dense vectors with float data types are intensive. Companies like Google and Facebook that use this approach have developed libraries like FAISS and ScaNN to solve the challenges of implementing semantic search.

+

Some open-source options for this include Haystack from DeepSet and Jina.AI. Other vendor options include Pinecone, Weaviate, Milvus, Qdrant, Google Vector AI Matching Engine, etc.

+

Going Cross-Modal

+

Newer models are bridging the gap between data modalities (e.g. using both vision and text). One such model is the Flamingo model, which uses a special model component called a perceiver resampler (an attention module that translates images into fixed-length sequences of tokens).

+

alt_text

+

Another paper about Socratic Models was recently published. The author trained several large models (a vision model, a language model, and an audio model) that are able to interface with each other using language prompts to perform new tasks.

+

Finally, the concept of “Foundation Models” came from the paper “On the Opportunities and Risks of Foundation Models” by researchers at Stanford Institute for Human-Centered AI. We think “Large Language Models” or “Large Neural Networks” might be more useful terms.

+

6 - CLIP and Image Generation

+

Now, let's talk about some of the most exciting applications of this kind of model: in vision!

+

In a 2021 OpenAI paper called “Learning transferrable visual models from natural language supervision”, CLIP (Contrastive Language–Image Pre-training) was introduced. In this paper, the authors encode text via Transforms, encode images via ResNets or Visual Transformers, and apply contrastive training to train the model. Contrastive training matches correct image and text pairs using cosine similarity. The code for this is tremendously simple!

+

alt_text

+

With this powerful trained model, you can map images and text using embeddings, even on unseen data. There are two ways of doing this. One is to use a “linear probe” by training a simple logistic regression model on top of the features CLIP outputs after performing inference. Otherwise, you can use a “zero-shot” technique that encodes all the text labels and compares them to the encoded image. Zero-shot tends to be better, but not always.

+

Since OpenAI CLIP was released in an open-source format, there have been many attempts to improve it, including the OpenCLIP model, which actually outperforms CLIP.

+

To clarify, CLIP doesn’t go directly from image to text or vice versa. It uses embeddings. This embedding space, however, is super helpful for actually performing searches across modalities. This goes back to our section on vector search. There are so many cool projects that have come out of these efforts! (like this and this)

+

To help develop mental models for these operations, consider how to actual perform image captioning (image -> text) and image generation (text -> image). There are two great examples of this written in the ClipCap paper. At a high level, image captioning is performed through training a separate model to mediate between a frozen CLIP, which generates a series of word embeddings, and a frozen GPT-2, which takes these word embeddings and generates texts.

+

The intermediate model is a Transformer model that gets better at modeling images and captions.

+

alt_text

+

In image generation, the most well-known approach is taken by DALL-E 2 or unCLIP. In this method, two additional components are introduced to a CLIP system, a prior that maps from text embedding to image embeddings and a decoder that maps from image embedding to image. The prior exists to solve the problem that many text captions can accurately work for an image.

+

alt_text

+

In DALL-E 2’s case, they use an approach for the prior called a diffusion model. Diffusion models are trained to denoise data effectively through training on incrementally noisy data.

+

alt_text

+

In DALL-E 2, the diffusion method is applied to the prior model, which trains its denoising approach on a sequence of encoded text, CLIP text embedding, the diffusion timestamp, and the noised CLIP embedding, all so it can predict the un-noised CLIP image embedding. In doing so, it helps us bridge the gap between the raw text caption to the model, which can be infinitely complicated and “noisy”, and the CLIP image embedding space.

+

alt_text

+

The decoder helps us go from the prior’s output of an image embedding to an image. This is a much simpler approach for us to understand. We apply a U-Net structure to a diffusion training process that is able to ultimately “de-noise” the input image embedding and output an image.

+

alt_text

+

The results of this model are incredible! You can even generate images and merge images using CLIP embeddings. There are all kinds of funky ways of playing with the embeddings to create various image outputs.

+

alt_text

+

Other models of interest are Parti and StableDiffusion.

+
    +
  • Google published Parti soon after DALL-E 2. Parti uses a VQGAN model instead of a diffusion model, where the image is represented as a sequence of high-dimensional tokens.
  • +
  • StableDiffusion has been released publicly, so definitely check it out! It uses a “latent diffusion” model, which diffuses the image in a low-dimensional latent space and decodes the image back into a pixel space.
  • +
+

alt_text

+

There has been an absolute explosion of these applications. Check out these examples on image-to-image, video generation, and photoshop plugins. The sky is the limit.

+

Prompting these models is interesting and can get pretty involved. Someday this may even be tool and code-based. You can learn from other people on Lexica and promptoMANIA.

+

It’s truly a remarkable time to be involved with AI models as they scale to new heights.

+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lecture-7-foundation-models/media/image-1.png b/course/2022/lecture-7-foundation-models/media/image-1.png new file mode 100644 index 00000000..dd2252d7 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-1.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-10.png b/course/2022/lecture-7-foundation-models/media/image-10.png new file mode 100644 index 00000000..5a0328e1 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-10.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-11.png b/course/2022/lecture-7-foundation-models/media/image-11.png new file mode 100644 index 00000000..2a96189e Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-11.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-12.png b/course/2022/lecture-7-foundation-models/media/image-12.png new file mode 100644 index 00000000..5da70620 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-12.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-13.png b/course/2022/lecture-7-foundation-models/media/image-13.png new file mode 100644 index 00000000..28750a42 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-13.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-14.png b/course/2022/lecture-7-foundation-models/media/image-14.png new file mode 100644 index 00000000..ed03adfa Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-14.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-15.jpg b/course/2022/lecture-7-foundation-models/media/image-15.jpg new file mode 100644 index 00000000..bca927b8 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-15.jpg differ diff --git a/course/2022/lecture-7-foundation-models/media/image-16.png b/course/2022/lecture-7-foundation-models/media/image-16.png new file mode 100644 index 00000000..40f349cb Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-16.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-17.png b/course/2022/lecture-7-foundation-models/media/image-17.png new file mode 100644 index 00000000..2f82f8bc Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-17.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-18.png b/course/2022/lecture-7-foundation-models/media/image-18.png new file mode 100644 index 00000000..50f1d5a3 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-18.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-19.png b/course/2022/lecture-7-foundation-models/media/image-19.png new file mode 100644 index 00000000..b257a161 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-19.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-2.png b/course/2022/lecture-7-foundation-models/media/image-2.png new file mode 100644 index 00000000..fd9305cd Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-2.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-20.png b/course/2022/lecture-7-foundation-models/media/image-20.png new file mode 100644 index 00000000..f60f9969 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-20.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-21.jpg b/course/2022/lecture-7-foundation-models/media/image-21.jpg new file mode 100644 index 00000000..bf0929e0 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-21.jpg differ diff --git a/course/2022/lecture-7-foundation-models/media/image-22.png b/course/2022/lecture-7-foundation-models/media/image-22.png new file mode 100644 index 00000000..212cdffe Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-22.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-23.png b/course/2022/lecture-7-foundation-models/media/image-23.png new file mode 100644 index 00000000..aaaf85c1 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-23.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-24.png b/course/2022/lecture-7-foundation-models/media/image-24.png new file mode 100644 index 00000000..967da4fa Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-24.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-25.png b/course/2022/lecture-7-foundation-models/media/image-25.png new file mode 100644 index 00000000..a177d10f Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-25.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-26.png b/course/2022/lecture-7-foundation-models/media/image-26.png new file mode 100644 index 00000000..fc9fccde Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-26.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-27.png b/course/2022/lecture-7-foundation-models/media/image-27.png new file mode 100644 index 00000000..ce0b29b0 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-27.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-28.png b/course/2022/lecture-7-foundation-models/media/image-28.png new file mode 100644 index 00000000..514ea08d Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-28.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-29.png b/course/2022/lecture-7-foundation-models/media/image-29.png new file mode 100644 index 00000000..1aa1bbd5 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-29.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-3.png b/course/2022/lecture-7-foundation-models/media/image-3.png new file mode 100644 index 00000000..43ae9873 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-3.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-30.png b/course/2022/lecture-7-foundation-models/media/image-30.png new file mode 100644 index 00000000..9e50bf91 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-30.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-31.png b/course/2022/lecture-7-foundation-models/media/image-31.png new file mode 100644 index 00000000..6b1d3e6b Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-31.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-4.png b/course/2022/lecture-7-foundation-models/media/image-4.png new file mode 100644 index 00000000..0038bb4b Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-4.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-5.png b/course/2022/lecture-7-foundation-models/media/image-5.png new file mode 100644 index 00000000..7aa31886 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-5.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-6.png b/course/2022/lecture-7-foundation-models/media/image-6.png new file mode 100644 index 00000000..b3a39fbf Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-6.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-7.png b/course/2022/lecture-7-foundation-models/media/image-7.png new file mode 100644 index 00000000..bbdd39d5 Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-7.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-8.png b/course/2022/lecture-7-foundation-models/media/image-8.png new file mode 100644 index 00000000..ac7bde2a Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-8.png differ diff --git a/course/2022/lecture-7-foundation-models/media/image-9.png b/course/2022/lecture-7-foundation-models/media/image-9.png new file mode 100644 index 00000000..6948955e Binary files /dev/null and b/course/2022/lecture-7-foundation-models/media/image-9.png differ diff --git a/course/2022/lecture-8-teams-and-pm/index.html b/course/2022/lecture-8-teams-and-pm/index.html new file mode 100644 index 00000000..b6fa5b41 --- /dev/null +++ b/course/2022/lecture-8-teams-and-pm/index.html @@ -0,0 +1,3409 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 8: ML Teams and Project Management - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Lecture 8: ML Teams and Project Management

+
+ +
+ +

Lecture by Josh Tobin. +Notes by James Le and Vishnu Rachakonda.
+Published September 26, 2022. +Download slides.

+

0 - Why is this hard?

+

Building any product is hard:

+
    +
  • +

    You have to hire great people.

    +
  • +
  • +

    You have to manage and develop those people.

    +
  • +
  • +

    You have to manage your team's output and make sure your vectors are +aligned.

    +
  • +
  • +

    You have to make good long-term technical choices and manage +technical debt.

    +
  • +
  • +

    You have to manage expectations from leadership.

    +
  • +
  • +

    You have to define and communicate requirements with stakeholders.

    +
  • +
+

Machine Learning (ML) adds complexity to that process:

+
    +
  • +

    ML talent is expensive and scarce.

    +
  • +
  • +

    ML teams have a diverse set of roles.

    +
  • +
  • +

    Projects have unclear timelines and high uncertainty.

    +
  • +
  • +

    The field is moving fast, and ML is the "high-interest credit card +of technical +debt."

    +
  • +
  • +

    Leadership often doesn't understand ML.

    +
  • +
  • +

    ML products fail in ways that are hard for laypeople to understand.

    +
  • +
+

In this lecture, we'll talk about:

+
    +
  1. +

    ML-related roles and their required skills.

    +
  2. +
  3. +

    How to hire ML engineers (and how to get hired).

    +
  4. +
  5. +

    How ML teams are organized and fit into the broader +organization.

    +
  6. +
  7. +

    How to manage an ML team and ML products.

    +
  8. +
  9. +

    Design considerations for ML products.

    +
  10. +
+

1 - Roles

+

Common Roles

+

Let's look at the most common ML roles and the skills they require:

+
    +
  • +

    The ML Product Manager works with the ML team, other business +functions, the end-users, and the data owners. This person designs +docs, creates wireframes, and develops a plan to prioritize and +execute ML projects.

    +
  • +
  • +

    The MLOps/ML Platform Engineer builds the infrastructure to make +models easier and more scalable to deploy. This person handles the +ML infrastructure that runs the deployed ML product using +platforms like AWS, GCP, Kafka, and other ML tooling vendors.

    +
  • +
  • +

    The ML Engineer trains and deploys prediction models. This +person uses tools like TensorFlow and Docker to work with +prediction systems running on real data in production.

    +
  • +
  • +

    The ML Researcher trains prediction models, often those that are +forward-looking or not production-critical. This person uses +libraries like TensorFlow and PyTorch on notebook environments to +build models and reports describing their experiments.

    +
  • +
  • +

    The Data Scientist is a blanket term used to describe all of the +roles above. In some organizations, this role entails answering +business questions via analytics. This person can work with +wide-ranging tools from SQL and Excel to Pandas and Scikit-Learn.

    +
  • +
+

+

Skills Required

+

What skills are needed for these roles? The chart below displays a nice +visual - where the horizontal axis is the level of ML expertise and the +size of the bubble is the level of communication and technical writing +(the bigger, the better).

+

+
    +
  • +

    The MLOps is primarily a software engineering role, which often +comes from a standard software engineering pipeline.

    +
  • +
  • +

    The ML Engineer requires a rare mix of ML and Software +Engineering skills. This person is either an engineer with +significant self-teaching OR a science/engineering Ph.D. who works +as a traditional software engineer after graduate school.

    +
  • +
  • +

    The ML Researcher is an ML expert who usually has an MS or Ph.D. +degree in Computer Science or Statistics or finishes an industrial +fellowship program.

    +
  • +
  • +

    The ML Product Manager is just like a traditional Product +Manager but with a deep knowledge of the ML development process +and mindset.

    +
  • +
  • +

    The Data Scientist role constitutes a wide range of backgrounds, +from undergraduate to Ph.D. students.

    +
  • +
+

There is an important distinction between a task ML engineer and a +platform ML engineer, coined by Shreya Shankar in this blog +post:

+
    +
  1. +

    Task ML engineers are responsible for maintaining specific ML +pipelines. They only focus on ensuring that these ML models are +healthy and updated frequently. They are often overburdened.

    +
  2. +
  3. +

    Platform ML engineers help task ML engineers automate tedious +parts of their jobs. They are called MLOps/ML Platform engineers +in our parlance.

    +
  4. +
+

2 - Hiring

+

The AI Talent Gap

+

In 2018 (when we started FSDL), the AI talent gap was the main story. +There were so few people who understood this technology, so the biggest +block for organizations was that they couldn't find people who were good +at ML.

+

In 2022, the AI talent gap persists. But it tends to be less of a +blocker than it used to be because we have had four years of folks +switching careers into ML and software engineers emerging from +undergraduate with at least a couple of ML classes under their belts.

+

The gap tends to be in folks that understand more than just the +underlying technology but also have experience in seeing how ML fails +and how to make ML successful when it's deployed. That's the reality of +how difficult it is to hire ML folks today, especially those with +production experience.

+

Sourcing

+

Because of this shallow talent pool and the skyrocketing demand, hiring +for ML positions is pretty hard. Typical ML roles come in the following +structure:

+
    +
  • +

    ML Adjacent roles: ML product manager, DevOps, Data Engineer

    +
  • +
  • +

    Core ML Roles: ML Engineer, ML Research/ML Scientist

    +
  • +
  • +

    Business analytics roles: Data Scientist

    +
  • +
+

For ML-adjacent roles, traditional ML knowledge is less important, as +demonstrated interest, conversational understanding, and experience can +help these professionals play an impactful role on ML teams. Let's focus +on how to hire for the core ML roles.

+

+

While there's no perfect way to hire ML engineers, there's +definitely a wrong way to hire them, with extensive job descriptions +that demand only the best qualifications (seen above). Certainly, there +are many good examples of this bad practice floating around.

+
    +
  • +

    Rather than this unrealistic process, consider hiring for software +engineering skills, an interest in ML, and a desire to learn. You +can always train people in the art and science of ML, especially +when they come with strong software engineering fundamentals.

    +
  • +
  • +

    Another option is to consider adding junior talent, as many recent +grads come out with good ML knowledge nowadays.

    +
  • +
  • +

    Finally, and most importantly, be more specific about what you need +the position and professional to do. It's impossible to find one +person that can do everything from full-fledged DevOps to +algorithm development.

    +
  • +
+

To hire ML researchers, here are our tips:

+
    +
  • +

    Evaluate the quality of publications, over the quantity, with an eye +toward the originality of the ideas, the execution, etc.

    +
  • +
  • +

    Prioritize researchers that focus on important problems instead of +trendy problems.

    +
  • +
  • +

    Experience outside academia is also a positive, as these researchers +may be able to transition to industry more effectively.

    +
  • +
  • +

    Finally, keep an open mind about research talent and consider +talented people without PhDs or from adjacent fields like physics, +statistics, etc.

    +
  • +
+

To find quality candidates for these roles, here are some ideas for +sourcing:

+
    +
  • +

    Use standard sources like LinkedIn, recruiters, on-campus +recruiting, etc.

    +
  • +
  • +

    Monitor arXiv and top conferences and flag the first authors of +papers you like.

    +
  • +
  • +

    Look for good implementations of papers you like.

    +
  • +
  • +

    Attend ML research conferences (NeurIPS, ICML, ICLR).

    +
  • +
+

+

As you seek to recruit, stay on top of what professionals want and make +an effort to position your company accordingly. ML practitioners want to +be empowered to do great work with interesting data. Building a culture +of learning and impact can help recruit the best talent to your team. +Additionally, sell sell sell! Talent needs to know how good your team is +and how meaningful the mission can be.

+

Interviewing

+

As you interview candidates for ML roles, try to validate your +hypotheses of their strengths while testing a minimum bar on weaker +aspects. For example, ensure ML researchers can think creatively about +new ML problems while ensuring they meet a baseline for code quality. +It's essential to test ML knowledge and software engineering skills for +all industry professionals, though the relative strengths can vary.

+

The actual ML interview process is much less well-defined than software +engineering interviews, though it is modeled off of it. Some helpful +inclusions are projects or exercises that test the ability to work with +ML-specific code, like take-home ML projects. Chip Huyen's +"Introduction to ML Interviews +Book" is a +great resource.

+

Finding A Job

+

To find an ML job, you can take a look at the following sources:

+
    +
  • +

    Standard sources such as LinkedIn, recruiters, on-campus recruiting, +etc.

    +
  • +
  • +

    ML research conferences (NeurIPS, ICLR, ICML).

    +
  • +
  • +

    Apply directly (remember, there's a talent gap!).

    +
  • +
+

Standing out for competitive roles can be tricky! Here are some tips (in +increasing order of impressiveness) that you can apply to differentiate +yourself:

+
    +
  1. +

    Exhibit ML interest (e.g., conference attendance, online course +certificates, etc.).

    +
  2. +
  3. +

    Build software engineering skills (e.g., at a well-known software +company).

    +
  4. +
  5. +

    Show you have a broad knowledge of ML (e.g., write blog posts +synthesizing a research area).

    +
  6. +
  7. +

    Demonstrate ability to get ML projects done (e.g., create side +projects, re-implement papers).

    +
  8. +
  9. +

    Prove you can think creatively in ML (e.g., win Kaggle competitions, +publish papers).

    +
  10. +
+

3 - Organizations

+

Organization Archetypes

+

There exists not yet a consensus on the right way to structure an ML +team. Still, a few best practices are contingent upon different +organization archetypes and their ML maturity level. First, let's see +what the different ML organization archetypes are.

+

Archetype 1 - Nascent and Ad-Hoc ML

+
    +
  • +

    These are organizations where no one is doing ML, or ML is done on +an ad-hoc basis. Obviously, there is little ML expertise in-house.

    +
  • +
  • +

    They are either small-to-medium businesses or less +technology-forward large companies in industries like education or +logistics.

    +
  • +
  • +

    There is often low-hanging fruit for ML.

    +
  • +
  • +

    But there is little support for ML projects, and it's challenging to +hire and retain good talent.

    +
  • +
+

Archetype 2 - ML R&D

+
    +
  • +

    These are organizations in which ML efforts are centered in the R&D +arm of the organization. They often hire ML researchers and +doctorate students with experience publishing papers.

    +
  • +
  • +

    They are larger companies in sectors such as oil and gas, +manufacturing, or telecommunications.

    +
  • +
  • +

    They can hire experienced researchers and work on long-term business +priorities to get big wins.

    +
  • +
  • +

    However, it is very difficult to get quality data. Most often, this +type of research work rarely translates into actual business +value, so usually, the amount of investment remains small.

    +
  • +
+

Archetype 3 - ML Embedded Into Business and Product Teams

+
    +
  • +

    These are organizations where certain product teams or business +units have ML expertise alongside their software or analytics +talent. These ML individuals report up to the team's +engineering/tech lead.

    +
  • +
  • +

    They are either software companies or financial services companies.

    +
  • +
  • +

    ML improvements are likely to lead to business value. Furthermore, +there is a tight feedback cycle between idea iteration and product +improvement.

    +
  • +
  • +

    Unfortunately, it is still very hard to hire and develop top talent, +and access to data and compute resources can lag. There are also +potential conflicts between ML project cycles and engineering +management, so long-term ML projects can be hard to justify.

    +
  • +
+

Archetype 4 - Independent ML Function

+
    +
  • +

    These are organizations in which the ML division reports directly to +senior leadership. The ML Product Managers work with Researchers +and Engineers to build ML into client-facing products. They can +sometimes publish long-term research.

    +
  • +
  • +

    They are often large financial services companies.

    +
  • +
  • +

    Talent density allows them to hire and train top practitioners. +Senior leaders can marshal data and compute resources. This gives +the organizations to invest in tooling, practices, and culture +around ML development.

    +
  • +
  • +

    A disadvantage is that model handoffs to different business lines +can be challenging since users need the buy-in to ML benefits and +get educated on the model use. Also, feedback cycles can be slow.

    +
  • +
+

Archetype 5 - ML-First Organizations

+
    +
  • +

    These are organizations in which the CEO invests in ML, and there +are experts across the business focusing on quick wins. The ML +division works on challenging and long-term projects.

    +
  • +
  • +

    They are large tech companies and ML-focused startups.

    +
  • +
  • +

    They have the best data access (data thinking permeates the +organization), the most attractive recruiting funnel (challenging +ML problems tends to attract top talent), and the easiest +deployment procedure (product teams understand ML well enough).

    +
  • +
  • +

    This type of organization archetype is hard to implement in practice +since it is culturally difficult to embed ML thinking everywhere.

    +
  • +
+

Team Structure Design Choices

+

Depending on the above archetype that your organization resembles, you +can make the appropriate design choices, which broadly speaking follow +these three categories:

+
    +
  1. +

    Software Engineer vs. Research: To what extent is the ML team +responsible for building or integrating with software? How +important are Software Engineering skills on the team?

    +
  2. +
  3. +

    Data Ownership: How much control does the ML team have over data +collection, warehousing, labeling, and pipelining?

    +
  4. +
  5. +

    Model Ownership: Is the ML team responsible for deploying models +into production? Who maintains the deployed models?

    +
  6. +
+

Below are our design suggestions:

+

If your organization focuses on ML R&D:

+
    +
  • +

    Research is most definitely prioritized over Software Engineering +skills. Because of this, there would potentially be a lack of +collaboration between these two groups.

    +
  • +
  • +

    ML team has no control over the data and typically will not have +data engineers to support them.

    +
  • +
  • +

    ML models are rarely deployed into production.

    +
  • +
+

If your organization has ML embedded into the product:

+
    +
  • +

    Software Engineering skills will be prioritized over Research +skills. Often, the researchers would need strong engineering +skills since everyone would be expected to product-ionize his/her +models.

    +
  • +
  • +

    ML teams generally do not own data production and data management. +They will need to work with data engineers to build data +pipelines.

    +
  • +
  • +

    ML engineers totally own the models that they deploy into +production.

    +
  • +
+

If your organization has an independent ML division:

+
    +
  • +

    Each team has a potent mix of engineering and research skills; +therefore, they work closely together within teams.

    +
  • +
  • +

    ML team has a voice in data governance discussions, as well as a +robust data engineering function.

    +
  • +
  • +

    ML team hands-off models to users but is still responsible for +maintaining them.

    +
  • +
+

If your organization is ML-First:

+
    +
  • +

    Different teams are more or less research-oriented, but in general, +research teams collaborate closely with engineering teams.

    +
  • +
  • +

    ML team often owns the company-wide data infrastructure.

    +
  • +
  • +

    ML team hands the models to users, who are responsible for operating +and maintaining them.

    +
  • +
+

The picture below neatly sums up these suggestions:

+

+

4 - Managing

+

Managing ML Teams Is Challenging

+

The process of actually managing an ML team is quite challenging for +four reasons:

+
    +
  1. +

    Engineering Estimation: It's hard to know how easy or hard an ML +project is in advance. As you explore the data and experiment with +different models, there is enormous scope for new learnings about +the problem that materially impact the timeline. Furthermore, +knowing what methods will work is often impossible. This makes it +hard to say upfront how long or how much work may go into an ML +project.

    +
  2. +
  3. +

    Nonlinear Progress: As the chart below from a blog +post +by Lukas Biewald (CEO of Weights and +Biases) shows, progress on ML +projects is unpredictable over time, even when the effort expended +grows considerably. It's very common for projects to stall for +extended periods of time.

    +
  4. +
+

+
    +
  1. +

    Cultural gaps: The relative culture of engineering and research +professionals is very different. Research tends to favor novel, +creative ideas, while engineering prefers tried and true methods +that work. As a result, ML teams often experience a clash of +cultures, which can turn toxic if not appropriately managed. A +core challenge of running ML teams is addressing the cultural +barriers between ML and software engineering so that teams can +harmoniously experiment and deliver ML products.

    +
  2. +
  3. +

    Leadership Deficits: It's common to see a lack of detailed +understanding of ML at senior levels of management in many +companies. As a result, expressing feasibility and setting the +right expectations for ML projects, especially high-priority ones, +can be hard.

    +
  4. +
+

How To Manage ML Teams Better

+

Managing ML teams is hardly a solved problem, but you can take steps to +improve the process.

+

Plan probabilistically

+

Many engineering projects are managed in a waterfall fashion, with the +sequential tasks defined up front clearly. Instead of forcing this +method of engineering management on difficult ML projects, try assigning +a likelihood of success to different tasks to better capture the +experimental process inherent to ML engineering. As these tasks progress +or stall, rapidly re-evaluate your task ordering to better match what is +working. Having this sense of both (1) how likely a task is to +succeed and (2) how important it is makes project planning +considerably more realistic.

+

+

Have a portfolio of approaches

+

Embrace multiple ideas and approaches to solve crucial research +challenges that gate production ML. Don't make your plan dependent on +one approach working!

+

Measure inputs, not results

+

As you work through several approaches in your portfolio, do not overly +emphasize whose ideas ultimately work as a reflection of contribution +quality. This can negatively impact team members' creativity, as they +focus more on trying to find only what they currently think could work, +rather than experimenting in a high-quality fashion (which is ultimately +what leads to ML success).

+

Have researchers and engineers work together

+

The collaboration between engineering and research is essential for +quality ML products to get into production. Emphasize collaboration +across the groups and professionals!

+

Get quick wins

+

Taking this approach makes it more likely that your ML project will +succeed in the long term. It allows you to demonstrate progress to your +leadership more effectively and clearly.

+

Educate leadership on uncertainty

+

This can be hard, as leadership is ultimately accountable for addressing +blind spots and understanding timeline risk. There are things you can +do, however, to help improve leadership's knowledge about ML timelines.

+
    +
  • +

    Avoid building hype around narrow progress metrics material only to +the ML team (e.g., "We improved F1 score by 0.2 and have achieved +awesome performance!").

    +
  • +
  • +

    Instead, be realistic, communicate risk, and emphasize real product +impact (e.g., "Our model improvements should increase the number +of conversions by 10%, though we must continue to validate its +performance on additional demographic factors.)

    +
  • +
  • +

    Sharing resources like this a16z primer, +this class from Prof. Pieter +Abbeel, +and this Google's People + AI +guidebook can +increase awareness of your company's leadership.

    +
  • +
+

ML PMs are well-positioned to educate the organization

+

There are two types of ML product managers.

+
    +
  1. +

    Task PMs: These are the more common form of ML PM. They are +generally specialized into a specific product area (e.g. trust and +safety) and have a strong understanding of the particular use +case.

    +
  2. +
  3. +

    Platform PMs: These are a newer form of PMs. They have a broader +mandate to ensure that the ML team (generally centralized in this +context) is highest leverage. They manage workflow and priorities +for this centralized team. To support this, they tend to have a +broad understanding of ML themselves. These PMs are critical for +educating the rest of the company about ML and ensuring that teams +trust the output of models.

    +
  4. +
+

Both types of PMs are crucial for ML success. Platform PMs tend to have +a particularly powerful role to play in pushing an organization's +adoption of machine learning and making it successful.

+

What is "Agile" for ML?

+

There are two options similar to what Agile is for software development +in the ML context. They are shown below:

+

+

They are both structured, data-science native approaches to project +management. You can use them to provide standardization for project +stages, roles, and artifacts.

+

TDSP tends to be more structured and is a strong alternative to the +Agile methodology. CRISP-DM is somewhat higher level and does not +provide as structured a project management workflow. If you genuinely +have a large-scale coordination problem, you can try these frameworks, +but don't otherwise. They can slow you down since they are more oriented +around "traditional" data science and not machine learning.

+

5 - Design

+

Let's talk about how to actually design machine learning products now. +The biggest challenge with designing such products often isn't +implementing them; it's bridging the gap between users' inflated +expectations and the reality.

+

Users often expect extremely sophisticated systems capable of solving +many more problems than they actually can.

+

+

In reality, machine learning systems are more like dogs that are trained +to do a special task; weird little guys with a penchant for distraction +and an inability to do much more than they are explicitly told.

+

+

All this leads to a big gap between what can be done and what users +expect!

+

The Keys to Good ML Product Design

+

In practice, good ML product design bridges users expectations and +reality. If you can help users understand the benefits and limitations +of the model, they tend to be more satisfied. Furthermore, always have +backup plans for model failures! Over-automating systems tends to be a +recipe for unhappy users. Finally, building in feedback loops can really +increase satisfaction over time.

+

There are a couple ways to explain the benefits and limitations of +an ML system to users.

+
    +
  • +

    Focus on the problems it solves, not the fact that the system is +"AI-powered".

    +
  • +
  • +

    If you make the system feel "human-like" (unconstrained input, +human-like responses), expect users to treat it as human-like.

    +
  • +
  • +

    Furthermore, seek to include guardrails or prescriptive interfaces +over open-ended, human-like experiences. A good example of the +former approach is Amazon +Alexa, which has specific +prompts that its ML system responds to.

    +
  • +
+

+

Handling failures is a key part of keeping ML systems users happy. +There's nothing worse than a "smart" system that conks out when you do +something slightly unexpected. Having built-in solutions to solve for +automation issues is extremely important. One approach is letting users +be involved to correct improper responses. Another is to focus on the +notion of "model confidence" and only offer responses when the threshold +is met. A good example of a handling failure approach is how Facebook +recommends photo tags for users, but doesn't go so far as to autoassign.

+

Types of User Feedback

+

How can you collect feedback from users in a way that avoids these +issues? There are different types of user feedback and how they help +with model improvement.

+

+

Let's go across this chart.

+
    +
  1. +

    The simplest form of feedback is indirect implicit feedback. For +example, did the user churn from the product? That tells you +immediately how the user felt about the system without them giving +a clear signal themselves.

    +
  2. +
  3. +

    Another form is direct implicit feedback, which involves the +user "taking the next step". For example, in an automated user +onboarding flow, did the user click through into ensuing steps? +This is trickier to implement, but can be useful for future +training iterations.

    +
  4. +
  5. +

    The next type of feedback is binary explicit feedback, wherein +users are specifically asked (e.g. via thumbs up/down buttons) how +they feel about the model performance.

    +
  6. +
  7. +

    You can make this more sophisticated and add categorical explicit +feedback, which allows users to sort their feedback into various +types.

    +
  8. +
  9. +

    To really get a sense of how users feel, consider offering free +text feedback. This is tricky to use for model training and can +be involved for users, but it's very useful to highlight the +highest friction predictions.

    +
  10. +
  11. +

    The gold standard, of course, are model corrections; they are +free labels!

    +
  12. +
+

Whenever building explicit feedback into ML systems, avoid relying on +users' altruism and be clear about why they should engage in the +feedback. Instead, build positive feedback loops by allowing users to +experience the benefits of their feedback quickly.

+

Great ML product experiences are designed from scratch. ML is a very +specific technology with clear advantages and drawbacks. Design needs to +be thoughtfully executed around these products. It's especially +important to allow users to interact safely with ML products that may +fail in unexpected ways. Always try to find ways to build in feedback +loops to make the ML product better over time.

+

There are tons of resources that can help you get started with this +emerging field.

+ +

In conclusion, we talked through a number of adjacent considerations to +building ML systems and products. In short, you ship the team as much +you do the code; be thoughtful about how you hire, manage, and structure +ML teams as much as ML products!

+

+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lecture-8-teams-and-pm/media/image1.png b/course/2022/lecture-8-teams-and-pm/media/image1.png new file mode 100644 index 00000000..72939e4a Binary files /dev/null and b/course/2022/lecture-8-teams-and-pm/media/image1.png differ diff --git a/course/2022/lecture-8-teams-and-pm/media/image10.png b/course/2022/lecture-8-teams-and-pm/media/image10.png new file mode 100644 index 00000000..9f4e92c0 Binary files /dev/null and b/course/2022/lecture-8-teams-and-pm/media/image10.png differ diff --git a/course/2022/lecture-8-teams-and-pm/media/image11.png b/course/2022/lecture-8-teams-and-pm/media/image11.png new file mode 100644 index 00000000..626c2e47 Binary files /dev/null and b/course/2022/lecture-8-teams-and-pm/media/image11.png differ diff --git a/course/2022/lecture-8-teams-and-pm/media/image12.png b/course/2022/lecture-8-teams-and-pm/media/image12.png new file mode 100644 index 00000000..a3df1093 Binary files /dev/null and b/course/2022/lecture-8-teams-and-pm/media/image12.png differ diff --git a/course/2022/lecture-8-teams-and-pm/media/image13.png b/course/2022/lecture-8-teams-and-pm/media/image13.png new file mode 100644 index 00000000..503dd137 Binary files /dev/null and b/course/2022/lecture-8-teams-and-pm/media/image13.png differ diff --git a/course/2022/lecture-8-teams-and-pm/media/image2.png b/course/2022/lecture-8-teams-and-pm/media/image2.png new file mode 100644 index 00000000..854ed090 Binary files /dev/null and b/course/2022/lecture-8-teams-and-pm/media/image2.png differ diff --git a/course/2022/lecture-8-teams-and-pm/media/image3.png b/course/2022/lecture-8-teams-and-pm/media/image3.png new file mode 100644 index 00000000..ca7e5db0 Binary files /dev/null and b/course/2022/lecture-8-teams-and-pm/media/image3.png differ diff --git a/course/2022/lecture-8-teams-and-pm/media/image4.png b/course/2022/lecture-8-teams-and-pm/media/image4.png new file mode 100644 index 00000000..541c6b70 Binary files /dev/null and b/course/2022/lecture-8-teams-and-pm/media/image4.png differ diff --git a/course/2022/lecture-8-teams-and-pm/media/image5.png b/course/2022/lecture-8-teams-and-pm/media/image5.png new file mode 100644 index 00000000..9a640cae Binary files /dev/null and b/course/2022/lecture-8-teams-and-pm/media/image5.png differ diff --git a/course/2022/lecture-8-teams-and-pm/media/image6.png b/course/2022/lecture-8-teams-and-pm/media/image6.png new file mode 100644 index 00000000..69c9472d Binary files /dev/null and b/course/2022/lecture-8-teams-and-pm/media/image6.png differ diff --git a/course/2022/lecture-8-teams-and-pm/media/image7.png b/course/2022/lecture-8-teams-and-pm/media/image7.png new file mode 100644 index 00000000..88ec5e57 Binary files /dev/null and b/course/2022/lecture-8-teams-and-pm/media/image7.png differ diff --git a/course/2022/lecture-8-teams-and-pm/media/image8.png b/course/2022/lecture-8-teams-and-pm/media/image8.png new file mode 100644 index 00000000..d407e3ca Binary files /dev/null and b/course/2022/lecture-8-teams-and-pm/media/image8.png differ diff --git a/course/2022/lecture-8-teams-and-pm/media/image9.png b/course/2022/lecture-8-teams-and-pm/media/image9.png new file mode 100644 index 00000000..e6940dff Binary files /dev/null and b/course/2022/lecture-8-teams-and-pm/media/image9.png differ diff --git a/course/2022/lecture-9-ethics/index.html b/course/2022/lecture-9-ethics/index.html new file mode 100644 index 00000000..01c6a422 --- /dev/null +++ b/course/2022/lecture-9-ethics/index.html @@ -0,0 +1,3435 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 9: Ethics - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Lecture 9: Ethics

+
+ +
+ +

Lecture by Charles Frye. +Notes by James Le and Vishnu Rachakonda.
+Published October 03, 2022. +Download slides.

+

In this final lecture of FSDL 2022, we'll talk about ethics. After going +through the context of what we mean by ethics, we'll go through three +different areas where ethics come up:

+
    +
  1. +

    Tech Ethics: ethics that anybody who works in the tech industry +broadly needs to think about.

    +
  2. +
  3. +

    ML Ethics: what ethics has specifically meant for the ML +industry.

    +
  4. +
  5. +

    AI Ethics: what ethics might mean in the future where true AGI +exists.

    +
  6. +
+

1 - Overview and Context

+

All ethics lectures are wrong, but some are useful. They are more useful +if we admit and state what our assumptions or biases are. We'll also +talk about three general themes that come up often when ethical concerns +are raised in tech/ML: alignment, trade-offs, and humility.

+

+

In this lecture, we'll approach ethics on the basis of concrete +cases - specific instances where people have raised concerns. We'll +talk about cases where people have taken actions that have led to +claims and counter-claims of ethical or unethical behavior - such as +the use of automated weapons, the use of ML systems to make decisions +like sentencing and bail, and the use of ML algorithms to generate art. +In each case when criticism has been raised, part of that criticism has +been that the technology is unethical.

+

Approaching ethics in this way allows us to answer the question of "What +is ethics?" by way of Ludwig Wittgenstein's quote: "The meaning of a +word is its use in the language." We'll focus on times when people have +used the word "ethics" to describe what they like or dislike about a +specific technology.

+

If you want to try it out for yourself, you should check out the game +"Something Something Soup +Something." In this browser +game, you are presented with a bunch of dishes and have to decide +whether they are soup or not soup, as well as whether they can be served +to somebody who ordered soup. By playing a game like this, you'll +discover (1) how difficult it is to come up with a concrete definition +of soup and (2) how poorly your working definition of soup fits with any +given soup theory.

+

Because of this case-based approach, we won't be talking about ethical +schools or "trolley" problems. Rather than considering these +hypothetical +scenarios, +we'll talk about concrete and specific examples from the past decade of +work in our field and adjacent fields.

+

+

If you want another point of view that emphasizes the trolley problems, +you should check out Sergey's lecture from the last edition of the +course from +2021. +It presented similar ideas from a different perspective and came to the +same conclusion and some different conclusions.

+

A useful theme from that lecture that we should all have in mind when we +ponder ethical dilemmas is "What Is Water?" - which came up from a +famous commencement speech by David Foster +Wallace. If +we aren't thoughtful and paying attention, things that are very +important can become background, assumptions, and invisible to us.

+

The approach of relying on prominent cases risks replicating social +biases. Some ethical claims are amplified and travel more because +people (who are involved) have more resources and are better connected. +Using these forms of case-based reasoning (where you explain your +beliefs in concrete detail) can hide the principles that are actually +in operation, making them disappear like water.

+

But in the end, so much of ethics is deeply personal that we can't +expect to have a perfect approach. We can just do the best we can and +hopefully become better every day.

+

2 - Themes

+

We'll see three themes repeatedly coming up throughout this lecture:

+
    +
  1. +

    Alignment: a conflict between what we want and what we get.

    +
  2. +
  3. +

    Trade-Offs: a conflict between what we want and what others +want.

    +
  4. +
  5. +

    Humility: a response when we don't know what we want or how to +get it.

    +
  6. +
+

Alignment

+

The problem of alignment (where what we want and what we get differ) +come up over and over again. A primary driver of this is called the +proxy problem - in which we often optimize or maximize some proxies +for the thing that we really care about. If the alignment (or loosely +the correlation between that proxy and the thing we care about) is poor +enough, then by trying to maximize that proxy, we can end up hurting the +thing we originally cared about.

+

+

There was a recent +paper that did a +mathematical analysis of this idea. You can see these kinds of proxy +problems everywhere once you look for them.

+
    +
  • +

    On the top right, we have a train and validation loss chart from one +of the training runs for the FSDL text recognizer. The thing we +can optimize is the training loss. That's what we can use to +calculate gradients and improve the parameters of our network. But +the thing we really care about is the performance of the network +on data points that it has not seen (like the validation set, the +test set, or data in production). If we optimize our training loss +too much, we can actually cause our validation loss to go up.

    +
  • +
  • +

    Similarly, there was an interesting +paper +suggesting that increasing your accuracy on classification tasks +can actually result in a decrease in the utility of your +embeddings in downstream tasks.

    +
  • +
  • +

    You can find these proxy problems outside of ML as well. This +thread +reveals an example where a factory that was making chemical +machines (rather than creating a machine that was cheaper and +better) chose not to adopt producing that machine because their +output was measured in weight. So the thing that the planners +actually cared about, economic efficiency and output, was not +optimized because it was too difficult to measure.

    +
  • +
+

One reason why these kinds of proxy problems arise so frequently is due +to issues of information. The information that we are able to measure +is not the information that we want. At a higher level, we often don't +know what it is that we truly needed. We may want the validation loss, +but what we need is the loss in production or really the value our users +will derive from this model.

+

Trade-Offs

+

Even when we know what we want or what we need, we are likely to run +into the second problem - the tradeoff between stakeholders. It is +sometimes said that the need to negotiate tradeoffs is one of the +reasons why engineers do not like thinking about some of these problems +around ethics. That's not quite right because we do accept tradeoffs as +a key component of engineering.

+ +

+

The famous chart above compares the different convolutional networks on +the basis of their accuracy and the number of operations to run them. +Thinking about these tradeoffs between speed and correctness is exactly +the thing we have to do all the time in our job as engineers.

+

We can select the Pareto Front for the metrics we care about. A way +to remember what a Pareto front is this definition of a data scientist +from Josh +Wills: +"Person who is better at statistics than any software engineer and +better at software engineering than any statistician." The Pareto Front +in the chart above includes the models that are more accurate than those +with fewer FLOPs and use fewer FLOPs than those that are more accurate.

+

A reason why engineers may dislike thinking about these problems is that +it's hard to identify and quantify these tradeoffs. These are indeed +proxy problems. Even further, once measured, where on that front do we +fall? As engineers, we may develop expertise in knowing whether we want +high accuracy or low latency, but we are not as comfortable deciding how +many current orphans we want to trade for what amount of future health. +This raises questions both in terms of measurement and decision-making +that are outside of our expertise.

+

Humility

+

The appropriate response is humility because most engineers do not +explicitly train in these skills. Many engineers and managers in tech, +in fact, constitutionally prefer optimizing single metrics that are not +proxies. Therefore, when encountering a different kind of problem, it's +important to bring a humble mindset, ask for help from experts, and +recognize that the help you get might not be immediately obvious to what +you are used to.

+

Additionally, when intervening due to an ethical concern, it's important +to remember this humility. It's easy to think that when you are on the +good side, this humility is not necessary. But even trying to be helpful +is a delicate and dangerous undertaking. We want to make sure that as we +resolve ethical concerns, we come up with solutions that are not just +parts of the problem.

+

User Orientation Undergirds Each Theme

+

We can resolve all of these via user orientation.

+
    +
  1. +

    By getting feedback from users, we maintain alignment between +our system and our users.

    +
  2. +
  3. +

    When making tradeoffs, we should resolve them in consultation +with users.

    +
  4. +
  5. +

    Humility means we actually listen to our users because we +recognize we don't have the answers to all the questions.

    +
  6. +
+

3 - Tech Ethics

+

The tech industry can't afford to ignore ethics as public trust in tech +declines. We need to learn from other nearby industries that have done a +better job on professional ethics. We'll also touch on some contemporary +topics.

+

Tech Industry's Ethical Crisis

+

Throughout the past decade, the tech industry has been plagued by +scandal - whether that's how tech companies interface with national +governments at the largest scale or how tech systems are being used or +manipulated by people who create disinformation or fake social media +accounts that hack the YouTube recommendation system.

+

As a result, distrust in tech companies has risen markedly in the last +ten years. This Public Affairs Pulse +survey +shows that in 2013, the tech industry was one of the industries with +less trustworthiness on average. In 2021, it has rubbed elbows with +famously more distrusted industries such as energy and pharmaceuticals.

+

+

Politicians care quite a bit about public opinion polls. In the last few +years, the fraction of people who believe that large tech companies +should be more regulated has gone up a substantial amount. Comparing +it to 10 years ago, it's astronomically +higher. +So there will be a substantial impact on the tech industry due to this +loss of public trust.

+

We can learn from nearby fields: from the culture of professional ethics +in engineering in Canada (by wearing the Iron +Ring) to ethical +standards for human subjects research (Nuremberg +Code, 1973 +National Research +Act). +We are at the point where we need a professional code of ethics for +software. Hopefully, many codes of ethics developed in different +communities can compete with each other and merge into something that +most of us can agree on. That can be incorporated into our education for +new members of our field.

+

Let's talk about two particular ethical concerns that arise in tech in +general: carbon emissions and dark/user-hostile design patterns.

+

Tracking Carbon Emissions

+

Because carbon emissions scale with cost, you only need to worry about +them when the costs of what you are working on are very large. Then you +won't be alone in making these decisions and can move a bit more +deliberately to make these choices more thoughtfully.

+

Anthropogenic climate change from carbon emissions raises ethical +concerns - tradeoffs between the present and future generations. The +other view is that this is an issue that arises from a classic alignment +problem: many organizations are trying to maximize their profit, which +is based on prices for goods that don't include externalities (such as +environmental damage caused by carbon emissions, leading to increased +temperatures and lactic change).

+

+

The primary dimension along which we have to worry about carbon +emissions is in compute jobs that require power. That power can +result in carbon emissions. This +paper walks through +how much carbon dioxide was emitted using typical US-based cloud +infrastructure.

+
    +
  • +

    The top headline shows that training a large Transformer model with +neural architecture search produces as much carbon dioxide as five +cars create during their lifetimes.

    +
  • +
  • +

    It's important to remember that power is not free. On US-based cloud +infrastructure, \$10 of cloud spent is roughly equal to \$1 of air +travel costs. That's on the basis of something like the numbers +and the chart indicating air travel across the US from New York to +San Francisco.

    +
  • +
  • +

    Just changing cloud regions can actually reduce your emissions quite +a bit. There's a factor of +50x +from regions with the most to least carbon-intensive power +generation.

    +
  • +
+

The interest in this problem has led to new tools. +Codecarbon.io allows you to +track power consumption and reduce carbon emissions from your computing. +ML CO2 Impact is +oriented directly towards machine learning.

+

Deceptive Design and Dark Patterns

+

The other ethical concern in tech is deceptive design. An +unfortunate amount of deception is tolerated in some areas of software. +As seen below, on the left is a nearly complete history of the way +Google displays ads in its search engine results. It started off very +clearly colored and separated out with bright colors from the rest of +the results. Then about ten years ago, that colored background was +removed and replaced with a tiny little colored snippet that said "Ad." +Now, as of 2020, that small bit is no longer even colored. It is just +bolded. This makes it difficult for users to know which content is being +served to them because somebody paid for it (versus content served up +organically).

+

+

A number of dark patterns of deceptive design have emerged over the +last ten years. You can read about them on the website called +deceptive.design. There's +also a Twitter account called +\@darkpatterns that +shares examples found in the wild.

+

A practice in the tech industry that's on a very shaky ethical /legal +ground is growth hacking. This entails a set of techniques for +achieving rapid growth in user base or revenue for a product and has all +the connotations you might expect from the name - with examples +including LinkedIn and Hotmail.

+

+

ML can actually make this problem worse if we optimize short-term +metrics. These growth hacks and deceptive designs can often drive user +and revenue growth in the short term but worsen user experience and draw +down on goodwill towards the brand in a way that can erode the long-term +value of customers. When we incorporate ML into the design of our +products with A/B testing, we have to watch out to make sure that the +metrics that we are optimizing do not encourage this kind of deception.

+

These arise inside another alignment problem. One broadly-accepted +justification for the private ownership of the means of production is +that private enterprise delivers broad social value aligned by price +signals and market focus. But these private enterprises optimize metrics +that are, at best, a proxy for social value. There's the possibility of +an alignment problem where companies pursuing and maximizing their +market capitalization can lead to net negative production of value. If +you spend time at the intersection of funding, leadership, and +technology, you will encounter it.

+

+

In the short term, you can push for longer-term thinking within your +organization to allow for better alignment between metrics and goals +and between goals and utility. You can also learn to recognize +user-hostile designs and advocate for user-centered design instead.

+

To wrap up this section on tech ethics:

+
    +
  1. +

    The tech industry should learn from other disciplines if it wants to +avoid a trust crisis.

    +
  2. +
  3. +

    We can start by educating ourselves about common deceptive or +user-hostile practices in our industry.

    +
  4. +
+

4 - ML Ethics

+

The ethical concerns raised about ML have gone beyond just the ethical +questions about other kinds of technology. We'll talk about common +ethical questions in ML and lessons learned from Medical ML.

+

Why Not Just Tech Ethics?

+

ML touches human lives more intimately than other technologies. Many ML +methods, especially deep neural networks, make human-legible data into +computer-legible data. Humans are more sensitive to errors and have more +opinions about visual and text data than they do about the type of data +manipulated by computers. As a result, there are more stakeholders with +more concerns that need to be traded off in ML applications.

+

Broadly speaking, ML involves being wrong pretty much all the time. Our +models are statistical and include "randomness." Randomness is almost +always an admission of ignorance. As we admit a certain degree of +ignorance in our models, our models will be wrong and misunderstand +situations that they are put into. It can be upsetting and even harmful +to be misunderstood by our models.

+

Against this backlash of greater interest or higher stakes, a number of +common types of ethical concerns have coalesced in the last couple of +years. There are somewhat established camps of answers to these +questions, so you should at least know where you stand on the four core +questions:

+
    +
  1. +

    Is the model "fair"?

    +
  2. +
  3. +

    Is the system accountable?

    +
  4. +
  5. +

    Who owns the data?

    +
  6. +
  7. +

    Should the system be built at all?

    +
  8. +
+

Common Ethical Questions in ML

+

Is The Model "Fair"?

+

The classic case on this comes from criminal justice with the COMPAS +system +for predicting whether a defendant will be arrested again before trial. +If they are arrested again, that suggests they committed a crime during +that time. This assesses a certain degree of risk for additional harm +while the justice system decides what to do about a previous arrest and +potential crime.

+

The operationalization here was a 10-point re-arrest probability based +on past data about this person, and they set a goal from the very +beginning to be less biased than human judges. They operationalize that +by calibrating these arrest probabilities across subgroups. Racial bias +is a primary concern in the US criminal justice system, so they took +care to make sure that these probabilities of re-arrest were calibrated +for all racial groups.

+

+

The system was deployed and used all around the US. It's proprietary and +difficult to analyze. But using the Freedom of Information Act and +coalescing together a bunch of records, people at ProPublica were able +to run their own analysis of this +algorithm. +They determined that the model was not more or less wrong for one racial +group or another. It tended to have more false positives for Black +defendants and more false negatives for White defendants. So despite the +creators of COMPAS taking into account bias from the beginning, they +still ended up with an algorithm with this undesirable property of being +more likely to falsely accuse Black defendants than White defendants.

+

It turned out that some quick algebra revealed that some form of +race-based bias is inevitable in this setting, as indicated in this +paper. There are a large +number of fairness definitions that are mutually incompatible. This +tutorial by Arvind +Narayanan +is an excellent one to display them.

+

It is noteworthy that the impact of "unfairness" is not fixed. The +story is often presented as "no matter what, the journalists would have +found something to complain about." But note that equalizing false +positive rates and positive predictive value across groups would lead to +a higher false negative rate for Black defendants relative to White +defendants. In the context of American politics, that's not going to +lead to complaints from the same people.

+

+

This is the story about the necessity of confronting the tradeoffs that +will inevitably come up. Researchers at Google made a nice little +tool +where you can think through and make these tradeoffs for yourself. It's +helpful for building intuition on these fairness metrics and what it +means to pick one over the other.

+

Events in this controversy kicked off a flurry of research on fairness. +The Fairness, Accountability, and Transparency +conference has been held for +several years. There has been a ton of work on both algorithmic-level +approaches on measuring and incorporating fairness metrics into +training and qualitative work on designing systems that are more +transparent and accountable.

+

In the case of COMPAS, re-arrest is not the same as recidivism. +Being rearrested requires that a police officer believes you committed a +crime. Police officers are subject to their own biases and patterns of +policing, which result in a far higher fraction of crimes being caught +for some groups than for others. Our real goal, in terms of fairness and +criminal justice, might be around reducing those kinds of unfair impacts +and using past rearrest data that have these issues.

+

Representation Matters for Model Fairness

+

+

Unfortunately, it is easy to make ML-powered tech that fails for +minoritized groups. For example, off-the-shelf computer vision tools +often fail on darker sins (as illustrated in this talk by Joy +Buolamwini). +This is not a new issue in technology, just a more salient one with ML.

+

There has been a good amount of progress on this in the last five years. +An example is Google's Model +Cards which show +how well a model will perform on human subgroups of interest. +HuggingFace has good integrations for creating these kinds of model +cards.

+

When you invite people for talks or hire people to join your +organizations, you should work to reduce the bias of that discovery +process by diversifying your network. Some good resources include +Black in AI, Diversify +Tech Job Board, +Women in Data Science, +and the You Belong in AI +podcast. You can make +professional connections via them to improve the representation of +minoritized groups in the engineering, design, and product management +process.

+

Is The System Accountable?

+

At a broader level than fairness, we should expect "accountability" from +ML systems. Some societies and states, including the EU, consider "the +right to an explanation" +in the face of important judgments to be a part of human rights.

+

In the GDPR act, there is a section that enshrines +accountability. +This isn't quite a totally new requirement; credit denials in the US +have been required to be explained since 1974. People have a right to +know what and why into making decisions for them!

+

If you want to impose this "accountability" on a deep neural network and +understand its selections, there are a number of methods that use the +input-output gradient to explain the model. You can see a list of +several methods in order of increasing performance below (from this +paper). These approaches +don't quite have strong theoretical underpinnings or a holistic +explanation, and are not that robust as a result. A lot of these methods +act primarily as edge detectors. The paper shows how even randomizing +layers in a model does not materially change the interpretability output +of GradCAM methods.

+

+

As a result, introspecting DNNs effectively requires reverse engineering +the system to really understand what is going on, largely thanks to +efforts like Distil and +Transfomer Circuits.

+

Due to these technical challenges, machine learning systems are prone to +unaccountability that impacts most those least able to understand and +influence their outputs. Books such as Automating +Inequality +describe the impacts of these systems. In such a context, you should +seek to question the purpose of model, involve those impacted by the +decisions (either through direct human inputs or through other means), +and ensure that equal attention is paid to benefits and harms of +automation.

+

Who Owns The Data?

+

Humans justifiably feel ownership of the data they create, which is +subsequently used to train machine learning models. Large datasets used +to train models like GPT-3 are created by mining this data without the +explicit involvement of those who create the data. Many people are not +aware that this is both possible and legal. As technology has changed, +what can be done with data has changed.

+

You can even verify if your data has been used to train models +on. Some of these images +are potentially obtained +illegally, +as a result of sensitive data being posted openly without the recorded +consent of the originator.

+

+

Each of these controversies around image generation and illegal data has +opened up a new frontier in data governance. Focus will be placed on +ensuring new ML systems are sensitive to personal and professional +concerns of those who generate the data ML systems are trained on. +Emad Mostaque, CEO +of Stability AI, has gone so far +as to offer future opt out systems from systems similar to Stable +Diffusion.

+

Here are some practical tips: Dataset +cards +can be helpful in providing documentation in a similar fashion to model +cards. There are also ethics lists, like the deon ethic +checklist that +helps design proper systems. Deon also has a helpful list of failure +cases.

+

Should This Be Built At All?

+

The undercurrent behind this conversation is the justifiable question of +whether some of these systems should be built at all, let alone in an +ethical way.

+

ML-powered weaponry is the canonical example here, which is already +in use. The definition of these systems are blurry, as both systems old +and new have had various autonomous capacities. This is difficult to get +a sense of due to the secrecy associated with weapon systems.

+

Some have argued that "autonomous weapons" have existed for hundreds of +years, but even this does not mean that they are ethical. Mines are good +examples of these systems. Movements like the Campaign Against Killer +Robots are +trying to prevent the cycle we entered with mines - where we invented +them, when we realized the incredible harm, and why we are trying to ban +them. Why invent these at all?

+

Let's wrap up this entire section with some closing questions that you +should always have a thoughtful answer to as you build a machine +learning system.

+
    +
  1. +

    Is the model "fair"? Fairness is possible, but requires +trade-offs.

    +
  2. +
  3. +

    Is the system accountable? Accountability is easier than +interpretability.

    +
  4. +
  5. +

    Who owns the data? Answer this upfront. Changes are on the way.

    +
  6. +
  7. +

    Should the system be built at all? Repeatedly ask this and use +it to narrow scope.

    +
  8. +
+

What Can We Learn from Medical ML

+

Note: The FSDL team would like to thank Dr. Amir Ashraf +Ganjouei +for his feedback on this section.

+

Interestingly, medicine can teach us a lot about how to apply machine +learning in a responsible way. Fundamentally, this has led to a mismatch +between how medicine works and how machine learning systems are built +today.

+

Let's start with a startling fact: the machine learning response to +COVID-19 was an abject failure. In contrast, the biomedical response +was a major triumph. For example, the vaccines were developed with +tremendous speed and precision.

+

+

Machine learning did not acquit itself well with the COVID-19 problem. +Two reviews (Roberts et al., +2021 +and Wynants et al., +2020-2022) +found that nearly all machine learning models were insufficiently +documented, had little to no external validation, and did not follow +model development best practices. A full 25% of the papers used a +dataset incorrect for the task, which simply highlighted the difference +between children and adults, not pneumonia and COVID.

+

Medicine has a strong culture of ethics that professionals are +integrated into from the point they start training. Medical +professionals take the Hippocratic oath of practicing two things: either +help or do not harm the patient. In contrast, the foremost belief +associated with software development tends to be the infamous "Move fast +and break things." While this approach works for harmless software like +web apps, it has serious implications for medicine and other more +critical sectors. Consider the example of a retinal implant that was +simply deprecated by developers and left hundreds without sight in +this Statnews +article.

+

+

Researchers are drawing inspiration from medicine to develop similar +standards for ML.

+
    +
  • +

    For example, clinical trial standards have been extended to ML. +These standards were developed through extensive surveys, +conferences, and consensus building (detailed in +these +papers).

    +
  • +
  • +

    Progress is being made in understanding how this problem presents. +A recent +study +found that while clinical activities are generally performed at a +high compliance level, statistical and data issues tend to suffer +low compliance.

    +
  • +
  • +

    New approaches are developing entire "auditing" +procedures +that exquisitely identify the activities required to effectively +develop models.

    +
  • +
+

Like medicine, machine learning is intimately intertwined with people's +lives. The most important question to ask is "Should this system be +built at all?". Always ask yourselves this and understand the +implications!

+

5 - AI Ethics

+

AI ethics are a frontier in both the technology and the ethics worlds. +False claims and hype are the most pressing concerns, but other risks +could present themselves soon.

+

AI Snake Oils

+

False claims outpace the performance of AI. This poses a serious +threat to adoption and satisfaction with AI systems long term.

+
    +
  • +

    For example, if you call something "AutoPilot", people might truly +assume it is fully autonomous, as happened in the below case of a +Tesla user. This goes back to our discussion about how AI systems +are more like funky dogs than truly human intelligent systems.

    +
  • +
  • +

    Another example of this is IBM's Watson +system, +which went from tackling the future of healthcare to being sold +off for parts.

    +
  • +
+

+

These false claims tend to be amplified in the media. But this isn't +confined to traditional media. Even Geoff Hinton, a godfather of modern +machine learning, has been a little too aggressive in his forecasts +for AI +performance!

+

You can call this "AI Snake Oil" as Arvind Narayanan does in his +Substack and +talk.

+

Let's separate out where true progress has been made versus where +progress is likely to be overstated. On some level, AI perception has +seen tremendous progress, AI judgment has seen moderate progress, and AI +prediction of social outcomes has seen not nearly as much progress.

+

+

Frontiers: AI Rights and X-Risk

+

There's obvious rationale that should artificial sentient beings exist, +tremendous ethical implications would be raised. Few people believe that +we are truly on the precipice of sentient beings, but there is +disagreement on how close we are.

+

+

There's a different set of concerns around how to regard self-improving +intelligent beings, for which there is already evidence. Large Language +Models have been show to be able to improve themselves in a range of +studies +(here and +here).

+

Failing to pursue this technology would lead to a huge opportunity +cost (as +argued by Nick Bostrom)! There truly is a great opportunity in having +such systems help us sold major problems and lead better lives. The key +though, is that such technology should be developed in the safest way +possible, not the fastest way.

+

The paperclip +problem +shows how the potential for misalignment between AI systems and humans +could dramatically reduce human utility and even compromise our +interests. Imagine a system designed to manufacture paperclips... could +actually develop the intelligence to alter elements of society to favor +paper clips?! This thought experiments illustrates how self-learning +systems could truly change our world for the worse in a misaligned way.

+

These ideas around existential risk are most associated with the +Effective Altruism community. +Check out resources like Giving What We +Can +and 80,000 Hours if you're +interested!

+

6 - What Is To Be Done?

+

This course can't end on a dour a note as existential risk. What can be +done to mitigate these consequences and participate in developing truly +ethical AI?

+
    +
  1. +

    The first step is to educate yourself on the topic. There are +many great books that give lengthy, useful treatment to this +topic. We recommend Automating +Inequality, +Weapons of Math +Destruction, +and The Alignment +Problem.

    +
  2. +
  3. +

    After reading this, consider how to prioritize your actions. +What do you want to impact? When do you want to do that? Place +them in this two-by-two to get a sense of where their importance +is.

    +
  4. +
+

+

Ethics cannot be purely negative. We do good, and we want to +prevent bad! Focus on the good you can do and be mindful of the harm +you can prevent.

+

Leading organizations like +DeepMind +and OpenAI are leading from +the front. Fundamentally, building ML well aligns with building ML for +good. All the leading organizations emphasize effective and +responsible best practices for building ML powered practices. Keep all +this in mind as you make the world a better place with your AI-powered +products!

+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/2022/lecture-9-ethics/media/image1.png b/course/2022/lecture-9-ethics/media/image1.png new file mode 100644 index 00000000..c6758511 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image1.png differ diff --git a/course/2022/lecture-9-ethics/media/image10.png b/course/2022/lecture-9-ethics/media/image10.png new file mode 100644 index 00000000..e9d0ff91 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image10.png differ diff --git a/course/2022/lecture-9-ethics/media/image11.png b/course/2022/lecture-9-ethics/media/image11.png new file mode 100644 index 00000000..7f0791b2 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image11.png differ diff --git a/course/2022/lecture-9-ethics/media/image12.png b/course/2022/lecture-9-ethics/media/image12.png new file mode 100644 index 00000000..e7dc4b1d Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image12.png differ diff --git a/course/2022/lecture-9-ethics/media/image13.png b/course/2022/lecture-9-ethics/media/image13.png new file mode 100644 index 00000000..638de850 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image13.png differ diff --git a/course/2022/lecture-9-ethics/media/image14.png b/course/2022/lecture-9-ethics/media/image14.png new file mode 100644 index 00000000..381e32bb Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image14.png differ diff --git a/course/2022/lecture-9-ethics/media/image15.png b/course/2022/lecture-9-ethics/media/image15.png new file mode 100644 index 00000000..9d75bcda Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image15.png differ diff --git a/course/2022/lecture-9-ethics/media/image16.png b/course/2022/lecture-9-ethics/media/image16.png new file mode 100644 index 00000000..c16957e7 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image16.png differ diff --git a/course/2022/lecture-9-ethics/media/image17.png b/course/2022/lecture-9-ethics/media/image17.png new file mode 100644 index 00000000..998bc5f4 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image17.png differ diff --git a/course/2022/lecture-9-ethics/media/image18.png b/course/2022/lecture-9-ethics/media/image18.png new file mode 100644 index 00000000..098673aa Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image18.png differ diff --git a/course/2022/lecture-9-ethics/media/image19.png b/course/2022/lecture-9-ethics/media/image19.png new file mode 100644 index 00000000..53264020 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image19.png differ diff --git a/course/2022/lecture-9-ethics/media/image2.png b/course/2022/lecture-9-ethics/media/image2.png new file mode 100644 index 00000000..10f3c55f Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image2.png differ diff --git a/course/2022/lecture-9-ethics/media/image20.png b/course/2022/lecture-9-ethics/media/image20.png new file mode 100644 index 00000000..08bfe3d9 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image20.png differ diff --git a/course/2022/lecture-9-ethics/media/image3.png b/course/2022/lecture-9-ethics/media/image3.png new file mode 100644 index 00000000..b3b78634 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image3.png differ diff --git a/course/2022/lecture-9-ethics/media/image4.png b/course/2022/lecture-9-ethics/media/image4.png new file mode 100644 index 00000000..76450bc5 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image4.png differ diff --git a/course/2022/lecture-9-ethics/media/image5.png b/course/2022/lecture-9-ethics/media/image5.png new file mode 100644 index 00000000..5e135886 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image5.png differ diff --git a/course/2022/lecture-9-ethics/media/image6.png b/course/2022/lecture-9-ethics/media/image6.png new file mode 100644 index 00000000..30ca46c4 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image6.png differ diff --git a/course/2022/lecture-9-ethics/media/image7.png b/course/2022/lecture-9-ethics/media/image7.png new file mode 100644 index 00000000..63f36709 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image7.png differ diff --git a/course/2022/lecture-9-ethics/media/image8.png b/course/2022/lecture-9-ethics/media/image8.png new file mode 100644 index 00000000..70d5e715 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image8.png differ diff --git a/course/2022/lecture-9-ethics/media/image9.png b/course/2022/lecture-9-ethics/media/image9.png new file mode 100644 index 00000000..e1f09829 Binary files /dev/null and b/course/2022/lecture-9-ethics/media/image9.png differ diff --git a/course/2022/project-showcase/index.html b/course/2022/project-showcase/index.html new file mode 100644 index 00000000..2b89a256 --- /dev/null +++ b/course/2022/project-showcase/index.html @@ -0,0 +1,2970 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Project Showcase - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Project Showcase

+

Students who registered for the synchronous version of the course +formed teams and worked on their own deep learning-powered products.

+

Whether you're looking for your next startup idea +or deciding how to improve your portfolio, +we hope these projects inspire you to build something real with DNNs!

+
+

Info

+

Many of these projects were made possible thanks to a generous donation of GPU-accelerated compute infrastructure by LambdaLabs. Check them out if you're looking for on-prem or cloud GPU machines!

+
+

If you're interested in working on full stack projects, +join us on Discord +and post/ask around about group project work.

+

Course Co-Pilot

+
+ +
+ +

An ML powered application for streamlining the process of creating chapter markers and lesson summaries for course content creators.

+

Team: Kurian Benoy, Wayde Gilliam, Suvash Thapaliya
+Live Demo. +Slide Deck.

+

Admirer

+
+ +
+ +

A full-stack ML-powered website that utilizes users’ webcam feeds to answer open-ended questions requiring outside knowledge.

+

Team: Andrew Hinh
+Live Demo.

+

Green-Screen Image Composition-Transfer

+
+ +
+ +

An ML-powered app for adding (optionally Stable Diffusion-generated) +virtual backgrounds to images that uses style transfer +to match lighting anad composition.

+

Team: Nitin Kishore Sai Samala
+Live Demo. +Poster.

+

Weak Supervision and Active Learning with Text Data

+
+ +
+ +

An approach to minimise human labelling for text classification tasks.

+

Team: Aleks Hiidenhovi, +Bernardo García, +Diego Quintana, +Edoardo Abati, +Juan Manuel, +Kushal Ramaiya
+GitHub Repo. +Slide Deck.

+

X-Ray Diagnosis AI Assistant

+
+ +
+ +

An interface to support medical practitioners in diagnosing and interpreting x-ray images.

+

Team: Arun Hegde, Samarth Keshari, Amulya Badal, Ross Cheung, Seyhan Karakulak
+GitHub Repo.

+

Mom's AI Food Logger

+
+ +
+ +

An app for my mom that automatically identifies and tracks the food she eats.

+

Team: Prince Javier
+Live Demo.

+

Archaeological Feature Detector

+
+ +
+ +

A prototype web app to help archaeologists interpret automatically detected objects as part of a machine-learning-powered survey workflow.

+

Team: Philanoe, +jmmoreu, +Kemp, +lakillo
+Slide Deck.

+

Semantic Search Engine for Images

+
+ +
+ +

A semantic text search engine over images, along with monitoring.

+

Team: Sandhya Govindaraju, Utkarsh Vardhan, Gabriella Chiribau, Amit Kumar Sah
+Live Demo. +Slide Deck.

+

Recipewise

+
+ +
+ +

An image to recipe food classifier.

+

Team: Carlo David, Chavo Kim, George Loh, Nari Jeong, and Rina Buoy
+Live Demo. +Slide Deck.

+

ImageIN

+
+ +
+ +

A pragmatic approach to identifying illustrated pages in digitised historic books.

+

Team: Frank Smitskamp, Zacharie Bouhnik, Daniel van Strien
+Live Demo. +GitHub Repo.

+

Full Stack Stable Diffusion

+
+ +
+ +

A deployment of Stable Diffusion Text-to-Image and Image-to-Image pipelines with a full stack architecture.

+

Team: Okan Ulusoy and Omid Abdollahi Aghdam
+GitHub Repo.

+

Multimodal Fusion Models for Healthcare

+
+ +
+ +

An architecture for using multiple modalities of healthcare data to train deep learning models.

+

Team: Vinod Nair, Khoa Nguyen, Carlos Leyson, Kieran Didi, Sridhar Iyer, Alan Chang
+GitHub Repo. +Slide Deck.

+

FiberDiameter.com

+
+ +
+ +

Measure the diameter of nanofibers in microscopy images.

+

Team: @f_cossio, +@yael_su, +@__coszio, +@aledelunap
+Live Demo. +Slide Deck.

+

GeoLocator

+
+ +
+ +

An app that guesses the location of an image, video, or video url.

+

Team: David Hrachovy, Samhita Alla, +Yiyi Xu, +Winson Truong
+Live Demo. +GitHub Repo.

+

👻 Image Anonymiser

+
+ +
+ +

An ML-powered image anonymisation web app.

+

Team: Sami Saadaoui, Vladislav Vancak, Lawrence Francis, Dan Howarth, Allan Stevenson
+GitHub Repo. +Project Page.

+

BuggingSpace

+
+ +
+ +

An interface for red-teaming open source text generation models from the Hugging Face hub.

+

Team: Sashank Pisupati, Sajenthan Vigneswaran, Kemp Bray, Jean-Antoine Zagato
+Live Demo. +Slide Deck.

+

Board Game Rules Explainer

+
+ +
+ +

A board game question-answering system to save players from having to check the rulebook.

+

Team: Rafal Wojcik, Tim Jones, Issam Hammi, Muriel Hol
+Live Demo. +GitHub Repo. +Slide Deck.

+

Gesto AI - ASL Word Recognizer

+
+ +
+ +

A real-time, word-level American Sign Language translation app.

+

Team: Shivam Arora, Daniel Firebanks-Quevedo, Pablo Oberhauser, Dhruv Shah, Ibrahim Sherif, Samuel Tang
+Live Demo. +Slide Deck.

+

choosistant

+
+ +
+ +

Choosistant helps you decide which product to buy by summarizing pros and cons from written reviews.

+

Team: Kimotho, Murad Khalilov, Nam, Omar Ali Sheikh, Sofiane Chami
+Project Page.

+

Semantic Search & Sentiment Analysis

+
+ +
+ +

Upload a PDF or text document and enable semantic QA and sentiment analysis.

+

Team: Sam Tonetto, Francisco Perez-Sorrosal, Navaneeth Tirupathi, Alexander Chia, Priyam Sadhukhan
+Project Repo. +Slide Deck.

+

In-Browser AI

+
+ +
+ +

Run modern neural networks directly in your browser from a computer or phone.

+

Team: Alexander Visheratin
+Live Demo. +Slide Deck.

+

Audio VTuber

+
+ +
+ +

Animate a cartoon with facial expressions using only your voice.

+

Team: Alex Service, Moon Ma
+Live Demo. +Slide Deck.

+

OCR SemSearch

+
+ +
+ +

OCR SemSearch allows you to perform semantic search on text within images from different types of documents.

+

Team: Sebastian Gonzalez Aseretto, Paramtap Mewada
+Project Poster.

+

Live Art in Context

+
+ +
+ +

Draw on the creative power of modern ML models to create art responsive to events in text or video streams.

+

Team: David Murphy, Angel Carvajal, Theresa Thoraldson, Chris Lonsberry
+Slide Deck.

+

Plant ID

+
+ +
+ +

A plant species identifier available as a web app and as a cross-platform mobile app.

+

Team: Ben Michel, Navid Matin Moghaddam, Stone Jiang, Shishir Valalla, Vibudh Singh
+Live Demo. +Slide Deck.

+

Landcover Segmentation

+
+ +
+ +

A data product for multi-class semantic segmentation of earth observation images using a UNet architecture. +
+Team: Suzana, +Roland Ritt, +Sheebo

+

Slide Deck.

+

Meeting Graph Generator

+
+ +
+ +

Team: Soroush +Bassam
+Slide Deck. +GitHub Repo.

+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/course/index.html b/course/index.html new file mode 100644 index 00000000..f5118b7f --- /dev/null +++ b/course/index.html @@ -0,0 +1,2233 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Deep Learning Courses - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + + + +
+
+ + + + + + + + +

Full Stack Deep Learning Courses

+

The Full Stack Deep Learning course started in 2018, as a three-day bootcamp hosted on Berkeley campus. +Since then, we've hosted several in-person bootcamps, online courses, and official university courses.

+
+

Looking for the most recent FSDL materials?

+

You can find them here.

+
+

Testimonials

+ + +

Past Iterations

+
    +
  • FSDL 2022 (Online): A fully online course, taught via YouTube, Crowdcast, and Discord.
  • +
  • FSDL 2021 (Online): Contemporaneous with the Berkeley course, we taught an online cohort course.
  • +
  • FSDL 2021 (Berkeley): Taught as a UC Berkeley undergrad course CS194-080 in Spring 2021
  • +
  • FSDL 2020 (UW): Taught as University of Washington Professional Master's Program course CSEP 590C in Spring 2020
  • +
  • FSDL 2019 (Online): Materials from the November 2019 bootcamp held on Berkeley campus organized in a nice online format.
  • +
  • FSDL 2019 (Bootcamp): Raw materials from the March 2019 bootcamp, held on Berkeley campus.
  • +
  • FSDL 2018 (Bootcamp): Our first bootcamp, held on Berkeley campus in August 2018
  • +
+ + + + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/images/1f95e.png b/images/1f95e.png new file mode 100644 index 00000000..a775b29c Binary files /dev/null and b/images/1f95e.png differ diff --git a/images/charles.png b/images/charles.png new file mode 100644 index 00000000..40aabecc Binary files /dev/null and b/images/charles.png differ diff --git a/images/favicon.png b/images/favicon.png new file mode 100644 index 00000000..6bcded72 Binary files /dev/null and b/images/favicon.png differ diff --git a/images/fsdl-2023-overview.png b/images/fsdl-2023-overview.png new file mode 100644 index 00000000..70b4cb1c Binary files /dev/null and b/images/fsdl-2023-overview.png differ diff --git a/images/full_stack_description.jpeg b/images/full_stack_description.jpeg new file mode 100644 index 00000000..5208e589 Binary files /dev/null and b/images/full_stack_description.jpeg differ diff --git a/images/full_stack_description.png b/images/full_stack_description.png new file mode 100644 index 00000000..90cd04e0 Binary files /dev/null and b/images/full_stack_description.png differ diff --git a/images/group-august2018.jpg b/images/group-august2018.jpg new file mode 100644 index 00000000..47b4e25c Binary files /dev/null and b/images/group-august2018.jpg differ diff --git a/images/group-march2019.jpg b/images/group-march2019.jpg new file mode 100644 index 00000000..74bc1110 Binary files /dev/null and b/images/group-march2019.jpg differ diff --git a/images/group-november2019.jpg b/images/group-november2019.jpg new file mode 100644 index 00000000..decfe5ea Binary files /dev/null and b/images/group-november2019.jpg differ diff --git a/images/josh.png b/images/josh.png new file mode 100644 index 00000000..bd23af11 Binary files /dev/null and b/images/josh.png differ diff --git a/images/llm-bootcamp-group-photo.jpg b/images/llm-bootcamp-group-photo.jpg new file mode 100644 index 00000000..35144fb2 Binary files /dev/null and b/images/llm-bootcamp-group-photo.jpg differ diff --git a/images/llmbc-2023-audience.jpg b/images/llmbc-2023-audience.jpg new file mode 100644 index 00000000..40603654 Binary files /dev/null and b/images/llmbc-2023-audience.jpg differ diff --git a/images/logo-anyscale.png b/images/logo-anyscale.png new file mode 100644 index 00000000..2fabd570 Binary files /dev/null and b/images/logo-anyscale.png differ diff --git a/images/logo-aws.png b/images/logo-aws.png new file mode 100644 index 00000000..548ddb6f Binary files /dev/null and b/images/logo-aws.png differ diff --git a/images/logo-banana.jpg b/images/logo-banana.jpg new file mode 100644 index 00000000..82841b9a Binary files /dev/null and b/images/logo-banana.jpg differ diff --git a/images/logo-bloomberg.png b/images/logo-bloomberg.png new file mode 100644 index 00000000..4eca65e8 Binary files /dev/null and b/images/logo-bloomberg.png differ diff --git a/images/logo-distyl.png b/images/logo-distyl.png new file mode 100644 index 00000000..07d1bac5 Binary files /dev/null and b/images/logo-distyl.png differ diff --git a/images/logo-gantry.jpg b/images/logo-gantry.jpg new file mode 100644 index 00000000..bee3aa9a Binary files /dev/null and b/images/logo-gantry.jpg differ diff --git a/images/logo-lambdalabs.png b/images/logo-lambdalabs.png new file mode 100644 index 00000000..ab32a3f2 Binary files /dev/null and b/images/logo-lambdalabs.png differ diff --git a/images/logo-modal.jpg b/images/logo-modal.jpg new file mode 100644 index 00000000..775c7158 Binary files /dev/null and b/images/logo-modal.jpg differ diff --git a/images/logo-openai.png b/images/logo-openai.png new file mode 100644 index 00000000..05ad62d8 Binary files /dev/null and b/images/logo-openai.png differ diff --git a/images/logo-pinecone.png b/images/logo-pinecone.png new file mode 100644 index 00000000..2c7c2896 Binary files /dev/null and b/images/logo-pinecone.png differ diff --git a/images/logo-replicate.jpg b/images/logo-replicate.jpg new file mode 100644 index 00000000..f37ccf91 Binary files /dev/null and b/images/logo-replicate.jpg differ diff --git a/images/logo-replit.png b/images/logo-replit.png new file mode 100644 index 00000000..a4f66896 Binary files /dev/null and b/images/logo-replit.png differ diff --git a/images/logo-twelvelabs.png b/images/logo-twelvelabs.png new file mode 100644 index 00000000..94cdfd26 Binary files /dev/null and b/images/logo-twelvelabs.png differ diff --git a/images/logo-volition.png b/images/logo-volition.png new file mode 100644 index 00000000..9c4709ed Binary files /dev/null and b/images/logo-volition.png differ diff --git a/images/logo-wandb.jpg b/images/logo-wandb.jpg new file mode 100644 index 00000000..ecbdd180 Binary files /dev/null and b/images/logo-wandb.jpg differ diff --git a/images/logo-you.com.png b/images/logo-you.com.png new file mode 100644 index 00000000..e884c5ba Binary files /dev/null and b/images/logo-you.com.png differ diff --git a/images/logo.key b/images/logo.key new file mode 100644 index 00000000..12b9de35 Binary files /dev/null and b/images/logo.key differ diff --git a/images/logo.png b/images/logo.png new file mode 100644 index 00000000..7fa4c09f Binary files /dev/null and b/images/logo.png differ diff --git a/images/opengraph.key b/images/opengraph.key new file mode 100644 index 00000000..7d83d773 Binary files /dev/null and b/images/opengraph.key differ diff --git a/images/opengraph.png b/images/opengraph.png new file mode 100644 index 00000000..17ea11bd Binary files /dev/null and b/images/opengraph.png differ diff --git a/images/pancakes.png b/images/pancakes.png new file mode 100644 index 00000000..bfbeffc6 Binary files /dev/null and b/images/pancakes.png differ diff --git a/images/pancakes.svg b/images/pancakes.svg new file mode 100644 index 00000000..3037dec4 --- /dev/null +++ b/images/pancakes.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/images/pieter.jpeg b/images/pieter.jpeg new file mode 100644 index 00000000..fb4257b0 Binary files /dev/null and b/images/pieter.jpeg differ diff --git a/images/positioning.png b/images/positioning.png new file mode 100644 index 00000000..dcb9617b Binary files /dev/null and b/images/positioning.png differ diff --git a/images/robots-eating-pancakes/0000.jpg b/images/robots-eating-pancakes/0000.jpg new file mode 100644 index 00000000..ed722e4e Binary files /dev/null and b/images/robots-eating-pancakes/0000.jpg differ diff --git a/images/robots-eating-pancakes/0001.jpg b/images/robots-eating-pancakes/0001.jpg new file mode 100644 index 00000000..995e09b7 Binary files /dev/null and b/images/robots-eating-pancakes/0001.jpg differ diff --git a/images/robots-eating-pancakes/0002.jpg b/images/robots-eating-pancakes/0002.jpg new file mode 100644 index 00000000..d90428c3 Binary files /dev/null and b/images/robots-eating-pancakes/0002.jpg differ diff --git a/images/robots-eating-pancakes/0003.jpg b/images/robots-eating-pancakes/0003.jpg new file mode 100644 index 00000000..ba1ce171 Binary files /dev/null and b/images/robots-eating-pancakes/0003.jpg differ diff --git a/images/robots-eating-pancakes/0004.jpg b/images/robots-eating-pancakes/0004.jpg new file mode 100644 index 00000000..653a824f Binary files /dev/null and b/images/robots-eating-pancakes/0004.jpg differ diff --git a/images/robots-eating-pancakes/0005.jpg b/images/robots-eating-pancakes/0005.jpg new file mode 100644 index 00000000..f4a54ed9 Binary files /dev/null and b/images/robots-eating-pancakes/0005.jpg differ diff --git a/images/robots-eating-pancakes/0006.jpg b/images/robots-eating-pancakes/0006.jpg new file mode 100644 index 00000000..6f4c8a03 Binary files /dev/null and b/images/robots-eating-pancakes/0006.jpg differ diff --git a/images/robots-eating-pancakes/0007.jpg b/images/robots-eating-pancakes/0007.jpg new file mode 100644 index 00000000..0dc65a64 Binary files /dev/null and b/images/robots-eating-pancakes/0007.jpg differ diff --git a/images/robots-eating-pancakes/0008.jpg b/images/robots-eating-pancakes/0008.jpg new file mode 100644 index 00000000..63246963 Binary files /dev/null and b/images/robots-eating-pancakes/0008.jpg differ diff --git a/images/robots-eating-pancakes/0009.jpg b/images/robots-eating-pancakes/0009.jpg new file mode 100644 index 00000000..1fba55b6 Binary files /dev/null and b/images/robots-eating-pancakes/0009.jpg differ diff --git a/images/robots-eating-pancakes/0010.jpg b/images/robots-eating-pancakes/0010.jpg new file mode 100644 index 00000000..15e20049 Binary files /dev/null and b/images/robots-eating-pancakes/0010.jpg differ diff --git a/images/robots-eating-pancakes/0011.jpg b/images/robots-eating-pancakes/0011.jpg new file mode 100644 index 00000000..3f070bf7 Binary files /dev/null and b/images/robots-eating-pancakes/0011.jpg differ diff --git a/images/robots-eating-pancakes/0012.jpg b/images/robots-eating-pancakes/0012.jpg new file mode 100644 index 00000000..b258ae2f Binary files /dev/null and b/images/robots-eating-pancakes/0012.jpg differ diff --git a/images/robots-eating-pancakes/0013.jpg b/images/robots-eating-pancakes/0013.jpg new file mode 100644 index 00000000..604ad3c1 Binary files /dev/null and b/images/robots-eating-pancakes/0013.jpg differ diff --git a/images/robots-eating-pancakes/0014.jpg b/images/robots-eating-pancakes/0014.jpg new file mode 100644 index 00000000..724819ae Binary files /dev/null and b/images/robots-eating-pancakes/0014.jpg differ diff --git a/images/robots-eating-pancakes/0015.jpg b/images/robots-eating-pancakes/0015.jpg new file mode 100644 index 00000000..ae648196 Binary files /dev/null and b/images/robots-eating-pancakes/0015.jpg differ diff --git a/images/robots-eating-pancakes/0016.jpg b/images/robots-eating-pancakes/0016.jpg new file mode 100644 index 00000000..8d3e5d0d Binary files /dev/null and b/images/robots-eating-pancakes/0016.jpg differ diff --git a/images/robots-eating-pancakes/0017.jpg b/images/robots-eating-pancakes/0017.jpg new file mode 100644 index 00000000..9277b6a8 Binary files /dev/null and b/images/robots-eating-pancakes/0017.jpg differ diff --git a/images/robots-eating-pancakes/0018.jpg b/images/robots-eating-pancakes/0018.jpg new file mode 100644 index 00000000..bc0e4239 Binary files /dev/null and b/images/robots-eating-pancakes/0018.jpg differ diff --git a/images/robots-eating-pancakes/0019.jpg b/images/robots-eating-pancakes/0019.jpg new file mode 100644 index 00000000..3c344560 Binary files /dev/null and b/images/robots-eating-pancakes/0019.jpg differ diff --git a/images/robots-eating-pancakes/0020.jpg b/images/robots-eating-pancakes/0020.jpg new file mode 100644 index 00000000..ac62ea8e Binary files /dev/null and b/images/robots-eating-pancakes/0020.jpg differ diff --git a/images/robots-eating-pancakes/0021.jpg b/images/robots-eating-pancakes/0021.jpg new file mode 100644 index 00000000..472ba826 Binary files /dev/null and b/images/robots-eating-pancakes/0021.jpg differ diff --git a/images/robots-eating-pancakes/0022.jpg b/images/robots-eating-pancakes/0022.jpg new file mode 100644 index 00000000..23aa346f Binary files /dev/null and b/images/robots-eating-pancakes/0022.jpg differ diff --git a/images/robots-eating-pancakes/0023.jpg b/images/robots-eating-pancakes/0023.jpg new file mode 100644 index 00000000..5963b77c Binary files /dev/null and b/images/robots-eating-pancakes/0023.jpg differ diff --git a/images/robots-eating-pancakes/0024.jpg b/images/robots-eating-pancakes/0024.jpg new file mode 100644 index 00000000..40ee68de Binary files /dev/null and b/images/robots-eating-pancakes/0024.jpg differ diff --git a/images/robots-eating-pancakes/0025.jpg b/images/robots-eating-pancakes/0025.jpg new file mode 100644 index 00000000..9bd3fcba Binary files /dev/null and b/images/robots-eating-pancakes/0025.jpg differ diff --git a/images/robots-eating-pancakes/0026.jpg b/images/robots-eating-pancakes/0026.jpg new file mode 100644 index 00000000..58ec208f Binary files /dev/null and b/images/robots-eating-pancakes/0026.jpg differ diff --git a/images/robots-eating-pancakes/0027.jpg b/images/robots-eating-pancakes/0027.jpg new file mode 100644 index 00000000..d8cac71a Binary files /dev/null and b/images/robots-eating-pancakes/0027.jpg differ diff --git a/images/robots-eating-pancakes/0028.jpg b/images/robots-eating-pancakes/0028.jpg new file mode 100644 index 00000000..cfc763a2 Binary files /dev/null and b/images/robots-eating-pancakes/0028.jpg differ diff --git a/images/robots-eating-pancakes/0029.jpg b/images/robots-eating-pancakes/0029.jpg new file mode 100644 index 00000000..f013d9cc Binary files /dev/null and b/images/robots-eating-pancakes/0029.jpg differ diff --git a/images/robots-eating-pancakes/0030.jpg b/images/robots-eating-pancakes/0030.jpg new file mode 100644 index 00000000..20ae359a Binary files /dev/null and b/images/robots-eating-pancakes/0030.jpg differ diff --git a/images/sergey.png b/images/sergey.png new file mode 100644 index 00000000..28ee7068 Binary files /dev/null and b/images/sergey.png differ diff --git a/images/speaker-harrison-chase.jpg b/images/speaker-harrison-chase.jpg new file mode 100644 index 00000000..9a2ab961 Binary files /dev/null and b/images/speaker-harrison-chase.jpg differ diff --git a/images/speaker-peter-welinder.jpg b/images/speaker-peter-welinder.jpg new file mode 100644 index 00000000..3e2224f3 Binary files /dev/null and b/images/speaker-peter-welinder.jpg differ diff --git a/images/speaker-reza-shabani.jpg b/images/speaker-reza-shabani.jpg new file mode 100644 index 00000000..aa8c1a80 Binary files /dev/null and b/images/speaker-reza-shabani.jpg differ diff --git a/images/speaker-richard-socher.jpg b/images/speaker-richard-socher.jpg new file mode 100644 index 00000000..c41e1096 Binary files /dev/null and b/images/speaker-richard-socher.jpg differ diff --git a/images/testimonial-00.jpg b/images/testimonial-00.jpg new file mode 100644 index 00000000..77b90e19 Binary files /dev/null and b/images/testimonial-00.jpg differ diff --git a/images/testimonial-01.jpg b/images/testimonial-01.jpg new file mode 100644 index 00000000..5cbaf209 Binary files /dev/null and b/images/testimonial-01.jpg differ diff --git a/images/testimonial-02.jpg b/images/testimonial-02.jpg new file mode 100644 index 00000000..7379b499 Binary files /dev/null and b/images/testimonial-02.jpg differ diff --git a/images/testimonial-03.jpg b/images/testimonial-03.jpg new file mode 100644 index 00000000..c534774e Binary files /dev/null and b/images/testimonial-03.jpg differ diff --git a/images/testimonial-llmbc-00.jpg b/images/testimonial-llmbc-00.jpg new file mode 100644 index 00000000..72362e29 Binary files /dev/null and b/images/testimonial-llmbc-00.jpg differ diff --git a/images/testimonial-llmbc-01.jpg b/images/testimonial-llmbc-01.jpg new file mode 100644 index 00000000..2ff0c050 Binary files /dev/null and b/images/testimonial-llmbc-01.jpg differ diff --git a/images/unknown.png b/images/unknown.png new file mode 100644 index 00000000..7640e017 Binary files /dev/null and b/images/unknown.png differ diff --git a/index.html b/index.html new file mode 100644 index 00000000..01c954ad --- /dev/null +++ b/index.html @@ -0,0 +1,2243 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + + + + + + + + + + + + + +
+
+ + + + + + + + + + +
+
+ +
+

+

News, community, and courses for people building AI-powered products.

+


+ + Sign up for our latest course! + +
+
+
+ +
+
+

Building an AI-powered product is much more than just training a model or writing a prompt.

+
+

The Full Stack brings people together to learn and share best practices across the entire lifecycle of an AI-powered product: + from defining the problem and picking a GPU or foundation model to production deployment and continual learning + to user experience design. +

+
+ +
+ +
+
+ +
+

Get up to speed on the latest in AI-powered apps with the new Large Language Models Bootcamp.

+
+

+ Learn best practices and tools for building applications powered by LLMs.

Cover the full stack from prompt engineering and LLMops to user experience design. +

+
+
+
+ +
+ +
+

Build an AI-powered application from the ground up in our Deep Learning Course.

+

+ You've trained your first (or 100th) model, and you're ready to take your skills to the next level. +

+

+ Join thousands from UC Berkeley, + University of Washington, and all over the world + and learn best practices for building AI-powered products from scratch with deep neural networks. +

+
+
+ + + + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/javascripts/email_modal.js b/javascripts/email_modal.js new file mode 100644 index 00000000..8633aa21 --- /dev/null +++ b/javascripts/email_modal.js @@ -0,0 +1,85 @@ +/* +Show a modal asking for an email address to see the content, unless the user has already: +- entered an email address (1-year cookie) +- clicked "I'll enter my email later" (12-hour cookie) +- came from a mailing list link, with query param ?code=nBFw6 (1-day cookie) +*/ +document.addEventListener("DOMContentLoaded", function () { + const EMAIL_ADDRESS_COOKIE = "emailAddress"; + const ALREADY_SUBSCRIBER_COOKIE = "laterIntent"; + const MAILING_LIST_COOKIE = "mailingList"; + + function showModal() { + document.getElementById("emailModal").style.display = "block"; + + // Clicking the "I'll enter my email later" link sets a 12-hour cookie until the modal shows again. + document.getElementById("emailModalCloseLink").addEventListener("click", function () { + document.getElementById("emailModal").style.display = "none"; + + var numTimes = parseInt(Cookies.get(ALREADY_SUBSCRIBER_COOKIE) || 0, 10); + if (numTimes > 0) { + Cookies.set(ALREADY_SUBSCRIBER_COOKIE, numTimes + 1, { expires: 365 }); + } else { + Cookies.set(ALREADY_SUBSCRIBER_COOKIE, numTimes + 1, { expires: 1 }); + } + }); + + // Submitting the form sets a 1-year cookie with the email address, and then submits the form to FormSpark. + document.getElementById("emailForm").addEventListener("submit", function (event) { + event.preventDefault(); + document.getElementById("emailModal").style.display = "none"; + + var email = document.getElementById("emailInput").value; + Cookies.set(EMAIL_ADDRESS_COOKIE, email, { expires: 365 }); + + // var turnstileResponseElement = document.getElementsByName('cf-turnstile-resonse')[0]; + // var turnstileResponse = (turnstileResponseElement !== undefined) ? turnstileResponseElement.value : null; + + fetch("https://submit-form.com/ozySmLmG", { + method: "POST", + headers: { + "Content-Type": "application/json", + "Accept": "application/json", + }, + body: JSON.stringify({ + "email": email, + // "cf-turnstile-response": turnstileResponse, + }), + }).then(function (response) { + // console.log(response); + }).catch(function (error) { + console.error(error); + }); + + // Send Google Analytics GA4 event for email address submission in the Spring 2023 LLM Bootcamp modal. + if (typeof gtag !== "undefined") { + gtag('event', 'email_address_submission', { + 'event_category': 'engagement', + 'event_label': 'Spring 2023 LLM Bootcamp', + }); + } + }); + } + + let url = new URL(window.location.href); + + // Check query params for ?code=nBFw6, which is the code for the mailing list, and if present: + // - set a 1-day cookie + // - remove the param from the URL + let params = new URLSearchParams(url.search); + let code = params.get('code'); + if (code === "nBFw6") { + Cookies.set(MAILING_LIST_COOKIE, "true", { expires: 1 }); + params.delete('code'); + url.search = params.toString(); + history.replaceState(null, null, url.toString()); + } + + if ( + url.pathname.startsWith("/llm-bootcamp/spring-2023") && + !(Cookies.get(EMAIL_ADDRESS_COOKIE) || Cookies.get(ALREADY_SUBSCRIBER_COOKIE) || Cookies.get(MAILING_LIST_COOKIE)) + ) { + showModal(); + } + } +); \ No newline at end of file diff --git a/llm-bootcamp/expense/index.html b/llm-bootcamp/expense/index.html new file mode 100644 index 00000000..fcc0ab8b --- /dev/null +++ b/llm-bootcamp/expense/index.html @@ -0,0 +1,2235 @@ + + + + + + + + + + + + + + + + + + + + + + + + + How do I expense the bootcamp? - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + + + + + + + + + + + + + +
+
+ + + + + + + + +

How do I expense the bootcamp?

+

Submitting a receipt

+

Upon registration, you'll receive an email from our event provider with your ticket +and your order details, including the last 4 digits of the credit card you used, if you used one.

+

This is often sufficient documentation.

+

Verification of attendance

+

If you need your attendance verified, +please email us at admin@fullstackdeeplearning.com +after the event is finished and indicate the address to which we should send a verification email.

+

Requesting approval

+

Some organizations require prior approval for education expenses from a manager.

+

In that case, you can work off of the email template below. +We suggest you customize it to your team and its needs.

+

Keep in mind that you want to make sure that it's clear to your manager +why sending you to this bootcamp is in the company's interest, +not just yours!

+
+

An email template, co-written with ChatGPT

+

Dear {manager},

+

I wanted to bring to your attention a bootcamp that I would love to enroll in. Full Stack Deep Learning is hosting a two-day program that covers the emerging field of application development with Large Language Models (LLMs). FSDL is a respected name that has delivered educational material on productionizing machine learning in formats from large MOOCs to courses at top universities like UC Berkeley and UW.

+

This bootcamp will provide me with the knowledge and skills necessary to build and deploy LLM applications and stay up-to-date with the state-of-the-art in the industry.

+

Some of the benefits:

+
    +
  • Coverage of both conceptual fundamentals and concrete engineering practices
  • +
  • Talks from leaders in the field, like Harrison Chase of LangChain
  • +
  • Opportunities to network with builders
  • +
+

I expect to be able to bring back my learnings and apply them directly to my work and share them with the team.

+

The cost of the bootcamp is {price}. I strongly believe that this investment in my education will benefit the company in the long run.

+

If you are interested, you can find more details about the bootcamp, including the instructors and their bios, at the following link: https://fsdl.me/2023-llmbc-landing.

+

Thank you for considering my request.

+

Best,

+

{name}

+
+ + + + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/index.html b/llm-bootcamp/index.html new file mode 100644 index 00000000..a5c5bc8e --- /dev/null +++ b/llm-bootcamp/index.html @@ -0,0 +1,2345 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + LLM Bootcamp - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + + + + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

Full Stack LLM Bootcamp

+
+

🚀 Full Stack LLM Bootcamp 🚀

+
+
    +
  • Learn best practices and tools for building LLM-powered apps
  • +
  • Cover the full stack from prompt engineering to user-centered design +
  • Get up to speed on the state-of-the-art
  • +
+
+ Venn diagram showing that FSDL is at the intersection of a course, a hackathon, and a conference. +
+
+ +
+ +

Testimonials

+ + +

Why

+

The way AI-powered apps are built has changed:

+
    +
  • Before LLMs, an idea would bottleneck on training models from scratch, and then it'd bottleneck again on scalable deployment.
  • +
  • Now, a compelling MVP based on pretrained LLM models and APIs can be configured and serving users in an hour.
  • +
+

An entirely new ecosystem of techniques, tools, and tool vendors is forming around LLMs. +Even ML veterans are scrambling to orient themselves to what is now possible and figure out the most productive techniques and tools.

+

What

+

We put together a two-day program based on emerging best practices and the latest research results to help you make the transition to building LLM apps with confidence.

+

We ran that program as an in-person bootcamp in San Francisco in April 2023. +Now, we're releasing the recorded lectures, for free!

+ +
+

What do I need to know already?

+

The lectures aim to get anyone with experience programming in Python ready to start building applications that use LLMs.

+

Experience with at least one of machine learning, frontend, or backend will be very helpful.

+
+

Who

+
+ Image of the instructors and attendees at the 2023 FSDL LLM Bootcamp +
+ +

We are Full Stack Deep Learning. +We're a team of UC Berkeley PhD alumni with years of industry experience who are passionate about teaching people how to make deep neural networks work in the real world.

+

Since 2018, we have taught in-person bootcamps, online multi-week cohorts, and official semester-long courses at top universities.

+
+ Group photo of the attendees of FSDL March 2019 bootcamp + Group photo of the attendees of FSDL August 2018 bootcamp + Group photo of the attendees of FSDL November 2019 bootcamp +
+ +

Instructor Team

+
+
+ + Photo of Charles Frye + +
Charles Frye + educates people in AI. He has worked on AI/ML tooling with Weights & Biases and Gantry since getting a PhD in Theoretical Neuroscience at UC Berkeley.
+
+
+ + Photo of Sergey Karayev + +
+ Sergey Karayev + builds AI-powered products as Co-founder of Volition. He co-founded Gradescope after getting a PhD in AI at UC Berkeley. +
+
+
+ + Photo of Josh Tobin + +
+ + Josh Tobin + + builds tooling for AI products as Co-founder and CEO of Gantry. He worked as a Research Scientist at OpenAI and received a PhD in AI at UC Berkeley. +
+
+
+ +

+

If you have any questions about the bootcamp materials, contact +admin @ fullstackdeeplearning.com.

+ + + + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/opengraph.key b/llm-bootcamp/opengraph.key new file mode 100644 index 00000000..20664203 Binary files /dev/null and b/llm-bootcamp/opengraph.key differ diff --git a/llm-bootcamp/opengraph.png b/llm-bootcamp/opengraph.png new file mode 100644 index 00000000..9e96ef9d Binary files /dev/null and b/llm-bootcamp/opengraph.png differ diff --git a/llm-bootcamp/sponsors/index.html b/llm-bootcamp/sponsors/index.html new file mode 100644 index 00000000..1caf9aea --- /dev/null +++ b/llm-bootcamp/sponsors/index.html @@ -0,0 +1,2182 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Info for Sponsors - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + + + + + + + + + + + + + +
+
+ + + + + + + + +

Info for Sponsors

+

We offer three tiers of sponsorship for the FSDL 2023 LLM Bootcamp:

+
    +
  1. +

    Vector Tier sponsors receive the following benefits: logo displayed on website and during conference, verbal acknowledgement. Vector Tier sponsorships are available for $1500.

    +
  2. +
  3. +

    Matrix Tier sponsors receive all the benefits of the Vector Tier, plus: logo displayed in between talks, 6' table with two seats, and a conference registration. Matrix Tier sponsorships are available for $4500.

    +
  4. +
  5. +

    Tensor Tier sponsors receive all the benefits of the Matrix Tier, plus: logo displayed on a banner in the registration area, access to an opt-in database of attendees with information about job and job-seeking status, and two additional registrations (for a total of three). Tensor Tier sponsorships are available for $10000.

    +
  6. +
+

Contact +sponsorships@fullstackdeeplearning.com +if you're interested in sponsoring the conference!

+ + + + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_0.jpg b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_0.jpg new file mode 100644 index 00000000..cb3efaad Binary files /dev/null and b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_0.jpg differ diff --git a/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_1.jpg b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_1.jpg new file mode 100644 index 00000000..cfc6cdd5 Binary files /dev/null and b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_1.jpg differ diff --git a/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_2.jpg b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_2.jpg new file mode 100644 index 00000000..179ca0dc Binary files /dev/null and b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_2.jpg differ diff --git a/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_3.jpg b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_3.jpg new file mode 100644 index 00000000..aba0bdb6 Binary files /dev/null and b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_3.jpg differ diff --git a/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_4.jpg b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_4.jpg new file mode 100644 index 00000000..e25fa54c Binary files /dev/null and b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_4.jpg differ diff --git a/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_5.jpg b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_5.jpg new file mode 100644 index 00000000..a81127ea Binary files /dev/null and b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_5.jpg differ diff --git a/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_summaries/index.html b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_summaries/index.html new file mode 100644 index 00000000..2f4b4b87 --- /dev/null +++ b/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_summaries/index.html @@ -0,0 +1,2298 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Chapter summaries - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Chapter summaries

+ +

Chapter Summaries

+

SWE Tooling: make, precommit, etc

+

Chapter 0 Cover Image

+
    +
  • Walked everyone through the code base for the Discord bot they interacted with
  • +
  • Sourced question-answering over a corpus of information using Vector storage for retrieval
  • +
  • GitHub repo available for this project, but may not be able to execute the code without accounts on all services
  • +
  • Makefile created for easier project management, setting up environment and authentication, and running setup commands
  • +
  • Incorporated software tools like pre-commit checks, black for Python auto-formatting, and rust-powered formatter
  • +
  • Shell check tool useful for catching issues in bash scripts
  • +
+

Data Cleaning

+

Chapter 1 Cover Image

+
    +
  • Initial approach of scraping data and chunking into smaller pieces did not yield good results
  • +
  • Improved results by spending time understanding the data and preserving the structure during processing
  • +
  • Extracting textual information from other sources like images and YouTube videos can enhance the usefulness of language models
  • +
  • Sometimes simple solutions to specific data sources and problems can greatly improve the quality of results
  • +
  • The unglamorous work of getting to know the data and writing code to manage it properly can result in big dividends for language model applications
  • +
+

Infrastructure: Modal

+

Chapter 2 Cover Image

+
    +
  • Discussed the ETL component of extracting, transforming, and loading data from various sources
  • +
  • Discussed using Python packages for data transformation and addressing dependency issues with tools like pre-commit
  • +
  • Explained the benefits of the modal component in creating lightweight virtual containers for different tasks
  • +
  • Modal containers are fast and efficient, aiding in quick development cycles and allowing for containerization without the pains of traditional Docker images
  • +
  • Modal also allows for the creation of serverless applications with auto-scaling and resource management
  • +
  • Debugging and local development can be done through the interactive mode by connecting to a container running on modal
  • +
  • showModal provides an interface for tracking application activity, utilization, and resource allocation, making it a versatile tool for various projects
  • +
+

Frontend: Gradio & Discord

+

Chapter 3 Cover Image

+
    +
  • Introduced Gradio user interface, allowing users to create interfaces in pure Python
  • +
  • Gradio UI is flexible, supported by Hugging Face, and rapidly adopting machine learning features
  • +
  • Examples of Gradio UI use include Alpaca, Flamingo, and Dolly mini
  • +
  • Gradio UI is easy to set up, portable, flexible, and comes with an API with OpenAPI spec
  • +
  • Discord bot integrated with Python library Discord.py; alternative library Interactions.py is also available
  • +
  • Gradio UI is built on FastAPI for asynchronous Python web service
  • +
  • Application mainly runs on the model's infrastructure in containers, serving traffic as needed
  • +
+

Embeddings & ETL

+

Chapter 4 Cover Image

+
    +
  • Used OpenAI's ada002 model to generate embeddings, which are much cheaper than generation endpoints
  • +
  • Currently using a vector index for data storage, but considering adding additional types of search
  • +
  • Discussed processing PDFs in a previous lecture, mentioned using local code to extract URLs and using a map function with controlled concurrency
  • +
  • Retrieval results are put into the zero-shot problem using an F-string template in LangChain's prompt template
  • +
  • Compared LangChain to Hugging Face Transformers Library as a framework and mentioned that their code is often simple, but valuable for its interface and compatibility with other tools
  • +
+

Monitoring & Improvement: Gantry

+

Chapter 5 Cover Image

+
    +
  • Top three challenges in bringing the spot to the next level: improving retrieval, improving the quality of model outputs, and identifying a solid user base.
  • +
  • Using tools like Datadog, Sentry, Honeycomb, and Gantry for handling web services, logging, and monitoring model behavior.
  • +
  • The same principle of tracing and monitoring applies to both ML-powered apps and LLM-powered apps.
  • +
  • Gantry provides a useful service for tracking and enriching logged data, including toxicity checks and other natural language-based or numerical analyses.
  • +
  • Using language models to check on the performance and outputs of other language models.
  • +
  • Contributing to the development of the tool as a teaching and learning application is open and encouraged.
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/askfsdl-walkthrough/cover.jpg b/llm-bootcamp/spring-2023/askfsdl-walkthrough/cover.jpg new file mode 100644 index 00000000..f0f481fd Binary files /dev/null and b/llm-bootcamp/spring-2023/askfsdl-walkthrough/cover.jpg differ diff --git a/llm-bootcamp/spring-2023/askfsdl-walkthrough/index.html b/llm-bootcamp/spring-2023/askfsdl-walkthrough/index.html new file mode 100644 index 00000000..bc990f81 --- /dev/null +++ b/llm-bootcamp/spring-2023/askfsdl-walkthrough/index.html @@ -0,0 +1,2426 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Project Walkthrough: askFSDL - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Project Walkthrough: askFSDL

+
+ +
+ +

Project by Charles Frye. +Published May 9, 2023.

+

View the project repository.

+

Interact with the bot on our Discord.

+

Chapter Summaries

+

SWE Tooling: make, precommit, etc

+

Chapter 0 Cover Image

+
    +
  • Walked everyone through the code base for the Discord bot they interacted with
  • +
  • Sourced question-answering over a corpus of information using Vector storage for retrieval
  • +
  • GitHub repo available for this project, but may not be able to execute the code without accounts on all services
  • +
  • Makefile created for easier project management, setting up environment and authentication, and running setup commands
  • +
  • Incorporated software tools like pre-commit checks, black for Python auto-formatting, and rust-powered formatter
  • +
  • Shell check tool useful for catching issues in bash scripts
  • +
+

Data Cleaning

+

Chapter 1 Cover Image

+
    +
  • Initial approach of scraping data and chunking into smaller pieces did not yield good results
  • +
  • Improved results by spending time understanding the data and preserving the structure during processing
  • +
  • Extracting textual information from other sources like images and YouTube videos can enhance the usefulness of language models
  • +
  • Sometimes simple solutions to specific data sources and problems can greatly improve the quality of results
  • +
  • The unglamorous work of getting to know the data and writing code to manage it properly can result in big dividends for language model applications
  • +
+

Infrastructure: Modal

+

Chapter 2 Cover Image

+
    +
  • Discussed the ETL component of extracting, transforming, and loading data from various sources
  • +
  • Discussed using Python packages for data transformation and addressing dependency issues with tools like pre-commit
  • +
  • Explained the benefits of the modal component in creating lightweight virtual containers for different tasks
  • +
  • Modal containers are fast and efficient, aiding in quick development cycles and allowing for containerization without the pains of traditional Docker images
  • +
  • Modal also allows for the creation of serverless applications with auto-scaling and resource management
  • +
  • Debugging and local development can be done through the interactive mode by connecting to a container running on modal
  • +
  • showModal provides an interface for tracking application activity, utilization, and resource allocation, making it a versatile tool for various projects
  • +
+

Frontend: Gradio & Discord

+

Chapter 3 Cover Image

+
    +
  • Introduced Gradio user interface, allowing users to create interfaces in pure Python
  • +
  • Gradio UI is flexible, supported by Hugging Face, and rapidly adopting machine learning features
  • +
  • Examples of Gradio UI use include Alpaca, Flamingo, and Dolly mini
  • +
  • Gradio UI is easy to set up, portable, flexible, and comes with an API with OpenAPI spec
  • +
  • Discord bot integrated with Python library Discord.py; alternative library Interactions.py is also available
  • +
  • Gradio UI is built on FastAPI for asynchronous Python web service
  • +
  • Application mainly runs on the model's infrastructure in containers, serving traffic as needed
  • +
+

Embeddings & ETL

+

Chapter 4 Cover Image

+
    +
  • Used OpenAI's ada002 model to generate embeddings, which are much cheaper than generation endpoints
  • +
  • Currently using a vector index for data storage, but considering adding additional types of search
  • +
  • Discussed processing PDFs in a previous lecture, mentioned using local code to extract URLs and using a map function with controlled concurrency
  • +
  • Retrieval results are put into the zero-shot problem using an F-string template in LangChain's prompt template
  • +
  • Compared LangChain to Hugging Face Transformers Library as a framework and mentioned that their code is often simple, but valuable for its interface and compatibility with other tools
  • +
+

Monitoring & Improvement: Gantry

+

Chapter 5 Cover Image

+
    +
  • Top three challenges in bringing the spot to the next level: improving retrieval, improving the quality of model outputs, and identifying a solid user base.
  • +
  • Using tools like Datadog, Sentry, Honeycomb, and Gantry for handling web services, logging, and monitoring model behavior.
  • +
  • The same principle of tracing and monitoring applies to both ML-powered apps and LLM-powered apps.
  • +
  • Gantry provides a useful service for tracking and enriching logged data, including toxicity checks and other natural language-based or numerical analyses.
  • +
  • Using language models to check on the performance and outputs of other language models.
  • +
  • Contributing to the development of the tool as a teaching and learning application is open and encouraged.
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_0.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_0.jpg new file mode 100644 index 00000000..89b80c4e Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_0.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_1.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_1.jpg new file mode 100644 index 00000000..f4c4aa7f Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_1.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_10.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_10.jpg new file mode 100644 index 00000000..1815768a Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_10.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_11.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_11.jpg new file mode 100644 index 00000000..b1d6942d Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_11.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_12.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_12.jpg new file mode 100644 index 00000000..e9d1aa62 Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_12.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_13.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_13.jpg new file mode 100644 index 00000000..f46b44fd Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_13.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_2.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_2.jpg new file mode 100644 index 00000000..1a1adec7 Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_2.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_3.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_3.jpg new file mode 100644 index 00000000..869324fd Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_3.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_4.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_4.jpg new file mode 100644 index 00000000..d43021e8 Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_4.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_5.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_5.jpg new file mode 100644 index 00000000..e90c911d Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_5.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_6.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_6.jpg new file mode 100644 index 00000000..6545f50e Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_6.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_7.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_7.jpg new file mode 100644 index 00000000..ecff9e3f Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_7.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_8.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_8.jpg new file mode 100644 index 00000000..0f8c121c Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_8.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_9.jpg b/llm-bootcamp/spring-2023/augmented-language-models/chapter_9.jpg new file mode 100644 index 00000000..615204e8 Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/chapter_9.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/index.html b/llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/index.html new file mode 100644 index 00000000..4f68660c --- /dev/null +++ b/llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/index.html @@ -0,0 +1,2459 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Chapter summaries - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Chapter summaries

+ +

Chapter Summaries

+

Why augmented LMs?

+

Chapter 0 Cover Image

+
    +
  • Language models are good at understanding language, following instructions, basic reasoning, and understanding code, but they lack up-to-date knowledge, specifics about your data, and more complex reasoning abilities.
  • +
  • Think of language models as the "brain" that needs tools and data to complete tasks.
  • +
  • Context windows are limited but growing rapidly and putting more context in the model costs money.
  • +
  • There are three ways to augment language models: retrieval, chains, and tools.
  • +
  • Retrieval involves providing an external corpus of data for the model to search, chains use the output of one language model as input for another, and tools allow models to interact with external data sources.
  • +
  • This lecture serves as an introduction to these topics with depth available for further exploration.
  • +
+

Why retrieval augmentation?

+

Chapter 1 Cover Image

+
    +
  • Discussing retrieval augmentation to give models access to user-specific data
  • +
  • Initial approach: put data into the context (e.g., organizers of an event)
  • +
  • Challenge: thousands of users and complex relationships between queries and users make it difficult to use simple rules/coding
  • +
  • Consider building the context as a form of information retrieval (like search)
  • +
  • Treat putting the right data in the context for the model as a search problem
  • +
+

Traditional information retrieval

+

Chapter 2 Cover Image

+
    +
  • Traditional information retrieval uses a query to find and rank relevant objects in a collection
  • +
  • Objects can be documents, images, or other types of content
  • +
  • Inverted indexes, which record word frequencies in documents, are often used for search
  • +
  • Relevance is typically determined through Boolean search, while ranking is commonly done using the BM25 algorithm
  • +
  • Factors affecting ranking include search term frequency in the document, number of documents containing the search term, and context within a sentence
  • +
  • Traditional search is limited as it cannot capture semantic information or complex relationships between terms
  • +
+

Embeddings for retrieval

+

Chapter 3 Cover Image

+
    +
  • Discussing AI-centric approach for information retrieval via embeddings.
  • +
  • AI helps improve search and retrieve better data from contexts using large language models and embeddings.
  • +
  • Embeddings are abstract, dense, compact, usually fixed-size, and learned representations of data, which could be documents, images, audio, etc.
  • +
  • Good embeddings have utility for the downstream task, and similar objects should be close together in the embedding space, while different objects should be far apart.
  • +
  • Important embeddings to know: Word2Vec, Sentence Transformers, CLIP, OpenAI embeddings (Text Embedding ada002), and Instructor.
  • +
  • Off-the-shelf embeddings are a good start, but fine-tuning and training an embedding model on specific tasks can achieve better results.
  • +
+

Embedding relevance and indexes

+

Chapter 4 Cover Image

+
    +
  • Discussing using embeddings for information retrieval
  • +
  • Can use cosine similarity or dot product similarity as similarity metrics
  • +
  • For nearest neighbor search, can simply use numpy if dealing with less than 100,000 vectors
  • +
  • Approximate nearest neighbor algorithms are useful for faster search at larger scales, with tools like Facebook AI's FAISS, HNSW, and Annoy
  • +
  • Choosing an information retrieval system is more important than the specific embedding index
  • +
  • Limitations of approximate nearest neighbor indices include lack of hosting, data and metadata storage, and scalability
  • +
  • Consider an information retrieval system that addresses these limitations for production use, analogous to having a complete library rather than just a card catalog
  • +
+

Embedding databases

+

Chapter 5 Cover Image

+
    +
  • Searching over vectors may not be great for production, so consider databases for a more reliable and production-oriented approach.
  • +
  • Consider whether you need an embedding database or just a database, as many popular databases already have embedding index built in, such as PG Vector for Postgres, Elasticsearch, and Redis.
  • +
  • Building a system for information retrieval with embeddings involves challenges like scale, reliability, managing the embedding function, specifying queries, and choosing search algorithms.
  • +
  • Don't try to handle all the complexity yourself; use existing embedding databases like Chroma, Milvus, Pinecone, Vespa, and Weaviate.
  • +
  • When choosing an embedding database, consider features like scalability, embedding management, filtering, and integration with traditional full-text search.
  • +
  • General recommendations: use your existing database for prototyping, choose Pinecone for speed of setup, consider Vespa and Weaviate for flexible queries, and Vespa and Milvus for scale and reliability.
  • +
+

Beyond naive embeddings

+

Chapter 6 Cover Image

+
    +
  • Address issues when queries and documents have different forms and embeddings aren't comparable
  • +
  • Consider training a model that jointly represents both queries and documents for a more "apples to apples" comparison
  • +
  • Explore hypothetical document embeddings: have the model imagine a document containing the query's answer and find similar documents
  • +
  • Look into re-ranking techniques: search a large number of documents and train a model to reorder them based on specific criteria
  • +
  • Use new libraries like Lama Index to search more efficiently, respecting the structure of the data and subsets (e.g., Notion database, Twitter, or recent data)
  • +
  • Lama Index combines document retrieval and building embeddings designed for hierarchical searching
  • +
+

Patterns & case studies

+

Chapter 7 Cover Image

+
    +
  • Retrieval augmentation case study: Copilot
  • +
  • Two secrets to Copilot: speed and relevant context
  • +
  • Builds context by looking at most recently accessed 20 documents in the same programming language
  • +
  • Post-processing includes looking at code before and after cursor, relevant snippets from candidate docs, and heuristically accessing data
  • +
  • Output generated is a result of sorting by heuristics
  • +
  • Copilot is powerful but uses simple retrieval methods, highlighting the effectiveness of heuristics
  • +
  • Another common pattern: question answering using retrieval augmentation
  • +
  • This involves finding most similar documents/messages to a question and using retrieved information to answer the question
  • +
  • Limitation: search process might not return the documents containing the answer
  • +
  • Solution: use more models and iterate over documents, calling an LLM on each subset and feeding the output to the next model
  • +
  • This approach can be generalized as "chains" where models build context for other models
  • +
+

What are chains and why do we need them?

+

Chapter 8 Cover Image

+
    +
  • Discussing ways to add information to the context for language models besides retrieval
  • +
  • Retrieval-based models follow a common question-answering pattern: embedding queries, comparing embeddings to find similar documents, and using context to answer questions
  • +
  • Key limitation: reliance on the retrieval system; if the right information isn't among the retrieved documents, the model can't answer the question
  • +
  • Possible solutions:
  • +
  • Improve the quality of the information retrieval system with advanced search features
  • +
  • Add additional processing, like using another LLM for post-processing retrieved documents, to refine the context (although it might be slower and more expensive)
  • +
  • Introducing the concept of "chains": sequencing language model calls where the output of one call is the input to another
  • +
  • Example patterns for building chains:
  • +
  • Question-answering pattern
  • +
  • Hypothetical document embeddings
  • +
  • Summarizing a large corpus through a mapreduce-like process by independently summarizing each document, then summarizing the summaries
  • +
+

LangChain

+

Chapter 9 Cover Image

+
    +
  • Lang chain is an extremely popular tool for building chains of models and one of the fastest growing open source projects
  • +
  • Supports both Python and JavaScript
  • +
  • Fastest way to get started building applications and can be used in production
  • +
  • Many people end up creating their own chaining, possibly inspired by Lang chain
  • +
  • Lang chain provides a repository of different chains for various tasks and offers nice code and abstractions
  • +
  • Ideal for prototyping, but also easy to build your own system if needed for production
  • +
  • Contains many examples of types of chains in their repository, which is useful for generating ideas and learning about chaining patterns
  • +
+

Tool use

+

Chapter 10 Cover Image

+
    +
  • Building context for language models to answer questions can involve creating a search engine or giving them access to APIs and outside tools
  • +
  • A "feeling lucky" chain involves searching Google for an answer, getting the top result, and summarizing the content for the user
  • +
  • Tool_Former paper demonstrates using tools such as calculators, question-answering systems, and translation systems in the training process for language models
  • +
  • Tools can be used deterministically or in a way similar to OpenAI plugins
  • +
  • Examples of tools for language models include archive search, Python interpreters, and SQL query execution
  • +
  • An example chain involves translating a user's natural language question into an SQL query, executing the query, and providing the response back to the user
  • +
+

Plugins

+

Chapter 11 Cover Image

+
    +
  • There is a more automated approach called plugins to allow language models to interact with external tools.
  • +
  • In a chain-based approach, developers manually design the interaction pattern between language model and tool by passing queries through a series of steps.
  • +
  • In a plugin-based approach, the language model gets to decide whether to use a tool or not. A simpler method is used in Tool Former and OpenAI plugins.
  • +
  • To create an OpenAI plugin, provide an API spec and a description of the API meant for the language model to decide when to use it.
  • +
  • OpenAI passes the description as part of the context to the model, enabling it to make decisions based on user inputs and the API's usefulness.
  • +
  • The model can invoke the API, and results are fed into the context allowing the language model to continue answering user questions.
  • +
+

Recommendations for tool use

+

Chapter 12 Cover Image

+
    +
  • Tools are a flexible way to augment language models with external data.
  • +
  • Retrieval systems are one example of a tool that can be based on various databases.
  • +
  • Two ways to build tool use into language models: manually describe the logic (chains) or use plugins and let the model figure it out.
  • +
  • Chains are better for reliability and consistent problem-solving.
  • +
  • Plugins are more suitable for interactivity, flexibility, and general-purpose use, allowing users to solve various unanticipated problems.
  • +
+

Recap & conclusions

+

Chapter 13 Cover Image

+
    +
  • LMS are more powerful when connected to external data
  • +
  • Rules and heuristics can help identify useful data
  • +
  • As knowledge base scales, consider it as information retrieval
  • +
  • Chains can encode complex reasoning and help with token limits
  • +
  • Tools can provide access to external knowledge beyond internal database
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/augmented-language-models/cover.jpg b/llm-bootcamp/spring-2023/augmented-language-models/cover.jpg new file mode 100644 index 00000000..70bada49 Binary files /dev/null and b/llm-bootcamp/spring-2023/augmented-language-models/cover.jpg differ diff --git a/llm-bootcamp/spring-2023/augmented-language-models/index.html b/llm-bootcamp/spring-2023/augmented-language-models/index.html new file mode 100644 index 00000000..e706ed55 --- /dev/null +++ b/llm-bootcamp/spring-2023/augmented-language-models/index.html @@ -0,0 +1,2658 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Augmented Language Models - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Augmented Language Models

+
+ +
+ +

Lecture by Josh Tobin. +Published May 9, 2023. +Download slides.

+

Chapter Summaries

+

Why augmented LMs?

+

Chapter 0 Cover Image

+
    +
  • Language models are good at understanding language, following instructions, basic reasoning, and understanding code, but they lack up-to-date knowledge, specifics about your data, and more complex reasoning abilities.
  • +
  • Think of language models as the "brain" that needs tools and data to complete tasks.
  • +
  • Context windows are limited but growing rapidly and putting more context in the model costs money.
  • +
  • There are three ways to augment language models: retrieval, chains, and tools.
  • +
  • Retrieval involves providing an external corpus of data for the model to search, chains use the output of one language model as input for another, and tools allow models to interact with external data sources.
  • +
  • This lecture serves as an introduction to these topics with depth available for further exploration.
  • +
+

Why retrieval augmentation?

+

Chapter 1 Cover Image

+
    +
  • Discussing retrieval augmentation to give models access to user-specific data
  • +
  • Initial approach: put data into the context (e.g., organizers of an event)
  • +
  • Challenge: thousands of users and complex relationships between queries and users make it difficult to use simple rules/coding
  • +
  • Consider building the context as a form of information retrieval (like search)
  • +
  • Treat putting the right data in the context for the model as a search problem
  • +
+

Traditional information retrieval

+

Chapter 2 Cover Image

+
    +
  • Traditional information retrieval uses a query to find and rank relevant objects in a collection
  • +
  • Objects can be documents, images, or other types of content
  • +
  • Inverted indexes, which record word frequencies in documents, are often used for search
  • +
  • Relevance is typically determined through Boolean search, while ranking is commonly done using the BM25 algorithm
  • +
  • Factors affecting ranking include search term frequency in the document, number of documents containing the search term, and context within a sentence
  • +
  • Traditional search is limited as it cannot capture semantic information or complex relationships between terms
  • +
+

Embeddings for retrieval

+

Chapter 3 Cover Image

+
    +
  • Discussing AI-centric approach for information retrieval via embeddings.
  • +
  • AI helps improve search and retrieve better data from contexts using large language models and embeddings.
  • +
  • Embeddings are abstract, dense, compact, usually fixed-size, and learned representations of data, which could be documents, images, audio, etc.
  • +
  • Good embeddings have utility for the downstream task, and similar objects should be close together in the embedding space, while different objects should be far apart.
  • +
  • Important embeddings to know: Word2Vec, Sentence Transformers, CLIP, OpenAI embeddings (Text Embedding ada002), and Instructor.
  • +
  • Off-the-shelf embeddings are a good start, but fine-tuning and training an embedding model on specific tasks can achieve better results.
  • +
+

Embedding relevance and indexes

+

Chapter 4 Cover Image

+
    +
  • Discussing using embeddings for information retrieval
  • +
  • Can use cosine similarity or dot product similarity as similarity metrics
  • +
  • For nearest neighbor search, can simply use numpy if dealing with less than 100,000 vectors
  • +
  • Approximate nearest neighbor algorithms are useful for faster search at larger scales, with tools like Facebook AI's FAISS, HNSW, and Annoy
  • +
  • Choosing an information retrieval system is more important than the specific embedding index
  • +
  • Limitations of approximate nearest neighbor indices include lack of hosting, data and metadata storage, and scalability
  • +
  • Consider an information retrieval system that addresses these limitations for production use, analogous to having a complete library rather than just a card catalog
  • +
+

Embedding databases

+

Chapter 5 Cover Image

+
    +
  • Searching over vectors may not be great for production, so consider databases for a more reliable and production-oriented approach.
  • +
  • Consider whether you need an embedding database or just a database, as many popular databases already have embedding index built in, such as PG Vector for Postgres, Elasticsearch, and Redis.
  • +
  • Building a system for information retrieval with embeddings involves challenges like scale, reliability, managing the embedding function, specifying queries, and choosing search algorithms.
  • +
  • Don't try to handle all the complexity yourself; use existing embedding databases like Chroma, Milvus, Pinecone, Vespa, and Weaviate.
  • +
  • When choosing an embedding database, consider features like scalability, embedding management, filtering, and integration with traditional full-text search.
  • +
  • General recommendations: use your existing database for prototyping, choose Pinecone for speed of setup, consider Vespa and Weaviate for flexible queries, and Vespa and Milvus for scale and reliability.
  • +
+

Beyond naive embeddings

+

Chapter 6 Cover Image

+
    +
  • Address issues when queries and documents have different forms and embeddings aren't comparable
  • +
  • Consider training a model that jointly represents both queries and documents for a more "apples to apples" comparison
  • +
  • Explore hypothetical document embeddings: have the model imagine a document containing the query's answer and find similar documents
  • +
  • Look into re-ranking techniques: search a large number of documents and train a model to reorder them based on specific criteria
  • +
  • Use new libraries like Lama Index to search more efficiently, respecting the structure of the data and subsets (e.g., Notion database, Twitter, or recent data)
  • +
  • Lama Index combines document retrieval and building embeddings designed for hierarchical searching
  • +
+

Patterns & case studies

+

Chapter 7 Cover Image

+
    +
  • Retrieval augmentation case study: Copilot
  • +
  • Two secrets to Copilot: speed and relevant context
  • +
  • Builds context by looking at most recently accessed 20 documents in the same programming language
  • +
  • Post-processing includes looking at code before and after cursor, relevant snippets from candidate docs, and heuristically accessing data
  • +
  • Output generated is a result of sorting by heuristics
  • +
  • Copilot is powerful but uses simple retrieval methods, highlighting the effectiveness of heuristics
  • +
  • Another common pattern: question answering using retrieval augmentation
  • +
  • This involves finding most similar documents/messages to a question and using retrieved information to answer the question
  • +
  • Limitation: search process might not return the documents containing the answer
  • +
  • Solution: use more models and iterate over documents, calling an LLM on each subset and feeding the output to the next model
  • +
  • This approach can be generalized as "chains" where models build context for other models
  • +
+

What are chains and why do we need them?

+

Chapter 8 Cover Image

+
    +
  • Discussing ways to add information to the context for language models besides retrieval
  • +
  • Retrieval-based models follow a common question-answering pattern: embedding queries, comparing embeddings to find similar documents, and using context to answer questions
  • +
  • Key limitation: reliance on the retrieval system; if the right information isn't among the retrieved documents, the model can't answer the question
  • +
  • Possible solutions:
  • +
  • Improve the quality of the information retrieval system with advanced search features
  • +
  • Add additional processing, like using another LLM for post-processing retrieved documents, to refine the context (although it might be slower and more expensive)
  • +
  • Introducing the concept of "chains": sequencing language model calls where the output of one call is the input to another
  • +
  • Example patterns for building chains:
  • +
  • Question-answering pattern
  • +
  • Hypothetical document embeddings
  • +
  • Summarizing a large corpus through a mapreduce-like process by independently summarizing each document, then summarizing the summaries
  • +
+

LangChain

+

Chapter 9 Cover Image

+
    +
  • Lang chain is an extremely popular tool for building chains of models and one of the fastest growing open source projects
  • +
  • Supports both Python and JavaScript
  • +
  • Fastest way to get started building applications and can be used in production
  • +
  • Many people end up creating their own chaining, possibly inspired by Lang chain
  • +
  • Lang chain provides a repository of different chains for various tasks and offers nice code and abstractions
  • +
  • Ideal for prototyping, but also easy to build your own system if needed for production
  • +
  • Contains many examples of types of chains in their repository, which is useful for generating ideas and learning about chaining patterns
  • +
+

Tool use

+

Chapter 10 Cover Image

+
    +
  • Building context for language models to answer questions can involve creating a search engine or giving them access to APIs and outside tools
  • +
  • A "feeling lucky" chain involves searching Google for an answer, getting the top result, and summarizing the content for the user
  • +
  • Tool_Former paper demonstrates using tools such as calculators, question-answering systems, and translation systems in the training process for language models
  • +
  • Tools can be used deterministically or in a way similar to OpenAI plugins
  • +
  • Examples of tools for language models include archive search, Python interpreters, and SQL query execution
  • +
  • An example chain involves translating a user's natural language question into an SQL query, executing the query, and providing the response back to the user
  • +
+

Plugins

+

Chapter 11 Cover Image

+
    +
  • There is a more automated approach called plugins to allow language models to interact with external tools.
  • +
  • In a chain-based approach, developers manually design the interaction pattern between language model and tool by passing queries through a series of steps.
  • +
  • In a plugin-based approach, the language model gets to decide whether to use a tool or not. A simpler method is used in Tool Former and OpenAI plugins.
  • +
  • To create an OpenAI plugin, provide an API spec and a description of the API meant for the language model to decide when to use it.
  • +
  • OpenAI passes the description as part of the context to the model, enabling it to make decisions based on user inputs and the API's usefulness.
  • +
  • The model can invoke the API, and results are fed into the context allowing the language model to continue answering user questions.
  • +
+

Recommendations for tool use

+

Chapter 12 Cover Image

+
    +
  • Tools are a flexible way to augment language models with external data.
  • +
  • Retrieval systems are one example of a tool that can be based on various databases.
  • +
  • Two ways to build tool use into language models: manually describe the logic (chains) or use plugins and let the model figure it out.
  • +
  • Chains are better for reliability and consistent problem-solving.
  • +
  • Plugins are more suitable for interactivity, flexibility, and general-purpose use, allowing users to solve various unanticipated problems.
  • +
+

Recap & conclusions

+

Chapter 13 Cover Image

+
    +
  • LMS are more powerful when connected to external data
  • +
  • Rules and heuristics can help identify useful data
  • +
  • As knowledge base scales, consider it as information retrieval
  • +
  • Chains can encode complex reasoning and help with token limits
  • +
  • Tools can provide access to external knowledge beyond internal database
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/chase-agents/chapter_0.jpg b/llm-bootcamp/spring-2023/chase-agents/chapter_0.jpg new file mode 100644 index 00000000..9757cfca Binary files /dev/null and b/llm-bootcamp/spring-2023/chase-agents/chapter_0.jpg differ diff --git a/llm-bootcamp/spring-2023/chase-agents/chapter_1.jpg b/llm-bootcamp/spring-2023/chase-agents/chapter_1.jpg new file mode 100644 index 00000000..acecfd2b Binary files /dev/null and b/llm-bootcamp/spring-2023/chase-agents/chapter_1.jpg differ diff --git a/llm-bootcamp/spring-2023/chase-agents/chapter_10.jpg b/llm-bootcamp/spring-2023/chase-agents/chapter_10.jpg new file mode 100644 index 00000000..77ac2ccb Binary files /dev/null and b/llm-bootcamp/spring-2023/chase-agents/chapter_10.jpg differ diff --git a/llm-bootcamp/spring-2023/chase-agents/chapter_11.jpg b/llm-bootcamp/spring-2023/chase-agents/chapter_11.jpg new file mode 100644 index 00000000..7d2b5123 Binary files /dev/null and b/llm-bootcamp/spring-2023/chase-agents/chapter_11.jpg differ diff --git a/llm-bootcamp/spring-2023/chase-agents/chapter_2.jpg b/llm-bootcamp/spring-2023/chase-agents/chapter_2.jpg new file mode 100644 index 00000000..cd668a62 Binary files /dev/null and b/llm-bootcamp/spring-2023/chase-agents/chapter_2.jpg differ diff --git a/llm-bootcamp/spring-2023/chase-agents/chapter_3.jpg b/llm-bootcamp/spring-2023/chase-agents/chapter_3.jpg new file mode 100644 index 00000000..efc6f7f2 Binary files /dev/null and b/llm-bootcamp/spring-2023/chase-agents/chapter_3.jpg differ diff --git a/llm-bootcamp/spring-2023/chase-agents/chapter_4.jpg b/llm-bootcamp/spring-2023/chase-agents/chapter_4.jpg new file mode 100644 index 00000000..3372a3f5 Binary files /dev/null and b/llm-bootcamp/spring-2023/chase-agents/chapter_4.jpg differ diff --git a/llm-bootcamp/spring-2023/chase-agents/chapter_5.jpg b/llm-bootcamp/spring-2023/chase-agents/chapter_5.jpg new file mode 100644 index 00000000..60bb5953 Binary files /dev/null and b/llm-bootcamp/spring-2023/chase-agents/chapter_5.jpg differ diff --git a/llm-bootcamp/spring-2023/chase-agents/chapter_6.jpg b/llm-bootcamp/spring-2023/chase-agents/chapter_6.jpg new file mode 100644 index 00000000..37315c5a Binary files /dev/null and b/llm-bootcamp/spring-2023/chase-agents/chapter_6.jpg differ diff --git a/llm-bootcamp/spring-2023/chase-agents/chapter_7.jpg b/llm-bootcamp/spring-2023/chase-agents/chapter_7.jpg new file mode 100644 index 00000000..2caefc5e Binary files /dev/null and b/llm-bootcamp/spring-2023/chase-agents/chapter_7.jpg differ diff --git a/llm-bootcamp/spring-2023/chase-agents/chapter_8.jpg b/llm-bootcamp/spring-2023/chase-agents/chapter_8.jpg new file mode 100644 index 00000000..5a843b4c Binary files /dev/null and b/llm-bootcamp/spring-2023/chase-agents/chapter_8.jpg differ diff --git a/llm-bootcamp/spring-2023/chase-agents/chapter_9.jpg b/llm-bootcamp/spring-2023/chase-agents/chapter_9.jpg new file mode 100644 index 00000000..7048613a Binary files /dev/null and b/llm-bootcamp/spring-2023/chase-agents/chapter_9.jpg differ diff --git a/llm-bootcamp/spring-2023/chase-agents/chapter_summaries/index.html b/llm-bootcamp/spring-2023/chase-agents/chapter_summaries/index.html new file mode 100644 index 00000000..ed014e3e --- /dev/null +++ b/llm-bootcamp/spring-2023/chase-agents/chapter_summaries/index.html @@ -0,0 +1,2399 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Chapter summaries - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Chapter summaries

+ +

Chapter Summaries

+

What are "agents"?

+

Chapter 0 Cover Image

+
    +
  • The lecture covers agents and their significance in the context of LangChain.
  • +
  • The core idea of agents is using a language model as a reasoning engine to determine how to interact with the outside world based on user input
  • +
  • First it defines what agents are, explains why they are used, and shows how they are typically implemented.
  • +
  • It also considers the challenges associated with getting agents to work reliably in production.
  • +
  • It touches on memory and recent projects that involve agentic behavior
  • +
+

Why use agents?

+

Chapter 1 Cover Image

+
    +
  • Agents are useful for connecting language models to external sources of data and computation, such as search APIs and databases.
  • +
  • Agents are more flexible and powerful than simply connecting language models to tools, and can handle edge cases and multi-hop tasks better.
  • +
  • The typical implementation of agents involves using the language model to choose a tool, taking action with that tool, observing the output, and feeding it back into the language model until a stopping condition is met.
  • +
  • Stopping conditions can be set by the language model or through hard-coded rules.
  • +
+

ReAct: Reasoning to Act

+

Chapter 2 Cover Image

+
    +
  • ReAct is a prompting strategy for natural language processing
  • +
  • It stands for "Reasoning and Acting"
  • +
  • It combines Chain-of-Thought reasoning and action-taking to improve the language model's ability to reason and access real data sources
  • +
  • It yields higher quality, more reliable results than other prompting techniques
  • +
+

Challenge: controlling tool use

+

Chapter 3 Cover Image

+
    +
  • React is a popular implementation of agency, but there are many challenges
  • +
  • One challenge is getting agents to use tools appropriately, which can be addressed by providing tool descriptions or using tool retrieval
  • +
  • Few-shot examples can guide the language model in what to do
  • +
  • Another challenge is getting agents not to use tools when they don't need to, which can be addressed with reminders or adding a tool that explicitly returns to the user
  • +
+

Challenge: parsing tool invocations

+

Chapter 4 Cover Image

+
    +
  • Language models return raw strings, and we often want to pass those strings into other programs
  • +
  • More structured responses, like those in JSON format, are easier to parse
  • +
  • Output parsers are used to encapsulate the logic needed to parse responses, can be modular, and can retry mistakes
  • +
  • There are subtle differences in fixing errors in response outputs, and output parsers can help with this task
  • +
+

Challenge: long-term memory and coherence

+

Chapter 5 Cover Image

+
    +
  • Fourth challenge is getting agents to remember previous steps
  • +
  • ReAct paper keeps a list of these steps in memory
  • +
  • Long-running tasks present context window issues
  • +
  • Retrieval methods can fetch previous steps and put them into context
  • +
  • Combining some N most recent and some K most relevant actions and observations is common
  • +
  • Incorporating big and hard-to-parse API responses is a challenge
  • +
  • Custom logic can be used to select relevant keys and put them in context
  • +
  • Tool usage requires thinking about output size
  • +
  • Agents can go off track, and reiterating the objective can help
  • +
  • Separating planning and execution steps can help break down objectives
  • +
+

Challenge: evaluation

+

Chapter 6 Cover Image

+
    +
  • Evaluating language models and applications built on top is difficult
  • +
  • Evaluating agents is also difficult
  • +
  • One way to evaluate agents is to measure if the correct result was produced
  • +
  • Another way to evaluate agents is to assess if the agent trajectory or intermediate steps were correct and efficient. Examples include evaluating correct input to action, correct number of steps, and the most efficient sequence of steps.
  • +
  • Evaluating the intermediate steps can be just as useful as evaluating the final result.
  • +
+

Agent memory and adaptability

+

Chapter 7 Cover Image

+
    +
  • Memory is an interesting aspect of AI, especially in the context of user-AI interactions and personalization.
  • +
  • Personalization can be achieved by encoding an agent's objectives and persona in the prompt, but there is also work being done on evolving that over time to give agents a sense of long-term memory.
  • +
  • Memory is becoming increasingly important in the concept of agents as encapsulated programs that adapt over time.
  • +
  • Four recent projects build upon and improve the "react-style" agent, discussed next
  • +
+

Example: AutoGPT

+

Chapter 8 Cover Image

+

points:

+
    +
  • ReAct-style agents are designed to solve a specific objective, with short-lived, immediately quantifiable goals
  • +
  • AutoGPT was created for long-running, open-ended goals such as increasing Twitter following
  • +
  • AutoGPT introduced the concept of long-term memory using a vector store due to the long-running nature of its projects
  • +
+

Example: BabyAGI

+

Chapter 9 Cover Image

+
    +
  • BabyAGI is another popular project for agents for long-running objectives
  • +
  • Introduces separate planning and execution steps to improve long-running objectives
  • +
  • BabyAGI initially didn't have tools, but now has them
  • +
  • Separating planning and execution steps can improve reliability and focus of longer-term agents
  • +
+

Example: CAMEL

+

Chapter 10 Cover Image

+
    +
  • CAMEL paper involves two agents working together, novel idea
  • +
  • The main point of the paper is the use of a simulation environment
  • +
  • Simulation environments can be used for practical evaluation of agents or for entertainment
  • +
  • The paper's results are for a simple "simulation environment" -- two agents interacting in a chat room
  • +
  • The agents were language models without tools
  • +
+

Example: "Generative Agents" paper

+

Chapter 11 Cover Image

+
    +
  • Recent simulation environment that had 25 agents in a Sims-like world
  • +
  • Memory refers to remembering previous events to inform future actions
  • +
  • Three components of memory retrieval: time weighting, importance weighting, relevancy weighting
  • +
  • Reflection step introduced to update different states of the world after observing recent events
  • +
  • Reflection step could be applied to other memory types in LangChain, such as entity memory and summary conversation memory
  • +
  • Other papers recently incorporated the idea of reflection, which is interesting and worth keeping an eye on for the future
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/chase-agents/cover.jpg b/llm-bootcamp/spring-2023/chase-agents/cover.jpg new file mode 100644 index 00000000..555a0b5b Binary files /dev/null and b/llm-bootcamp/spring-2023/chase-agents/cover.jpg differ diff --git a/llm-bootcamp/spring-2023/chase-agents/index.html b/llm-bootcamp/spring-2023/chase-agents/index.html new file mode 100644 index 00000000..3d087559 --- /dev/null +++ b/llm-bootcamp/spring-2023/chase-agents/index.html @@ -0,0 +1,2580 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Harrison Chase: Agents - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Harrison Chase: Agents

+
+ +
+ +

Lecture by Harrison Chase. +Published May 25, 2023. +Download slides.

+

Chapter Summaries

+

What are "agents"?

+

Chapter 0 Cover Image

+
    +
  • The lecture covers agents and their significance in the context of LangChain.
  • +
  • The core idea of agents is using a language model as a reasoning engine to determine how to interact with the outside world based on user input
  • +
  • First it defines what agents are, explains why they are used, and shows how they are typically implemented.
  • +
  • It also considers the challenges associated with getting agents to work reliably in production.
  • +
  • It touches on memory and recent projects that involve agentic behavior
  • +
+

Why use agents?

+

Chapter 1 Cover Image

+
    +
  • Agents are useful for connecting language models to external sources of data and computation, such as search APIs and databases.
  • +
  • Agents are more flexible and powerful than simply connecting language models to tools, and can handle edge cases and multi-hop tasks better.
  • +
  • The typical implementation of agents involves using the language model to choose a tool, taking action with that tool, observing the output, and feeding it back into the language model until a stopping condition is met.
  • +
  • Stopping conditions can be set by the language model or through hard-coded rules.
  • +
+

ReAct: Reasoning to Act

+

Chapter 2 Cover Image

+
    +
  • ReAct is a prompting strategy for natural language processing
  • +
  • It stands for "Reasoning and Acting"
  • +
  • It combines Chain-of-Thought reasoning and action-taking to improve the language model's ability to reason and access real data sources
  • +
  • It yields higher quality, more reliable results than other prompting techniques
  • +
+

Challenge: controlling tool use

+

Chapter 3 Cover Image

+
    +
  • React is a popular implementation of agency, but there are many challenges
  • +
  • One challenge is getting agents to use tools appropriately, which can be addressed by providing tool descriptions or using tool retrieval
  • +
  • Few-shot examples can guide the language model in what to do
  • +
  • Another challenge is getting agents not to use tools when they don't need to, which can be addressed with reminders or adding a tool that explicitly returns to the user
  • +
+

Challenge: parsing tool invocations

+

Chapter 4 Cover Image

+
    +
  • Language models return raw strings, and we often want to pass those strings into other programs
  • +
  • More structured responses, like those in JSON format, are easier to parse
  • +
  • Output parsers are used to encapsulate the logic needed to parse responses, can be modular, and can retry mistakes
  • +
  • There are subtle differences in fixing errors in response outputs, and output parsers can help with this task
  • +
+

Challenge: long-term memory and coherence

+

Chapter 5 Cover Image

+
    +
  • Fourth challenge is getting agents to remember previous steps
  • +
  • ReAct paper keeps a list of these steps in memory
  • +
  • Long-running tasks present context window issues
  • +
  • Retrieval methods can fetch previous steps and put them into context
  • +
  • Combining some N most recent and some K most relevant actions and observations is common
  • +
  • Incorporating big and hard-to-parse API responses is a challenge
  • +
  • Custom logic can be used to select relevant keys and put them in context
  • +
  • Tool usage requires thinking about output size
  • +
  • Agents can go off track, and reiterating the objective can help
  • +
  • Separating planning and execution steps can help break down objectives
  • +
+

Challenge: evaluation

+

Chapter 6 Cover Image

+
    +
  • Evaluating language models and applications built on top is difficult
  • +
  • Evaluating agents is also difficult
  • +
  • One way to evaluate agents is to measure if the correct result was produced
  • +
  • Another way to evaluate agents is to assess if the agent trajectory or intermediate steps were correct and efficient. Examples include evaluating correct input to action, correct number of steps, and the most efficient sequence of steps.
  • +
  • Evaluating the intermediate steps can be just as useful as evaluating the final result.
  • +
+

Agent memory and adaptability

+

Chapter 7 Cover Image

+
    +
  • Memory is an interesting aspect of AI, especially in the context of user-AI interactions and personalization.
  • +
  • Personalization can be achieved by encoding an agent's objectives and persona in the prompt, but there is also work being done on evolving that over time to give agents a sense of long-term memory.
  • +
  • Memory is becoming increasingly important in the concept of agents as encapsulated programs that adapt over time.
  • +
  • Four recent projects build upon and improve the "react-style" agent, discussed next
  • +
+

Example: AutoGPT

+

Chapter 8 Cover Image

+

points:

+
    +
  • ReAct-style agents are designed to solve a specific objective, with short-lived, immediately quantifiable goals
  • +
  • AutoGPT was created for long-running, open-ended goals such as increasing Twitter following
  • +
  • AutoGPT introduced the concept of long-term memory using a vector store due to the long-running nature of its projects
  • +
+

Example: BabyAGI

+

Chapter 9 Cover Image

+
    +
  • BabyAGI is another popular project for agents for long-running objectives
  • +
  • Introduces separate planning and execution steps to improve long-running objectives
  • +
  • BabyAGI initially didn't have tools, but now has them
  • +
  • Separating planning and execution steps can improve reliability and focus of longer-term agents
  • +
+

Example: CAMEL

+

Chapter 10 Cover Image

+
    +
  • CAMEL paper involves two agents working together, novel idea
  • +
  • The main point of the paper is the use of a simulation environment
  • +
  • Simulation environments can be used for practical evaluation of agents or for entertainment
  • +
  • The paper's results are for a simple "simulation environment" -- two agents interacting in a chat room
  • +
  • The agents were language models without tools
  • +
+

Example: "Generative Agents" paper

+

Chapter 11 Cover Image

+
    +
  • Recent simulation environment that had 25 agents in a Sims-like world
  • +
  • Memory refers to remembering previous events to inform future actions
  • +
  • Three components of memory retrieval: time weighting, importance weighting, relevancy weighting
  • +
  • Reflection step introduced to update different states of the world after observing recent events
  • +
  • Reflection step could be applied to other memory types in LangChain, such as entity memory and summary conversation memory
  • +
  • Other papers recently incorporated the idea of reflection, which is interesting and worth keeping an eye on for the future
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/expense/index.html b/llm-bootcamp/spring-2023/expense/index.html new file mode 100644 index 00000000..56f3c8dc --- /dev/null +++ b/llm-bootcamp/spring-2023/expense/index.html @@ -0,0 +1,2235 @@ + + + + + + + + + + + + + + + + + + + + + + + + + How do I expense the bootcamp? - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + + + + + + + + + + + + + +
+
+ + + + + + + + +

How do I expense the bootcamp?

+

Submitting a receipt

+

Upon registration, you'll receive an email from our event provider with your ticket +and your order details, including the last 4 digits of the credit card you used, if you used one.

+

This is often sufficient documentation.

+

Verification of attendance

+

If you need your attendance verified, +please email us at admin@fullstackdeeplearning.com +after the event is finished and indicate the address to which we should send a verification email.

+

Requesting approval

+

Some organizations require prior approval for education expenses from a manager.

+

In that case, you can work off of the email template below. +We suggest you customize it to your team and its needs.

+

Keep in mind that you want to make sure that it's clear to your manager +why sending you to this bootcamp is in the company's interest, +not just yours!

+
+

An email template, co-written with ChatGPT

+

Dear {manager},

+

I wanted to bring to your attention a bootcamp that I would love to enroll in. Full Stack Deep Learning is hosting a two-day program that covers the emerging field of application development with Large Language Models (LLMs). FSDL is a respected name that has delivered educational material on productionizing machine learning in formats from large MOOCs to courses at top universities like UC Berkeley and UW.

+

This bootcamp will provide me with the knowledge and skills necessary to build and deploy LLM applications and stay up-to-date with the state-of-the-art in the industry.

+

Some of the benefits:

+
    +
  • Coverage of both conceptual fundamentals and concrete engineering practices
  • +
  • Talks from leaders in the field, like Harrison Chase of LangChain
  • +
  • Opportunities to network with builders
  • +
+

I expect to be able to bring back my learnings and apply them directly to my work and share them with the team.

+

The cost of the bootcamp is {price}. I strongly believe that this investment in my education will benefit the company in the long run.

+

If you are interested, you can find more details about the bootcamp, including the instructors and their bios, at the following link: https://fsdl.me/2023-llmbc-landing.

+

Thank you for considering my request.

+

Best,

+

{name}

+
+ + + + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/index.html b/llm-bootcamp/spring-2023/index.html new file mode 100644 index 00000000..5638788b --- /dev/null +++ b/llm-bootcamp/spring-2023/index.html @@ -0,0 +1,2570 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + LLM Bootcamp - Spring 2023 - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + + + + + + + + + + + + + +
+
+ + + + + + + + +

LLM Bootcamp - Spring 2023

+
+

What are the pre-requisites for this bootcamp?

+

Our goal is to get you 100% caught up to state-of-the-art and ready to build and deploy LLM apps, no matter what your level of experience with machine learning is.

+

Please enjoy, and email us, tweet us, or post in our Discord if you have any questions or feedback!

+
+

Lectures

+
+
+

Learn to Spell: Prompt Engineering

+

+Prompt Engineering Lecture Cover +

+
    +
  • High-level intuitions for prompting +
  • Tips and tricks for effective prompting: decomposition/chain-of-thought, self-criticism, ensembling +
  • Gotchas: "few-shot learning" and tokenization +
  • + + +
+
+
+

LLMOps

+

+ LLMOps Lecture Cover +

+
    +
  • Comparing and evaluating open source and proprietary models +
  • Iteration and prompt management +
  • Applying test-driven-development and continuous integration to LLMs +
  • + + +
+
+
+

UX for Language User Interfaces

+

+ UX for LUIs Lecture Cover + +

+
    +
  • General principles for user-centered design +
  • Emerging patterns in UX design for LUIs +
  • UX case studies: GitHub Copilot and Bing Chat +
+
+
+

Augmented Language Models

+

+ + +

+
    +
  • Augmenting language model inputs with external knowledge +
  • Vector indices and embedding management systems +
  • Augmenting language model outputs with external tools +
+
+
+

Launch an LLM App in One Hour

+

+ + +

+
    +
  • Why is now the right time to build? +
  • Techniques and tools for the tinkering and discovery phase: ChatGPT, LangChain, Colab +
  • A simple stack for quickly launching augmented LLM applications +
+
+
+

LLM Foundations

+

+ + +

+
    +
  • Speed-run of ML fundamentals +
  • The Transformer architecture +
  • Notable LLMs and their datasets +
+
+
+

Project Walkthrough: askFSDL

+

+ + +

+
    +
      +
    • Walkthrough of a GitHub repo for sourced Q&A with LLMs
    • +
    • Try it out via a bot in our Discord
    • +
    • Python project tooling, ETL/data processing, deployment on Modal, and monitoring with Gantry +
    • +
    +
+
+
+

What's Next?

+

+What's Next Lecture Cover + +

+
    +
  • Can we build general purpose robots using multimodal models? +
  • Will models get bigger or smaller? Are we running out of data? +
  • How close are we to AGI? Can we make it safe? +
+
+
+

Invited Talks

+
+
+

Reza Shabani: How To Train Your Own LLM

+

+Shabani Train Your Own Talk cover +

+
    +
  • The "Modern LLM Stack": Databricks, Hugging Face, MosaicML, and more +
  • The importance of knowing your data and designing preprocessing carefully +
  • The features of a good LLM engineer +
  • By Reza Shabani, who trained Replit's code completion model, Ghostwriter. +
  • + + + +
+
+
+

Harrison Chase: Agents

+

+Chase Agents Talk cover +

+
    +
  • The "agent" design pattern: tool use, memory, reflection, and goals +
  • Challenges facing agents in production: controlling tool use, parsing outputs, handling large contexts, and more +
  • Exciting research projects with agents: AutoGPT, BabyAGI, CAMEL, and Generative Agents +
  • By Harrison Chase, co-creator of LangChain +
  • + + + +
+
+
+

Fireside Chat with Peter Welinder

+

+Welinder Chat cover +

+
    +
  • With Peter Welinder, VP of Product & Partnerships at OpenAI +
  • How OpenAI converged on LLMs +
  • Learnings and surprises from releasing ChatGPT +
  • + + +
+
+
+

Sponsors

+

We are deeply grateful to all of the sponsors who helped make this event happen.

+

Direct Sponsors

+
+ + + + + + + + + + + + +
+ +

Compute Credit Sponsors

+
+ + + + + + +
+ + + + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_0.jpg b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_0.jpg new file mode 100644 index 00000000..7bf63723 Binary files /dev/null and b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_0.jpg differ diff --git a/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_1.jpg b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_1.jpg new file mode 100644 index 00000000..0b14c0a8 Binary files /dev/null and b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_1.jpg differ diff --git a/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_2.jpg b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_2.jpg new file mode 100644 index 00000000..23dd6460 Binary files /dev/null and b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_2.jpg differ diff --git a/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_3.jpg b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_3.jpg new file mode 100644 index 00000000..d2652e53 Binary files /dev/null and b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_3.jpg differ diff --git a/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_summaries/index.html b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_summaries/index.html new file mode 100644 index 00000000..d43ceb9d --- /dev/null +++ b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_summaries/index.html @@ -0,0 +1,2262 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Chapter summaries - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+ +
+
+ + + +
+
+ + + + + + + + +

Chapter summaries

+ +

Chapter Summaries

+

Why now?

+

Chapter 0 Cover Image

+
    +
  • Excitement about large language models and artificial intelligence is high, especially since one tool can now accomplish tasks that previously required multiple specialized tools.
  • +
  • Language user interfaces (LUIs) enable more natural interaction with computers through speech and natural language. Large language models, like GPT-3, make LUIs more flexible and capable.
  • +
  • Products and applications are being built with these models, including OpenAI's ChatGPT and GitHub Copilot, hinting at a promising future.
  • +
  • However, the gap between demos and actual products is significant. Overpromising and underdelivering in the past led to "AI winters," so it's important to create valuable products and tools to maintain funding and interest.
  • +
  • The playbook for building applications with language models is emerging, and this boot camp will cover aspects of that process.
  • +
+

Prototyping & Iteration in a Playground

+

Chapter 1 Cover Image

+
    +
  • Attended various hackathons focused on using machine learning tools
  • +
  • Explored the potential of high-capability hosted models, such as OpenAI's, in a simple chat interface to quickly test capabilities
  • +
  • Used a notebook environment for quick tinkering, building prototypes, and discovering limitations of language models
  • +
  • Started with a problem statement: using large language models to learn about large language models
  • +
  • Discovered difficulties with language models, such as having outdated and limited information
  • +
  • Found that providing specific sources or papers can help improve answers from the model
  • +
+

Prototyping & Iteration in a Notebook

+

Chapter 2 Cover Image

+
    +
  • Experiment with automating steps in ephemeral notebook environment like Collab.
  • +
  • OpenAI API allows interaction with language models and offers various SDKs.
  • +
  • Lang chain is a popular open-source framework for interacting with these models; it's fast-evolving and provides all necessary components.
  • +
  • Develop a process to find information and bring it to context. Utilize Python libraries like archive for data sourcing.
  • +
  • Utilize document loaders, such as the one built into Lang chain, to extract content from PDFs.
  • +
  • Use embedding search for large scale information retrieval within documents.
  • +
  • Prototype and tinker with language models to constantly improve them.
  • +
  • Look for similar existing projects to jump off or even default examples provided, such as Lang chain's default example.
  • +
  • Turn these experiments into something usable by people at a larger scale.
  • +
  • The workflow with modern language models is more flexible and faster compared to the past machine learning processes.
  • +
+

Deploying an MVP

+

Chapter 3 Cover Image

+
    +
  • Building an MVP version of an application requires focusing on what's useful to a broad range of users.
  • +
  • Prioritize the user interface and gather feedback from users quickly.
  • +
  • Cloud-native tooling and serverless infrastructure like Model are helpful in swiftly scaling applications and addressing data processing bottlenecks.
  • +
  • Use various tech stacks for different tasks, such as OpenAI for language models, Pinecone for quick search, MongoDB for data storage, and AWS for running lightweight Discord bot servers.
  • +
  • Implement the application, then monitor usage data to make improvements and learn from successes and failures.
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/cover.jpg b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/cover.jpg new file mode 100644 index 00000000..2316edbd Binary files /dev/null and b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/cover.jpg differ diff --git a/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/index.html b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/index.html new file mode 100644 index 00000000..ec8edef4 --- /dev/null +++ b/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/index.html @@ -0,0 +1,2371 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Launch an LLM App in One Hour - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + + +

Launch an LLM App in One Hour

+
+ +
+ +

Lecture by Charles Frye. +Published May 9, 2023. +Download slides.

+

Chapter Summaries

+

Why now?

+

Chapter 0 Cover Image

+
    +
  • Excitement about large language models and artificial intelligence is high, especially since one tool can now accomplish tasks that previously required multiple specialized tools.
  • +
  • Language user interfaces (LUIs) enable more natural interaction with computers through speech and natural language. Large language models, like GPT-3, make LUIs more flexible and capable.
  • +
  • Products and applications are being built with these models, including OpenAI's ChatGPT and GitHub Copilot, hinting at a promising future.
  • +
  • However, the gap between demos and actual products is significant. Overpromising and underdelivering in the past led to "AI winters," so it's important to create valuable products and tools to maintain funding and interest.
  • +
  • The playbook for building applications with language models is emerging, and this boot camp will cover aspects of that process.
  • +
+

Prototyping & Iteration in a Playground

+

Chapter 1 Cover Image

+
    +
  • Attended various hackathons focused on using machine learning tools
  • +
  • Explored the potential of high-capability hosted models, such as OpenAI's, in a simple chat interface to quickly test capabilities
  • +
  • Used a notebook environment for quick tinkering, building prototypes, and discovering limitations of language models
  • +
  • Started with a problem statement: using large language models to learn about large language models
  • +
  • Discovered difficulties with language models, such as having outdated and limited information
  • +
  • Found that providing specific sources or papers can help improve answers from the model
  • +
+

Prototyping & Iteration in a Notebook

+

Chapter 2 Cover Image

+
    +
  • Experiment with automating steps in ephemeral notebook environment like Collab.
  • +
  • OpenAI API allows interaction with language models and offers various SDKs.
  • +
  • Lang chain is a popular open-source framework for interacting with these models; it's fast-evolving and provides all necessary components.
  • +
  • Develop a process to find information and bring it to context. Utilize Python libraries like archive for data sourcing.
  • +
  • Utilize document loaders, such as the one built into Lang chain, to extract content from PDFs.
  • +
  • Use embedding search for large scale information retrieval within documents.
  • +
  • Prototype and tinker with language models to constantly improve them.
  • +
  • Look for similar existing projects to jump off or even default examples provided, such as Lang chain's default example.
  • +
  • Turn these experiments into something usable by people at a larger scale.
  • +
  • The workflow with modern language models is more flexible and faster compared to the past machine learning processes.
  • +
+

Deploying an MVP

+

Chapter 3 Cover Image

+
    +
  • Building an MVP version of an application requires focusing on what's useful to a broad range of users.
  • +
  • Prioritize the user interface and gather feedback from users quickly.
  • +
  • Cloud-native tooling and serverless infrastructure like Model are helpful in swiftly scaling applications and addressing data processing bottlenecks.
  • +
  • Use various tech stacks for different tasks, such as OpenAI for language models, Pinecone for quick search, MongoDB for data storage, and AWS for running lightweight Discord bot servers.
  • +
  • Implement the application, then monitor usage data to make improvements and learn from successes and failures.
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_0.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_0.jpg new file mode 100644 index 00000000..7b6f5cd4 Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_0.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_1.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_1.jpg new file mode 100644 index 00000000..bcb67ae9 Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_1.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_10.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_10.jpg new file mode 100644 index 00000000..a6c4ecad Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_10.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_11.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_11.jpg new file mode 100644 index 00000000..571ed318 Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_11.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_12.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_12.jpg new file mode 100644 index 00000000..3192cb1c Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_12.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_13.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_13.jpg new file mode 100644 index 00000000..4ab36a9b Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_13.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_14.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_14.jpg new file mode 100644 index 00000000..ef5b29b7 Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_14.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_15.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_15.jpg new file mode 100644 index 00000000..544844d3 Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_15.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_16.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_16.jpg new file mode 100644 index 00000000..621dc422 Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_16.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_17.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_17.jpg new file mode 100644 index 00000000..69e1fc5f Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_17.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_18.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_18.jpg new file mode 100644 index 00000000..0c13fb9a Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_18.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_2.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_2.jpg new file mode 100644 index 00000000..181fe936 Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_2.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_3.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_3.jpg new file mode 100644 index 00000000..ed6c783e Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_3.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_4.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_4.jpg new file mode 100644 index 00000000..bc90a23e Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_4.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_5.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_5.jpg new file mode 100644 index 00000000..35802edb Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_5.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_6.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_6.jpg new file mode 100644 index 00000000..f3261414 Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_6.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_7.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_7.jpg new file mode 100644 index 00000000..35d855d8 Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_7.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_8.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_8.jpg new file mode 100644 index 00000000..865447ad Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_8.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_9.jpg b/llm-bootcamp/spring-2023/llm-foundations/chapter_9.jpg new file mode 100644 index 00000000..6417905b Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/chapter_9.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/index.html b/llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/index.html new file mode 100644 index 00000000..502dc963 --- /dev/null +++ b/llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/index.html @@ -0,0 +1,2539 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Chapter summaries - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Chapter summaries

+ +

Chapter Summaries

+

Intro

+

Chapter 0 Cover Image

+
    +
  • Discuss four key ideas in machine learning
  • +
  • Address diverse audience, including experts, executives, and investors
  • +
  • Cover Transformer architecture
  • +
  • Mention notable LLMs (e.g., GPT, T5, BERT, etc.)
  • +
  • Share details on running a Transformer
  • +
+

Foundations of Machine Learning

+

Chapter 1 Cover Image

+
    +
  • Machine learning has shifted from traditional programming (Software 1.0) to a Software 2.0 mindset, where algorithms are generated from training data and more emphasis is placed on the training system.
  • +
  • Three types of machine learning include unsupervised learning, supervised learning, and reinforcement learning, which have mostly converged to a supervised learning approach.
  • +
  • For machines, input and output are always just numbers, represented as vectors or matrices.
  • +
  • One dominant approach to machine learning today is neural networks, also known as deep learning, which was inspired by the human brain's structure and function.
  • +
  • Neural networks consist of perceptrons connected in layers, and all operations are matrix multiplications.
  • +
  • GPUs, originally developed for graphics and video games, have played a significant role in advancing deep learning due to their compatibility with matrix multiplications.
  • +
  • To train a neural network, data is typically split into training, validation, and test sets to avoid overfitting and improve model performance.
  • +
  • Pre-training involves training a large model on extensive data, which can then be fine-tuned using smaller sets of specialized data for better performance.
  • +
  • Model hubs, such as Hugging Face, offer numerous pre-trained models for various machine learning tasks and have seen significant growth in recent years.
  • +
  • The Transformer model has become the dominant architecture for a wide range of machine learning tasks.
  • +
+

The Transformer Architecture

+

Chapter 2 Cover Image

+
    +
  • Transformer architecture introduced in 2017 paper "Attention is All You Need"
  • +
  • Set state-of-the-art results in translation tasks
  • +
  • Applied to other NLP tasks and fields like vision
  • +
  • Appears complicated but consists of two similar halves
  • +
  • Focusing on one half called the decoder
  • +
+

Transformer Decoder Overview

+

Chapter 3 Cover Image

+
    +
  • The task of the Transformer decoder is to complete text, much like GPT models.
  • +
  • The input consists of a sequence of tokens (e.g., "it's a blue"), and the goal is to predict the next word (e.g., "sundress").
  • +
  • The output is a probability distribution over potential next tokens.
  • +
  • Inference involves sampling a token from the distribution, appending it to the input, and running the model again with the updated input.
  • +
  • ChatGPT operates by seeing user input, sampling the next word, appending it, and repeating this process.
  • +
+

Inputs

+

Chapter 4 Cover Image

+
    +
  • Inputs need to be vectors of numbers
  • +
  • Text is turned into vectors through tokenization
  • +
  • Tokens are assigned an ID in a vocabulary, rather than being words
  • +
  • Numbers are represented as vectors using one-hot encoding (e.g., number 3 represented by a vector with 1 in third position, zeros everywhere else)
  • +
+

Input Embedding

+

Chapter 5 Cover Image

+
    +
  • One-hot vectors are not good representations of words or tokens as they don't capture the notion of similarity between words
  • +
  • To address the issue, we use embedding
  • +
  • Embedding involves learning an embedding matrix which converts a one-hot vocabulary encoding into a dense vector of chosen dimensionalities
  • +
  • This process turns words into dense embeddings, making it the simplest neural network layer type
  • +
+

Masked Multi-Head Attention

+

Chapter 6 Cover Image

+
    +
  • Attention was introduced in 2015 for translation tasks, and the idea is to predict the most likely next token based on the importance of previous tokens.
  • +
  • Attention mechanism involves an output as a weighted sum of input vectors, and these weights are calculated using dot products (similarities) between the input vectors.
  • +
  • Each input vector plays three roles in the attention mechanism: as a query, key, and value.
  • +
  • To learn and improve attention, input vectors can be projected into different roles (query, key, and value) by multiplying them with learnable matrices.
  • +
  • Multi-head attention refers to learning several different ways of transforming inputs into queries, keys, and values simultaneously.
  • +
  • Masking is used to prevent the model from "cheating" by considering future tokens; it ensures that the model only predicts the next token based on the already seen input.
  • +
+

Positional Encoding

+

Chapter 7 Cover Image

+
    +
  • No notion of position in the current model, only whether something has been seen or not.
  • +
  • Positional encoding is introduced to provide ordering among the seen elements.
  • +
  • Current equations resemble a bag of unordered items.
  • +
  • Positional encoding vectors are added to embedding vectors to provide order.
  • +
  • Seems counterintuitive, but it works; attention mechanism figures out relevant positions.
  • +
+

Skip Connections and Layer Norm

+

Chapter 8 Cover Image

+
    +
  • Add up and norm attention outputs using skip connections and layer normalization
  • +
  • Skip connections help propagate loss from end to beginning of model during backpropagation
  • +
  • Layer normalization resets mean and standard deviation to uniform after every operation
  • +
  • Input embedding determines the dimension of the entire Transformer model
  • +
  • Normalization seems inelegant but is very effective in improving neural net learning
  • +
+

Feed-forward Layer

+

Chapter 9 Cover Image

+
    +
  • Feed forward layer is similar to the standard multi-layer perceptron.
  • +
  • It receives tokens augmented with relevant information.
  • +
  • The layer upgrades the token representation.
  • +
  • The process goes from word-level to thought-level, with more semantic meaning.
  • +
+

Transformer hyperparameters and Why they work so well

+

Chapter 10 Cover Image

+
    +
  • GPT-3 model ranges from 12 to 96 layers of Transformer layers with adjustable embedding dimensions and attention heads, totaling 175 billion parameters.
  • +
  • Most of GPT-3's parameters are in the feed forward layer, but for smaller models, a significant portion is in embedding and attention.
  • +
  • Transformers are effective general-purpose differentiable computers that are expressive, optimizable via backpropagation, and efficient due to parallel processing.
  • +
  • Understanding exact expressiveness of the Transformer is ongoing, with interesting results like RASP (a programming language designed to be implemented within a Transformer).
  • +
  • Decompiling Transformer weights back to a program is still an unsolved problem.
  • +
  • Multiple attention heads allow the model to figure out how to use a second head, showcased in work like Induction Heads.
  • +
  • Learning to code Transformers isn't necessary for AI-powered products, but can be fun and educational. Resources like YouTube tutorials and code examples are available to assist in learning.
  • +
+

Notable LLM: BERT

+

Chapter 11 Cover Image

+
    +
  • Bert, T5, and GPT cover the gamut of large Transformer models
  • +
  • Bert stands for bi-directional encoder representation from Transformers
  • +
  • Bert uses the encoder part of the Transformer, with unmasked attention
  • +
  • Bert contains 100 million parameters, considered large at its time
  • +
  • Bert was trained by masking 15% of words in a text corpus and predicting the masked words
  • +
  • Bert became a building block for other NLP applications
  • +
+

Notable LLM: T5

+

Chapter 12 Cover Image

+
    +
  • T5 applies Transformer architecture to text-to-text transfer, meaning both input and output are text strings
  • +
  • The task is encoded in the input string and can involve translation, summarization, etc.
  • +
  • Encoder-decoder architecture was found to be best, with 11 billion parameters
  • +
  • Trained on Colossal Queen Crawl Corpus (C4) derived from Common Crawl dataset
  • +
  • C4 was created by filtering out short pages, offensive content, pages with code, and de-duplicating data
  • +
  • Fine-tuned using academic supervised tasks for various NLP applications
  • +
+

Notable LLM: GPT

+

Chapter 13 Cover Image

+
    +
  • GPT is a generative pre-trained Transformer, with GPT-2 being decoder only
  • +
  • GPT-2 was trained on a dataset called WebText created by scraping links from Reddit
  • +
  • GPT tokenizes text using byte pair encoding, a middle ground between old-school tokenization and using UTF-8 bytes
  • +
  • GPT-3 came out in 2020 and is 100 times larger than GPT-2, enabling few-shot and zero-shot learning
  • +
  • GPT-3 was trained on webtext, raw common crawl data, a selection of books, and all of Wikipedia
  • +
  • The dataset for GPT-3 contained 500 billion tokens, but it was only trained on 300 billion tokens
  • +
  • GPT-4 details are unknown, but it is assumed to be much larger than previous versions due to the trend in increasing size
  • +
+

Notable LLM: Chinchilla and Scaling Laws

+

Chapter 14 Cover Image

+
    +
  • Using more computation to train AI systems improves their performance
  • +
  • Rich Sutton's "bitter lesson": advantage goes to those stacking more layers
  • +
  • DeepMind's paper, Training Compute Optimal LLMs: studied relationship between model size, compute and data set size
  • +
  • Most LLMs in literature had too many parameters for their data amount
  • +
  • Chinchilla model (70 billion) outperformed Gopher model (four times larger) by training on 1.4 trillion tokens instead of 300 billion
  • +
  • Open question: can models continue to improve by training repeatedly on existing data?
  • +
+

Notable LLM: LLaMA

+

Chapter 15 Cover Image

+
    +
  • Llama is an open-source chinchilla optimal LLM from Meta Research
  • +
  • Several sizes available, ranging from 7 billion to 65 billion, with at least 1 trillion tokens
  • +
  • Competitively benchmarks against GPT-3 and other state-of-the-art LLMs
  • +
  • Open source but non-commercial license for pre-trained weights
  • +
  • Trained on custom common crawl filtering, C4, GitHub, Wikipedia, books, and scientific papers
  • +
  • Data set replicated by Red Pajama, which is also training models to replicate Llama
  • +
  • Interesting inclusion of GitHub as a training resource
  • +
+

Why include code in LLM training data?

+

Chapter 16 Cover Image

+
    +
  • Including code in training data can improve performance on non-code tasks
  • +
  • OpenAI found this with their Codex model, which was fine-tuned on code and outperformed GPT-3 on reasoning tasks
  • +
  • Since then, people have been adding code to training data
  • +
  • Open source dataset called 'the stack' collects code from GitHub while respecting licenses
  • +
+

Instruction Tuning

+

Chapter 17 Cover Image

+
    +
  • Discusses instruction tuning in GPT models and its impact on performance
  • +
  • Mentions the shift from text completion mindset to instruction following mindset
  • +
  • Supervised fine-tuning helps models become better at zero-shot tasks by using data sets of zero-shot inputs and desired outputs
  • +
  • OpenAI hired thousands of contractors to gather zero-shot data and used reinforcement learning for training
  • +
  • GPT model lineage includes DaVinci, Codex, and various iterations, fine-tuning for specific applications
  • +
  • Fine-tuning imposes an "alignment tax," decreasing few-shot learning ability and model's confidence calibration
  • +
  • Llama model by Stanford team used GPT-3 generated instructions, costing less but with reduced performance compared to GPT-3
  • +
  • A specific data set for instruction tuning in chat-based paradigms is called "Open Assistant"
  • +
+

Notable LLM: RETRO

+

Chapter 18 Cover Image

+
    +
  • Discussing a model called "retrieval enhancing" from DeepMind
  • +
  • Goal: train a smaller model good at reasoning and writing code, but looks up facts from a database
  • +
  • Used "burden-coded" sentences in a trillion-token database for fact retrieval
  • +
  • Not as effective as large language models yet, but shows potential for the future
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/llm-foundations/cover.jpg b/llm-bootcamp/spring-2023/llm-foundations/cover.jpg new file mode 100644 index 00000000..5f1649f7 Binary files /dev/null and b/llm-bootcamp/spring-2023/llm-foundations/cover.jpg differ diff --git a/llm-bootcamp/spring-2023/llm-foundations/index.html b/llm-bootcamp/spring-2023/llm-foundations/index.html new file mode 100644 index 00000000..35ef55c2 --- /dev/null +++ b/llm-bootcamp/spring-2023/llm-foundations/index.html @@ -0,0 +1,2783 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + LLM Foundations - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

LLM Foundations

+
+ +
+ +

Lecture by Sergey Karayev. +Published May 19, 2023. +Download slides.

+

Chapter Summaries

+

Intro

+

Chapter 0 Cover Image

+
    +
  • Discuss four key ideas in machine learning
  • +
  • Address diverse audience, including experts, executives, and investors
  • +
  • Cover Transformer architecture
  • +
  • Mention notable LLMs (e.g., GPT, T5, BERT, etc.)
  • +
  • Share details on running a Transformer
  • +
+

Foundations of Machine Learning

+

Chapter 1 Cover Image

+
    +
  • Machine learning has shifted from traditional programming (Software 1.0) to a Software 2.0 mindset, where algorithms are generated from training data and more emphasis is placed on the training system.
  • +
  • Three types of machine learning include unsupervised learning, supervised learning, and reinforcement learning, which have mostly converged to a supervised learning approach.
  • +
  • For machines, input and output are always just numbers, represented as vectors or matrices.
  • +
  • One dominant approach to machine learning today is neural networks, also known as deep learning, which was inspired by the human brain's structure and function.
  • +
  • Neural networks consist of perceptrons connected in layers, and all operations are matrix multiplications.
  • +
  • GPUs, originally developed for graphics and video games, have played a significant role in advancing deep learning due to their compatibility with matrix multiplications.
  • +
  • To train a neural network, data is typically split into training, validation, and test sets to avoid overfitting and improve model performance.
  • +
  • Pre-training involves training a large model on extensive data, which can then be fine-tuned using smaller sets of specialized data for better performance.
  • +
  • Model hubs, such as Hugging Face, offer numerous pre-trained models for various machine learning tasks and have seen significant growth in recent years.
  • +
  • The Transformer model has become the dominant architecture for a wide range of machine learning tasks.
  • +
+

The Transformer Architecture

+

Chapter 2 Cover Image

+
    +
  • Transformer architecture introduced in 2017 paper "Attention is All You Need"
  • +
  • Set state-of-the-art results in translation tasks
  • +
  • Applied to other NLP tasks and fields like vision
  • +
  • Appears complicated but consists of two similar halves
  • +
  • Focusing on one half called the decoder
  • +
+

Transformer Decoder Overview

+

Chapter 3 Cover Image

+
    +
  • The task of the Transformer decoder is to complete text, much like GPT models.
  • +
  • The input consists of a sequence of tokens (e.g., "it's a blue"), and the goal is to predict the next word (e.g., "sundress").
  • +
  • The output is a probability distribution over potential next tokens.
  • +
  • Inference involves sampling a token from the distribution, appending it to the input, and running the model again with the updated input.
  • +
  • ChatGPT operates by seeing user input, sampling the next word, appending it, and repeating this process.
  • +
+

Inputs

+

Chapter 4 Cover Image

+
    +
  • Inputs need to be vectors of numbers
  • +
  • Text is turned into vectors through tokenization
  • +
  • Tokens are assigned an ID in a vocabulary, rather than being words
  • +
  • Numbers are represented as vectors using one-hot encoding (e.g., number 3 represented by a vector with 1 in third position, zeros everywhere else)
  • +
+

Input Embedding

+

Chapter 5 Cover Image

+
    +
  • One-hot vectors are not good representations of words or tokens as they don't capture the notion of similarity between words
  • +
  • To address the issue, we use embedding
  • +
  • Embedding involves learning an embedding matrix which converts a one-hot vocabulary encoding into a dense vector of chosen dimensionalities
  • +
  • This process turns words into dense embeddings, making it the simplest neural network layer type
  • +
+

Masked Multi-Head Attention

+

Chapter 6 Cover Image

+
    +
  • Attention was introduced in 2015 for translation tasks, and the idea is to predict the most likely next token based on the importance of previous tokens.
  • +
  • Attention mechanism involves an output as a weighted sum of input vectors, and these weights are calculated using dot products (similarities) between the input vectors.
  • +
  • Each input vector plays three roles in the attention mechanism: as a query, key, and value.
  • +
  • To learn and improve attention, input vectors can be projected into different roles (query, key, and value) by multiplying them with learnable matrices.
  • +
  • Multi-head attention refers to learning several different ways of transforming inputs into queries, keys, and values simultaneously.
  • +
  • Masking is used to prevent the model from "cheating" by considering future tokens; it ensures that the model only predicts the next token based on the already seen input.
  • +
+

Positional Encoding

+

Chapter 7 Cover Image

+
    +
  • No notion of position in the current model, only whether something has been seen or not.
  • +
  • Positional encoding is introduced to provide ordering among the seen elements.
  • +
  • Current equations resemble a bag of unordered items.
  • +
  • Positional encoding vectors are added to embedding vectors to provide order.
  • +
  • Seems counterintuitive, but it works; attention mechanism figures out relevant positions.
  • +
+

Skip Connections and Layer Norm

+

Chapter 8 Cover Image

+
    +
  • Add up and norm attention outputs using skip connections and layer normalization
  • +
  • Skip connections help propagate loss from end to beginning of model during backpropagation
  • +
  • Layer normalization resets mean and standard deviation to uniform after every operation
  • +
  • Input embedding determines the dimension of the entire Transformer model
  • +
  • Normalization seems inelegant but is very effective in improving neural net learning
  • +
+

Feed-forward Layer

+

Chapter 9 Cover Image

+
    +
  • Feed forward layer is similar to the standard multi-layer perceptron.
  • +
  • It receives tokens augmented with relevant information.
  • +
  • The layer upgrades the token representation.
  • +
  • The process goes from word-level to thought-level, with more semantic meaning.
  • +
+

Transformer hyperparameters and Why they work so well

+

Chapter 10 Cover Image

+
    +
  • GPT-3 model ranges from 12 to 96 layers of Transformer layers with adjustable embedding dimensions and attention heads, totaling 175 billion parameters.
  • +
  • Most of GPT-3's parameters are in the feed forward layer, but for smaller models, a significant portion is in embedding and attention.
  • +
  • Transformers are effective general-purpose differentiable computers that are expressive, optimizable via backpropagation, and efficient due to parallel processing.
  • +
  • Understanding exact expressiveness of the Transformer is ongoing, with interesting results like RASP (a programming language designed to be implemented within a Transformer).
  • +
  • Decompiling Transformer weights back to a program is still an unsolved problem.
  • +
  • Multiple attention heads allow the model to figure out how to use a second head, showcased in work like Induction Heads.
  • +
  • Learning to code Transformers isn't necessary for AI-powered products, but can be fun and educational. Resources like YouTube tutorials and code examples are available to assist in learning.
  • +
+

Notable LLM: BERT

+

Chapter 11 Cover Image

+
    +
  • Bert, T5, and GPT cover the gamut of large Transformer models
  • +
  • Bert stands for bi-directional encoder representation from Transformers
  • +
  • Bert uses the encoder part of the Transformer, with unmasked attention
  • +
  • Bert contains 100 million parameters, considered large at its time
  • +
  • Bert was trained by masking 15% of words in a text corpus and predicting the masked words
  • +
  • Bert became a building block for other NLP applications
  • +
+

Notable LLM: T5

+

Chapter 12 Cover Image

+
    +
  • T5 applies Transformer architecture to text-to-text transfer, meaning both input and output are text strings
  • +
  • The task is encoded in the input string and can involve translation, summarization, etc.
  • +
  • Encoder-decoder architecture was found to be best, with 11 billion parameters
  • +
  • Trained on Colossal Queen Crawl Corpus (C4) derived from Common Crawl dataset
  • +
  • C4 was created by filtering out short pages, offensive content, pages with code, and de-duplicating data
  • +
  • Fine-tuned using academic supervised tasks for various NLP applications
  • +
+

Notable LLM: GPT

+

Chapter 13 Cover Image

+
    +
  • GPT is a generative pre-trained Transformer, with GPT-2 being decoder only
  • +
  • GPT-2 was trained on a dataset called WebText created by scraping links from Reddit
  • +
  • GPT tokenizes text using byte pair encoding, a middle ground between old-school tokenization and using UTF-8 bytes
  • +
  • GPT-3 came out in 2020 and is 100 times larger than GPT-2, enabling few-shot and zero-shot learning
  • +
  • GPT-3 was trained on webtext, raw common crawl data, a selection of books, and all of Wikipedia
  • +
  • The dataset for GPT-3 contained 500 billion tokens, but it was only trained on 300 billion tokens
  • +
  • GPT-4 details are unknown, but it is assumed to be much larger than previous versions due to the trend in increasing size
  • +
+

Notable LLM: Chinchilla and Scaling Laws

+

Chapter 14 Cover Image

+
    +
  • Using more computation to train AI systems improves their performance
  • +
  • Rich Sutton's "bitter lesson": advantage goes to those stacking more layers
  • +
  • DeepMind's paper, Training Compute Optimal LLMs: studied relationship between model size, compute and data set size
  • +
  • Most LLMs in literature had too many parameters for their data amount
  • +
  • Chinchilla model (70 billion) outperformed Gopher model (four times larger) by training on 1.4 trillion tokens instead of 300 billion
  • +
  • Open question: can models continue to improve by training repeatedly on existing data?
  • +
+

Notable LLM: LLaMA

+

Chapter 15 Cover Image

+
    +
  • Llama is an open-source chinchilla optimal LLM from Meta Research
  • +
  • Several sizes available, ranging from 7 billion to 65 billion, with at least 1 trillion tokens
  • +
  • Competitively benchmarks against GPT-3 and other state-of-the-art LLMs
  • +
  • Open source but non-commercial license for pre-trained weights
  • +
  • Trained on custom common crawl filtering, C4, GitHub, Wikipedia, books, and scientific papers
  • +
  • Data set replicated by Red Pajama, which is also training models to replicate Llama
  • +
  • Interesting inclusion of GitHub as a training resource
  • +
+

Why include code in LLM training data?

+

Chapter 16 Cover Image

+
    +
  • Including code in training data can improve performance on non-code tasks
  • +
  • OpenAI found this with their Codex model, which was fine-tuned on code and outperformed GPT-3 on reasoning tasks
  • +
  • Since then, people have been adding code to training data
  • +
  • Open source dataset called 'the stack' collects code from GitHub while respecting licenses
  • +
+

Instruction Tuning

+

Chapter 17 Cover Image

+
    +
  • Discusses instruction tuning in GPT models and its impact on performance
  • +
  • Mentions the shift from text completion mindset to instruction following mindset
  • +
  • Supervised fine-tuning helps models become better at zero-shot tasks by using data sets of zero-shot inputs and desired outputs
  • +
  • OpenAI hired thousands of contractors to gather zero-shot data and used reinforcement learning for training
  • +
  • GPT model lineage includes DaVinci, Codex, and various iterations, fine-tuning for specific applications
  • +
  • Fine-tuning imposes an "alignment tax," decreasing few-shot learning ability and model's confidence calibration
  • +
  • Llama model by Stanford team used GPT-3 generated instructions, costing less but with reduced performance compared to GPT-3
  • +
  • A specific data set for instruction tuning in chat-based paradigms is called "Open Assistant"
  • +
+

Notable LLM: RETRO

+

Chapter 18 Cover Image

+
    +
  • Discussing a model called "retrieval enhancing" from DeepMind
  • +
  • Goal: train a smaller model good at reasoning and writing code, but looks up facts from a database
  • +
  • Used "burden-coded" sentences in a trillion-token database for fact retrieval
  • +
  • Not as effective as large language models yet, but shows potential for the future
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/llmops/chapter_0.jpg b/llm-bootcamp/spring-2023/llmops/chapter_0.jpg new file mode 100644 index 00000000..4ed38c05 Binary files /dev/null and b/llm-bootcamp/spring-2023/llmops/chapter_0.jpg differ diff --git a/llm-bootcamp/spring-2023/llmops/chapter_1.jpg b/llm-bootcamp/spring-2023/llmops/chapter_1.jpg new file mode 100644 index 00000000..40b6cf0f Binary files /dev/null and b/llm-bootcamp/spring-2023/llmops/chapter_1.jpg differ diff --git a/llm-bootcamp/spring-2023/llmops/chapter_2.jpg b/llm-bootcamp/spring-2023/llmops/chapter_2.jpg new file mode 100644 index 00000000..ea1a3362 Binary files /dev/null and b/llm-bootcamp/spring-2023/llmops/chapter_2.jpg differ diff --git a/llm-bootcamp/spring-2023/llmops/chapter_3.jpg b/llm-bootcamp/spring-2023/llmops/chapter_3.jpg new file mode 100644 index 00000000..ca678b94 Binary files /dev/null and b/llm-bootcamp/spring-2023/llmops/chapter_3.jpg differ diff --git a/llm-bootcamp/spring-2023/llmops/chapter_4.jpg b/llm-bootcamp/spring-2023/llmops/chapter_4.jpg new file mode 100644 index 00000000..9f40e4da Binary files /dev/null and b/llm-bootcamp/spring-2023/llmops/chapter_4.jpg differ diff --git a/llm-bootcamp/spring-2023/llmops/chapter_5.jpg b/llm-bootcamp/spring-2023/llmops/chapter_5.jpg new file mode 100644 index 00000000..70691f04 Binary files /dev/null and b/llm-bootcamp/spring-2023/llmops/chapter_5.jpg differ diff --git a/llm-bootcamp/spring-2023/llmops/chapter_6.jpg b/llm-bootcamp/spring-2023/llmops/chapter_6.jpg new file mode 100644 index 00000000..a9c7e586 Binary files /dev/null and b/llm-bootcamp/spring-2023/llmops/chapter_6.jpg differ diff --git a/llm-bootcamp/spring-2023/llmops/chapter_7.jpg b/llm-bootcamp/spring-2023/llmops/chapter_7.jpg new file mode 100644 index 00000000..4c1e21ef Binary files /dev/null and b/llm-bootcamp/spring-2023/llmops/chapter_7.jpg differ diff --git a/llm-bootcamp/spring-2023/llmops/chapter_8.jpg b/llm-bootcamp/spring-2023/llmops/chapter_8.jpg new file mode 100644 index 00000000..7f29eb89 Binary files /dev/null and b/llm-bootcamp/spring-2023/llmops/chapter_8.jpg differ diff --git a/llm-bootcamp/spring-2023/llmops/chapter_9.jpg b/llm-bootcamp/spring-2023/llmops/chapter_9.jpg new file mode 100644 index 00000000..807c9888 Binary files /dev/null and b/llm-bootcamp/spring-2023/llmops/chapter_9.jpg differ diff --git a/llm-bootcamp/spring-2023/llmops/chapter_summaries/index.html b/llm-bootcamp/spring-2023/llmops/chapter_summaries/index.html new file mode 100644 index 00000000..49bc16aa --- /dev/null +++ b/llm-bootcamp/spring-2023/llmops/chapter_summaries/index.html @@ -0,0 +1,2386 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Chapter summaries - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Chapter summaries

+ +

Chapter Summaries

+

Why LLMOps?

+

Chapter 0 Cover Image

+
    +
  • Topic of lecture core to whole ethos of full stack deep learning
  • +
  • Started five years ago in AI hype cycle focusing on deep learning
  • +
  • Classes teach about building with neural networks, but not getting into production
  • +
  • Philosophy carried throughout the development of courses
  • +
  • Focus on building applications with language models and considerations for production systems
  • +
  • Space for real production systems with language models is underdeveloped
  • +
  • Lecture will cover assorted topics related to building these applications
  • +
  • Provide high-level pointers, initial choices, and resources for learning more
  • +
  • Aim to tie topics together into a first-pass theory for "LLMops"
  • +
+

Choosing your base LLM

+

Chapter 1 Cover Image

+
    +
  • Building an application on top of LLMs requires choosing which model to use; the best model depends on trade-offs, such as quality, speed, cost, tunability, and data security.
  • +
  • For most use cases, GPT4 is a good starting point.
  • +
  • Proprietary models, like GPT4 and Anthropic, are usually higher quality, but open source models offer more customization and better data security.
  • +
  • Consider licensing when choosing an open source model: permissive licenses (e.g., Apache 2.0) offer more freedom, whereas restricted licenses limit commercial use.
  • +
  • Be cautious with "open source" models released under non-commercial licenses, as they restrict commercial use and may not truly be open source.
  • +
+

Proprietary LLMs

+

Chapter 2 Cover Image

+
    +
  • Discussed proprietary models and ranked them using criteria: number of parameters, size of context window, type of training data, subjective quality score, speed of inference, and fine-tunability.
  • +
  • Number of parameters and training data are proxies for model quality; context window crucial for model usefulness in downstream applications.
  • +
  • Four types of training data: diverse, code, instructions, and human feedback; few models use all four types.
  • +
  • Quality best determined using benchmarks and hands-on evaluations.
  • +
  • GPT-4 recognized as the highest quality model, followed by GPT-3.5 for a faster and cheaper option.
  • +
  • Claude from Anthropic and Cohere's largest model compete for quality and fine-tunability.
  • +
  • For a trade-off of quality in favor of speed and cost, consider Anthropic's offering or alternatives from OpenAI and Cohere.
  • +
+

Open-source LLMs

+

Chapter 3 Cover Image

+
    +
  • Large language models have both proprietary and open-source options
  • +
  • Open-source options include T5, Flan T5, Pythia, Dolly, Stable-LM, Llama, Alpaca, Vicuna, Koala, and Opt
  • +
  • T5 and Flan-T5 have permissive licenses while other options may have license restrictions
  • +
  • Llama ecosystem is well-supported by the community, but not ideal for production
  • +
  • Benchmarks can mislead, assess language model performance on specific tasks
  • +
  • Start projects with GPT-4 to prototype, downsize to GPT-3.5 or Claude if cost/latency is a concern
  • +
  • Cohere is the best for fine-tuning among commercial providers
  • +
  • Open-source may catch up with GPT-3.5 level performance by the end of the year
  • +
+

Iteration and prompt management

+

Chapter 4 Cover Image

+
    +
  • I believe prompt engineering is currently missing tools to make it more like engineering and less like ad hoc experimentation.
  • +
  • Experiment management was impactful in the deep learning world because experiments took a long time to run and there were many parallel experiments, which prompt engineering typically doesn't have.
  • +
  • I suggest three levels of tracking experiments with prompts and chains: 1) Doing nothing and using OpenAI Playground, 2) Tracking prompts in Git, and 3) Using specialized tracking tools for prompts (if necessary).
  • +
  • Most teams should use Git for tracking as it's easy and fits into their current workflows.
  • +
  • Specialized prompt tracking tools should be decoupled from Git and provide a UI for non-technical stakeholders.
  • +
  • Keep an eye out for new tools in this space, as it's rapidly evolving with recent announcements from major providers like Weights & Biases, Comet, and MLflow.
  • +
+

Testing LLMs: Why and why is it hard?

+

Chapter 5 Cover Image

+
    +
  • To ensure changes to a model or prompt are effective, measure performance on a wide range of data representing end-user inputs.
  • +
  • User retention for AI-powered applications depends on trust and reliable output.
  • +
  • Traditional machine learning model testing involves training sets, held-out data, and measuring accuracy, but language models present unique challenges:
  • +
  • You don't know the training data used by API providers like OpenAI.
  • +
  • Production distribution is always different than training distribution.
  • +
  • Metrics are less straightforward and might not capture the diverse behaviors of the model.
  • +
  • Language models require a more diverse understanding of behaviors and qualitative output measurement.
  • +
+

Testing LLMs: What works?

+

Chapter 6 Cover Image

+
    +
  • Two key questions for testing language models: what data to test them on and what metrics to compute on that data
  • +
  • Build evaluation sets incrementally, starting from the beginning while prototyping the model
  • +
  • Add interesting examples to the dataset, focusing on hard examples where the model struggles and different examples that aren't common in the dataset
  • +
  • Utilize the language model to help generate diverse test cases by creating prompts for the tasks you're trying to solve
  • +
  • As the model rolls out to more users, keep adding data to the dataset, considering user dislikes and underrepresented topics for inclusion
  • +
  • Consider the concept of test coverage, aiming for an evaluation set that covers the types of tasks users will actually perform in the system
  • +
  • Test coverage and distribution shift are analogous, but measure different aspects of data relationships
  • +
  • To be effective, test reliability should measure the difference between online and offline performance, ensuring that metrics are relevant to real-world user experiences.
  • +
+

Evaluation metrics for LLMs

+

Chapter 7 Cover Image

+
    +
  • Evaluation metrics for language models depend on the availability of a correct answer, reference answer, previous answer, or human feedback.
  • +
  • If there's a correct answer, use metrics like accuracy.
  • +
  • With a reference answer, employ reference matching metrics like semantic similarity or factual consistency.
  • +
  • If there's a previous answer, ask another language model which answer is better.
  • +
  • When human feedback is available, check if the answer incorporates the feedback.
  • +
  • If none of these options apply, verify output structure or ask the model to grade the answer.
  • +
  • Although automatic evaluation is desirable for faster experimentation, manual checks still play an essential role.
  • +
+

Deployment and monitoring

+

Chapter 8 Cover Image

+
    +
  • Deploying LLM (Language Model) APIs can be simple, but becomes more complex if there's a lot of logic behind API calls.
  • +
  • Techniques to improve LLM output quality include self-critique, sampling multiple outputs, and ensemble techniques.
  • +
  • Monitoring LLMs involves looking at user satisfaction and defining performance metrics, like response length or common issues in production.
  • +
  • Gather user feedback via low friction methods, such as thumbs up/down or short messages.
  • +
  • Common issues with LLMs in production include UI problems, latency, incorrect answers, long-winded responses, and prompt injection attacks.
  • +
  • Use user feedback to improve prompts by finding and addressing themes or problems.
  • +
  • Fine-tuning LLMs can be done through supervised fine-tuning or human feedback, though the latter is more challenging.
  • +
+

Test-driven development for LLMs

+

Chapter 9 Cover Image

+
    +
  • Rapidly evolving field with no established best practices yet
  • +
  • Aim to provide main questions and resources for building applications with LLMS
  • +
  • Introduce a potential structured process: test-driven or behavior-driven development
  • +
  • Main components of process are prompt/chain development, deployment, user feedback, and logging/monitoring
  • +
  • Use interaction data from user feedback to improve model, extract test data, and iterate on prompts
  • +
  • As complexity increases, consider fine-tuning workflow with additional training data
  • +
  • Virtuous cycle of improvement as interaction data from users increases and informs subsequent iterations
  • +
  • Process repeats with individual developer, team, and end-users involved in feedback and improvements
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/llmops/cover.jpg b/llm-bootcamp/spring-2023/llmops/cover.jpg new file mode 100644 index 00000000..023e456e Binary files /dev/null and b/llm-bootcamp/spring-2023/llmops/cover.jpg differ diff --git a/llm-bootcamp/spring-2023/llmops/index.html b/llm-bootcamp/spring-2023/llmops/index.html new file mode 100644 index 00000000..84e48e79 --- /dev/null +++ b/llm-bootcamp/spring-2023/llmops/index.html @@ -0,0 +1,2549 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + LLMOps - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

LLMOps

+
+ +
+ +

Lecture by Josh Tobin. +Published May 9, 2023. +Download slides.

+

Chapter Summaries

+

Why LLMOps?

+

Chapter 0 Cover Image

+
    +
  • Topic of lecture core to whole ethos of full stack deep learning
  • +
  • Started five years ago in AI hype cycle focusing on deep learning
  • +
  • Classes teach about building with neural networks, but not getting into production
  • +
  • Philosophy carried throughout the development of courses
  • +
  • Focus on building applications with language models and considerations for production systems
  • +
  • Space for real production systems with language models is underdeveloped
  • +
  • Lecture will cover assorted topics related to building these applications
  • +
  • Provide high-level pointers, initial choices, and resources for learning more
  • +
  • Aim to tie topics together into a first-pass theory for "LLMops"
  • +
+

Choosing your base LLM

+

Chapter 1 Cover Image

+
    +
  • Building an application on top of LLMs requires choosing which model to use; the best model depends on trade-offs, such as quality, speed, cost, tunability, and data security.
  • +
  • For most use cases, GPT4 is a good starting point.
  • +
  • Proprietary models, like GPT4 and Anthropic, are usually higher quality, but open source models offer more customization and better data security.
  • +
  • Consider licensing when choosing an open source model: permissive licenses (e.g., Apache 2.0) offer more freedom, whereas restricted licenses limit commercial use.
  • +
  • Be cautious with "open source" models released under non-commercial licenses, as they restrict commercial use and may not truly be open source.
  • +
+

Proprietary LLMs

+

Chapter 2 Cover Image

+
    +
  • Discussed proprietary models and ranked them using criteria: number of parameters, size of context window, type of training data, subjective quality score, speed of inference, and fine-tunability.
  • +
  • Number of parameters and training data are proxies for model quality; context window crucial for model usefulness in downstream applications.
  • +
  • Four types of training data: diverse, code, instructions, and human feedback; few models use all four types.
  • +
  • Quality best determined using benchmarks and hands-on evaluations.
  • +
  • GPT-4 recognized as the highest quality model, followed by GPT-3.5 for a faster and cheaper option.
  • +
  • Claude from Anthropic and Cohere's largest model compete for quality and fine-tunability.
  • +
  • For a trade-off of quality in favor of speed and cost, consider Anthropic's offering or alternatives from OpenAI and Cohere.
  • +
+

Open-source LLMs

+

Chapter 3 Cover Image

+
    +
  • Large language models have both proprietary and open-source options
  • +
  • Open-source options include T5, Flan T5, Pythia, Dolly, Stable-LM, Llama, Alpaca, Vicuna, Koala, and Opt
  • +
  • T5 and Flan-T5 have permissive licenses while other options may have license restrictions
  • +
  • Llama ecosystem is well-supported by the community, but not ideal for production
  • +
  • Benchmarks can mislead, assess language model performance on specific tasks
  • +
  • Start projects with GPT-4 to prototype, downsize to GPT-3.5 or Claude if cost/latency is a concern
  • +
  • Cohere is the best for fine-tuning among commercial providers
  • +
  • Open-source may catch up with GPT-3.5 level performance by the end of the year
  • +
+

Iteration and prompt management

+

Chapter 4 Cover Image

+
    +
  • I believe prompt engineering is currently missing tools to make it more like engineering and less like ad hoc experimentation.
  • +
  • Experiment management was impactful in the deep learning world because experiments took a long time to run and there were many parallel experiments, which prompt engineering typically doesn't have.
  • +
  • I suggest three levels of tracking experiments with prompts and chains: 1) Doing nothing and using OpenAI Playground, 2) Tracking prompts in Git, and 3) Using specialized tracking tools for prompts (if necessary).
  • +
  • Most teams should use Git for tracking as it's easy and fits into their current workflows.
  • +
  • Specialized prompt tracking tools should be decoupled from Git and provide a UI for non-technical stakeholders.
  • +
  • Keep an eye out for new tools in this space, as it's rapidly evolving with recent announcements from major providers like Weights & Biases, Comet, and MLflow.
  • +
+

Testing LLMs: Why and why is it hard?

+

Chapter 5 Cover Image

+
    +
  • To ensure changes to a model or prompt are effective, measure performance on a wide range of data representing end-user inputs.
  • +
  • User retention for AI-powered applications depends on trust and reliable output.
  • +
  • Traditional machine learning model testing involves training sets, held-out data, and measuring accuracy, but language models present unique challenges:
  • +
  • You don't know the training data used by API providers like OpenAI.
  • +
  • Production distribution is always different than training distribution.
  • +
  • Metrics are less straightforward and might not capture the diverse behaviors of the model.
  • +
  • Language models require a more diverse understanding of behaviors and qualitative output measurement.
  • +
+

Testing LLMs: What works?

+

Chapter 6 Cover Image

+
    +
  • Two key questions for testing language models: what data to test them on and what metrics to compute on that data
  • +
  • Build evaluation sets incrementally, starting from the beginning while prototyping the model
  • +
  • Add interesting examples to the dataset, focusing on hard examples where the model struggles and different examples that aren't common in the dataset
  • +
  • Utilize the language model to help generate diverse test cases by creating prompts for the tasks you're trying to solve
  • +
  • As the model rolls out to more users, keep adding data to the dataset, considering user dislikes and underrepresented topics for inclusion
  • +
  • Consider the concept of test coverage, aiming for an evaluation set that covers the types of tasks users will actually perform in the system
  • +
  • Test coverage and distribution shift are analogous, but measure different aspects of data relationships
  • +
  • To be effective, test reliability should measure the difference between online and offline performance, ensuring that metrics are relevant to real-world user experiences.
  • +
+

Evaluation metrics for LLMs

+

Chapter 7 Cover Image

+
    +
  • Evaluation metrics for language models depend on the availability of a correct answer, reference answer, previous answer, or human feedback.
  • +
  • If there's a correct answer, use metrics like accuracy.
  • +
  • With a reference answer, employ reference matching metrics like semantic similarity or factual consistency.
  • +
  • If there's a previous answer, ask another language model which answer is better.
  • +
  • When human feedback is available, check if the answer incorporates the feedback.
  • +
  • If none of these options apply, verify output structure or ask the model to grade the answer.
  • +
  • Although automatic evaluation is desirable for faster experimentation, manual checks still play an essential role.
  • +
+

Deployment and monitoring

+

Chapter 8 Cover Image

+
    +
  • Deploying LLM (Language Model) APIs can be simple, but becomes more complex if there's a lot of logic behind API calls.
  • +
  • Techniques to improve LLM output quality include self-critique, sampling multiple outputs, and ensemble techniques.
  • +
  • Monitoring LLMs involves looking at user satisfaction and defining performance metrics, like response length or common issues in production.
  • +
  • Gather user feedback via low friction methods, such as thumbs up/down or short messages.
  • +
  • Common issues with LLMs in production include UI problems, latency, incorrect answers, long-winded responses, and prompt injection attacks.
  • +
  • Use user feedback to improve prompts by finding and addressing themes or problems.
  • +
  • Fine-tuning LLMs can be done through supervised fine-tuning or human feedback, though the latter is more challenging.
  • +
+

Test-driven development for LLMs

+

Chapter 9 Cover Image

+
    +
  • Rapidly evolving field with no established best practices yet
  • +
  • Aim to provide main questions and resources for building applications with LLMS
  • +
  • Introduce a potential structured process: test-driven or behavior-driven development
  • +
  • Main components of process are prompt/chain development, deployment, user feedback, and logging/monitoring
  • +
  • Use interaction data from user feedback to improve model, extract test data, and iterate on prompts
  • +
  • As complexity increases, consider fine-tuning workflow with additional training data
  • +
  • Virtuous cycle of improvement as interaction data from users increases and informs subsequent iterations
  • +
  • Process repeats with individual developer, team, and end-users involved in feedback and improvements
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/prompt-engineering/chapter_0.jpg b/llm-bootcamp/spring-2023/prompt-engineering/chapter_0.jpg new file mode 100644 index 00000000..7e210cbf Binary files /dev/null and b/llm-bootcamp/spring-2023/prompt-engineering/chapter_0.jpg differ diff --git a/llm-bootcamp/spring-2023/prompt-engineering/chapter_1.jpg b/llm-bootcamp/spring-2023/prompt-engineering/chapter_1.jpg new file mode 100644 index 00000000..9eba317a Binary files /dev/null and b/llm-bootcamp/spring-2023/prompt-engineering/chapter_1.jpg differ diff --git a/llm-bootcamp/spring-2023/prompt-engineering/chapter_10.jpg b/llm-bootcamp/spring-2023/prompt-engineering/chapter_10.jpg new file mode 100644 index 00000000..83371cf8 Binary files /dev/null and b/llm-bootcamp/spring-2023/prompt-engineering/chapter_10.jpg differ diff --git a/llm-bootcamp/spring-2023/prompt-engineering/chapter_11.jpg b/llm-bootcamp/spring-2023/prompt-engineering/chapter_11.jpg new file mode 100644 index 00000000..6a326b5d Binary files /dev/null and b/llm-bootcamp/spring-2023/prompt-engineering/chapter_11.jpg differ diff --git a/llm-bootcamp/spring-2023/prompt-engineering/chapter_2.jpg b/llm-bootcamp/spring-2023/prompt-engineering/chapter_2.jpg new file mode 100644 index 00000000..ce01ed51 Binary files /dev/null and b/llm-bootcamp/spring-2023/prompt-engineering/chapter_2.jpg differ diff --git a/llm-bootcamp/spring-2023/prompt-engineering/chapter_3.jpg b/llm-bootcamp/spring-2023/prompt-engineering/chapter_3.jpg new file mode 100644 index 00000000..cb3c40ce Binary files /dev/null and b/llm-bootcamp/spring-2023/prompt-engineering/chapter_3.jpg differ diff --git a/llm-bootcamp/spring-2023/prompt-engineering/chapter_4.jpg b/llm-bootcamp/spring-2023/prompt-engineering/chapter_4.jpg new file mode 100644 index 00000000..28ad650c Binary files /dev/null and b/llm-bootcamp/spring-2023/prompt-engineering/chapter_4.jpg differ diff --git a/llm-bootcamp/spring-2023/prompt-engineering/chapter_5.jpg b/llm-bootcamp/spring-2023/prompt-engineering/chapter_5.jpg new file mode 100644 index 00000000..5ebb8cc9 Binary files /dev/null and b/llm-bootcamp/spring-2023/prompt-engineering/chapter_5.jpg differ diff --git a/llm-bootcamp/spring-2023/prompt-engineering/chapter_6.jpg b/llm-bootcamp/spring-2023/prompt-engineering/chapter_6.jpg new file mode 100644 index 00000000..625ec3b2 Binary files /dev/null and b/llm-bootcamp/spring-2023/prompt-engineering/chapter_6.jpg differ diff --git a/llm-bootcamp/spring-2023/prompt-engineering/chapter_7.jpg b/llm-bootcamp/spring-2023/prompt-engineering/chapter_7.jpg new file mode 100644 index 00000000..c05de6f7 Binary files /dev/null and b/llm-bootcamp/spring-2023/prompt-engineering/chapter_7.jpg differ diff --git a/llm-bootcamp/spring-2023/prompt-engineering/chapter_8.jpg b/llm-bootcamp/spring-2023/prompt-engineering/chapter_8.jpg new file mode 100644 index 00000000..5f2ef90f Binary files /dev/null and b/llm-bootcamp/spring-2023/prompt-engineering/chapter_8.jpg differ diff --git a/llm-bootcamp/spring-2023/prompt-engineering/chapter_9.jpg b/llm-bootcamp/spring-2023/prompt-engineering/chapter_9.jpg new file mode 100644 index 00000000..6865be9e Binary files /dev/null and b/llm-bootcamp/spring-2023/prompt-engineering/chapter_9.jpg differ diff --git a/llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/index.html b/llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/index.html new file mode 100644 index 00000000..7107d8b1 --- /dev/null +++ b/llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/index.html @@ -0,0 +1,2406 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Chapter summaries - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Chapter summaries

+ +

Chapter Summaries

+

Intro

+

Chapter 0 Cover Image

+
    +
  • Dive into technical skills for using language models
  • +
  • Focus on prompt engineering: designing text input to get desired behavior from language models
  • +
  • Language models replace traditional training and fine-tuning techniques in machine learning
  • +
  • Programming language models is like programming in English instead of coding languages
  • +
  • High-level intuitions for prompt engineering: prompts as magic spells
  • +
  • Discuss emerging playbook for effective prompting, including techniques to get desired output from language models
  • +
+

Language models are statistical models of text

+

Chapter 1 Cover Image

+
    +
  • Prompts are not literal magic spells; they are based on linear algebra.
  • +
  • Language models are statistical models of text, similar to how a bell curve is a statistical model of data.
  • +
  • Language models are trained by going through text and predicting the probability of the next word, which is called an auto-regressive model.
  • +
  • These models start with random weights, eventually learning to assign high probabilities to text that resembles real-world samples.
  • +
+

But "statistical model" gives bad intuition

+

Chapter 2 Cover Image

+
    +
  • Language models can be thought of as statistical pattern matchers, but this can also give bad intuitions.
  • +
  • Traditional simple statistical models, like linear regression, are not the best way to think about language models.
  • +
  • A better intuition comes from probabilistic programs, which allow manipulation of random variables and can represent complex statistics.
  • +
  • Probabilistic programs can be represented by graphical models, providing insight into complex text models.
  • +
  • The Language Model Cascades paper by Dohan et al. dives into detail on probabilistic programs and their applications to language models.
  • +
+

Prompts are magic spells

+

Chapter 3 Cover Image

+
    +
  • Drawing inspiration from Arthur C Clarke's laws of technology, which suggests that advanced technology is similar to magic
  • +
  • Prompts are like magic spells, using words to achieve impossible effects but requiring complex rules
  • +
  • Spending too much time learning these complex rules can negatively impact mental health
  • +
  • Three magical intuitions for using prompts:
  • +
  • Pre-trained models (e.g. GPT-3, Llama) - prompts are portals to alternate universes
  • +
  • Instruction-tuned models (e.g. ChatGPT, Alpaca) - prompts are used to make wishes
  • +
  • Agent simulation (latest language models) - prompts create a Golem
  • +
+

Prompts are portals to alternate universes

+

Chapter 4 Cover Image

+
    +
  • The language model creates a portal to an alternate universe where desired documents exist by weighting all possible documents based on their probability.
  • +
  • The primary goal of prompting is subtractive; it focuses the mass of predictions to hone in on a specific world by conditioning the probabilistic model.
  • +
  • The language model can generate text from nearby universes for similarities, but cannot provide specific or novel information from another universe (e.g., a cure for cancer).
  • +
  • The model can help find ideas and documents similar to existing ones or combine ideas that haven't been combined yet.
  • +
+

A prompt can make a wish come true

+

Chapter 5 Cover Image

+
    +
  • Core intuition: Language models shape and sculpt from the set of all possible documents and universes; Instruction-tuned models (like ChatGPT) can respond to wishes and commands.
  • +
  • An example of overcoming bias: Asking the model to ensure answers are unbiased and do not rely on stereotypes greatly improves performance.
  • +
  • Be precise when prompting language models and learn the rules the "genie" operates by.
  • +
  • Suggestions to improve instructional prompts:
  • +
  • Simplify and focus on low-level patterns of text rather than conversational complexity.
  • +
  • Turn descriptions into bulleted lists; language models tend to only focus on the beginning of descriptions.
  • +
  • Replace negation statements with assertions (e.g., instead of "don't be stereotyped," say, "please ensure your answer does not rely on stereotypes").
  • +
  • Instruction fine-tuned models are essentially like annotators with no context; treat them as such for better performance.
  • +
+

A prompt can create a golem

+

Chapter 6 Cover Image

+
    +
  • Large language models can create "golems" or artificial agents with specific personas, similar to the golem creature from Jewish folklore
  • +
  • Personas in language models can help improve performance on tasks like translations by putting the model into a situational context
  • +
  • People have created models with detailed personas in various settings, including video game worlds
  • +
  • Language models become better by internally modeling processes that produce text, such as understanding the context and environment in which utterances are made
  • +
  • Natural language processing faces challenges with large language models as they may lack communicative intentions, which humans naturally have
  • +
  • By designing prompts carefully, one can get a language model to simulate agents, improving its predictions and understanding of context.
  • +
+

Limitations of LLMs as simulators

+

Chapter 7 Cover Image

+
    +
  • Our universal simulators are trained on text humans have written, not on all data or states of the universe.
  • +
  • Simulations will be related to human-written data, like fictional super intelligences (e.g. HAL 9000), not actual super intelligent AIs.
  • +
  • Language models can simulate human thinking well for short timeframes (e.g. reactions to social media posts), but struggle for longer periods and personal contexts.
  • +
  • Models can perform well in simulating fictional personas and can approximate calculators or interpreters, but cannot replace them or access live data.
  • +
  • Wherever possible, replace weak simulators with the real deal (e.g. run Python code in an actual kernel).
  • +
  • Pre-trained models are mostly alternate universe document generators, and can be agent simulators with varying quality depending on the model and agent.
  • +
+

Prompting techniques are mostly tricks

+

Chapter 8 Cover Image

+
    +
  • This section focuses on prompt engineering tricks and techniques.
  • +
  • Many prompt engineering papers can actually be summarized in a few sentences, but they include pages of benchmark marketing.
  • +
  • There isn't much depth to these tricks, unlike the core language modeling aspect which has mathematical depth.
  • +
  • Two things to be cautious of: few-shot learning as an approach and potential issues with tokenization.
  • +
  • I will discuss some misconceptions and provide tips for handling these issues.
  • +
+

Few-shot learning isn't the right model for prompting

+

Chapter 9 Cover Image

+
    +
  • Language models like GPT-3 can learn tasks from prompts, but it was unclear if they would actually be useful.
  • +
  • The GPT-3 paper called these models "few-shot learners" and showed they can learn tasks like math and translation.
  • +
  • However, the model often struggles to move away from pre-training knowledge.
  • +
  • For example, GPT-3 tends to ignore permuted labels for sentiment analysis and sticks to its original understanding.
  • +
  • Latest language models can handle permuted labels, but not perfectly, and require many examples to accomplish this.
  • +
  • Treating the prompt as a way to do few-shot learning might not be an ideal approach.
  • +
+

Character-level operations are hard

+

Chapter 10 Cover Image

+
    +
  • Models see tokens, not characters; they struggle with tasks like rotating and reversing words
  • +
  • Adding spaces between letters can change tokenization and improve performance
  • +
  • GPT-4 can handle some challenges (e.g. summary with words starting with G) but still has limitations
  • +
  • For tasks like string manipulation, it's better to use traditional programming instead of language models
  • +
+

The prompting playbook: reasoning, reflection, & ensembling

+

Chapter 11 Cover Image

+
    +
  • Language models perform well with formatted text; using structured text like pseudocode can improve results
  • +
  • Decompose tasks into smaller pieces in your prompt to make the language model generate each piece; automate decomposition for better performance
  • +
  • Elicit reasoning capabilities from the model by carefully tuning the prompt, such as using "Let's think step-by-step"
  • +
  • Ensemble results of multiple models for more accurate answers and use randomness for greater heterogeneity in responses
  • +
  • Combine prompting techniques (e.g., few-shot, Chain of Thought, ensembling) to increase performance, but be mindful of the impact on latency and compute costs
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/prompt-engineering/cover.jpg b/llm-bootcamp/spring-2023/prompt-engineering/cover.jpg new file mode 100644 index 00000000..b2b41ec8 Binary files /dev/null and b/llm-bootcamp/spring-2023/prompt-engineering/cover.jpg differ diff --git a/llm-bootcamp/spring-2023/prompt-engineering/index.html b/llm-bootcamp/spring-2023/prompt-engineering/index.html new file mode 100644 index 00000000..16e72d5b --- /dev/null +++ b/llm-bootcamp/spring-2023/prompt-engineering/index.html @@ -0,0 +1,2587 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Learn to Spell: Prompt Engineering - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Learn to Spell: Prompt Engineering

+
+ +
+ +

Lecture by Charles Frye. +Published May 9, 2023. +Download slides.

+

Chapter Summaries

+

Intro

+

Chapter 0 Cover Image

+
    +
  • Dive into technical skills for using language models
  • +
  • Focus on prompt engineering: designing text input to get desired behavior from language models
  • +
  • Language models replace traditional training and fine-tuning techniques in machine learning
  • +
  • Programming language models is like programming in English instead of coding languages
  • +
  • High-level intuitions for prompt engineering: prompts as magic spells
  • +
  • Discuss emerging playbook for effective prompting, including techniques to get desired output from language models
  • +
+

Language models are statistical models of text

+

Chapter 1 Cover Image

+
    +
  • Prompts are not literal magic spells; they are based on linear algebra.
  • +
  • Language models are statistical models of text, similar to how a bell curve is a statistical model of data.
  • +
  • Language models are trained by going through text and predicting the probability of the next word, which is called an auto-regressive model.
  • +
  • These models start with random weights, eventually learning to assign high probabilities to text that resembles real-world samples.
  • +
+

But "statistical model" gives bad intuition

+

Chapter 2 Cover Image

+
    +
  • Language models can be thought of as statistical pattern matchers, but this can also give bad intuitions.
  • +
  • Traditional simple statistical models, like linear regression, are not the best way to think about language models.
  • +
  • A better intuition comes from probabilistic programs, which allow manipulation of random variables and can represent complex statistics.
  • +
  • Probabilistic programs can be represented by graphical models, providing insight into complex text models.
  • +
  • The Language Model Cascades paper by Dohan et al. dives into detail on probabilistic programs and their applications to language models.
  • +
+

Prompts are magic spells

+

Chapter 3 Cover Image

+
    +
  • Drawing inspiration from Arthur C Clarke's laws of technology, which suggests that advanced technology is similar to magic
  • +
  • Prompts are like magic spells, using words to achieve impossible effects but requiring complex rules
  • +
  • Spending too much time learning these complex rules can negatively impact mental health
  • +
  • Three magical intuitions for using prompts:
  • +
  • Pre-trained models (e.g. GPT-3, Llama) - prompts are portals to alternate universes
  • +
  • Instruction-tuned models (e.g. ChatGPT, Alpaca) - prompts are used to make wishes
  • +
  • Agent simulation (latest language models) - prompts create a Golem
  • +
+

Prompts are portals to alternate universes

+

Chapter 4 Cover Image

+
    +
  • The language model creates a portal to an alternate universe where desired documents exist by weighting all possible documents based on their probability.
  • +
  • The primary goal of prompting is subtractive; it focuses the mass of predictions to hone in on a specific world by conditioning the probabilistic model.
  • +
  • The language model can generate text from nearby universes for similarities, but cannot provide specific or novel information from another universe (e.g., a cure for cancer).
  • +
  • The model can help find ideas and documents similar to existing ones or combine ideas that haven't been combined yet.
  • +
+

A prompt can make a wish come true

+

Chapter 5 Cover Image

+
    +
  • Core intuition: Language models shape and sculpt from the set of all possible documents and universes; Instruction-tuned models (like ChatGPT) can respond to wishes and commands.
  • +
  • An example of overcoming bias: Asking the model to ensure answers are unbiased and do not rely on stereotypes greatly improves performance.
  • +
  • Be precise when prompting language models and learn the rules the "genie" operates by.
  • +
  • Suggestions to improve instructional prompts:
  • +
  • Simplify and focus on low-level patterns of text rather than conversational complexity.
  • +
  • Turn descriptions into bulleted lists; language models tend to only focus on the beginning of descriptions.
  • +
  • Replace negation statements with assertions (e.g., instead of "don't be stereotyped," say, "please ensure your answer does not rely on stereotypes").
  • +
  • Instruction fine-tuned models are essentially like annotators with no context; treat them as such for better performance.
  • +
+

A prompt can create a golem

+

Chapter 6 Cover Image

+
    +
  • Large language models can create "golems" or artificial agents with specific personas, similar to the golem creature from Jewish folklore
  • +
  • Personas in language models can help improve performance on tasks like translations by putting the model into a situational context
  • +
  • People have created models with detailed personas in various settings, including video game worlds
  • +
  • Language models become better by internally modeling processes that produce text, such as understanding the context and environment in which utterances are made
  • +
  • Natural language processing faces challenges with large language models as they may lack communicative intentions, which humans naturally have
  • +
  • By designing prompts carefully, one can get a language model to simulate agents, improving its predictions and understanding of context.
  • +
+

Limitations of LLMs as simulators

+

Chapter 7 Cover Image

+
    +
  • Our universal simulators are trained on text humans have written, not on all data or states of the universe.
  • +
  • Simulations will be related to human-written data, like fictional super intelligences (e.g. HAL 9000), not actual super intelligent AIs.
  • +
  • Language models can simulate human thinking well for short timeframes (e.g. reactions to social media posts), but struggle for longer periods and personal contexts.
  • +
  • Models can perform well in simulating fictional personas and can approximate calculators or interpreters, but cannot replace them or access live data.
  • +
  • Wherever possible, replace weak simulators with the real deal (e.g. run Python code in an actual kernel).
  • +
  • Pre-trained models are mostly alternate universe document generators, and can be agent simulators with varying quality depending on the model and agent.
  • +
+

Prompting techniques are mostly tricks

+

Chapter 8 Cover Image

+
    +
  • This section focuses on prompt engineering tricks and techniques.
  • +
  • Many prompt engineering papers can actually be summarized in a few sentences, but they include pages of benchmark marketing.
  • +
  • There isn't much depth to these tricks, unlike the core language modeling aspect which has mathematical depth.
  • +
  • Two things to be cautious of: few-shot learning as an approach and potential issues with tokenization.
  • +
  • I will discuss some misconceptions and provide tips for handling these issues.
  • +
+

Few-shot learning isn't the right model for prompting

+

Chapter 9 Cover Image

+
    +
  • Language models like GPT-3 can learn tasks from prompts, but it was unclear if they would actually be useful.
  • +
  • The GPT-3 paper called these models "few-shot learners" and showed they can learn tasks like math and translation.
  • +
  • However, the model often struggles to move away from pre-training knowledge.
  • +
  • For example, GPT-3 tends to ignore permuted labels for sentiment analysis and sticks to its original understanding.
  • +
  • Latest language models can handle permuted labels, but not perfectly, and require many examples to accomplish this.
  • +
  • Treating the prompt as a way to do few-shot learning might not be an ideal approach.
  • +
+

Character-level operations are hard

+

Chapter 10 Cover Image

+
    +
  • Models see tokens, not characters; they struggle with tasks like rotating and reversing words
  • +
  • Adding spaces between letters can change tokenization and improve performance
  • +
  • GPT-4 can handle some challenges (e.g. summary with words starting with G) but still has limitations
  • +
  • For tasks like string manipulation, it's better to use traditional programming instead of language models
  • +
+

The prompting playbook: reasoning, reflection, & ensembling

+

Chapter 11 Cover Image

+
    +
  • Language models perform well with formatted text; using structured text like pseudocode can improve results
  • +
  • Decompose tasks into smaller pieces in your prompt to make the language model generate each piece; automate decomposition for better performance
  • +
  • Elicit reasoning capabilities from the model by carefully tuning the prompt, such as using "Let's think step-by-step"
  • +
  • Ensemble results of multiple models for more accurate answers and use randomness for greater heterogeneity in responses
  • +
  • Combine prompting techniques (e.g., few-shot, Chain of Thought, ensembling) to increase performance, but be mindful of the impact on latency and compute costs
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_0.jpg b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_0.jpg new file mode 100644 index 00000000..44234f55 Binary files /dev/null and b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_0.jpg differ diff --git a/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_1.jpg b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_1.jpg new file mode 100644 index 00000000..d4c0dc4e Binary files /dev/null and b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_1.jpg differ diff --git a/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_2.jpg b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_2.jpg new file mode 100644 index 00000000..1499c7c6 Binary files /dev/null and b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_2.jpg differ diff --git a/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_3.jpg b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_3.jpg new file mode 100644 index 00000000..6f1c0473 Binary files /dev/null and b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_3.jpg differ diff --git a/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_4.jpg b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_4.jpg new file mode 100644 index 00000000..7b32d837 Binary files /dev/null and b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_4.jpg differ diff --git a/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_5.jpg b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_5.jpg new file mode 100644 index 00000000..558f090e Binary files /dev/null and b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_5.jpg differ diff --git a/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_6.jpg b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_6.jpg new file mode 100644 index 00000000..8c965280 Binary files /dev/null and b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_6.jpg differ diff --git a/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_7.jpg b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_7.jpg new file mode 100644 index 00000000..67763384 Binary files /dev/null and b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_7.jpg differ diff --git a/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_8.jpg b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_8.jpg new file mode 100644 index 00000000..29f9b1b0 Binary files /dev/null and b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_8.jpg differ diff --git a/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_9.jpg b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_9.jpg new file mode 100644 index 00000000..7ebc3a8b Binary files /dev/null and b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_9.jpg differ diff --git a/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/index.html b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/index.html new file mode 100644 index 00000000..6396b9ca --- /dev/null +++ b/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/index.html @@ -0,0 +1,2367 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Chapter summaries - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Chapter summaries

+ +

Chapter Summaries

+

Why train your own LLMs?

+

Chapter 0 Cover Image

+
    +
  • Topic of the lecture: how to train large language models
  • +
  • Reasons for training your own models are customization, reduce dependency, cost efficiency, data privacy and control over updates
  • +
  • Lecture covers the process of training Ghostwriter code completion model
  • +
  • Ghostwriter is a competitor to Co-pilot, used for code generation
  • +
+

The Modern LLM Stack

+

Chapter 1 Cover Image

+
    +
  • Replit uses Databricks for all of their data pipelines, including pre-processing, summary statistics, analytics transformations, and more.
  • +
  • Replit also makes use of Hugging Face for data sets, pre-trained models, tokenizers, and inference tools.
  • +
  • Mosaic ML is used for GPU nodes and model training, with pre-configured LLM configurations available.
  • +
  • The process is divided into three stages: data processing, model training, and deployment/production.
  • +
+

Data Pipelines: Databricks & Hugging Face

+

Chapter 2 Cover Image

+
    +
  • The data pipeline starts with a large corpus of permissively licensed code data from The Stack.
  • +
  • The data set comes from the GitHub archive and undergoes license filtering and near-deduplication.
  • +
  • The data set contains programming languages in the hundreds.
  • +
  • Databricks is used for processing and transformations, rather than Hugging Face tooling.
  • +
  • Databricks allows for more control over the data and enables processing at scale.
  • +
  • Proprietary data sources and data sets not on Hugging Face can be included in the training set.
  • +
  • The process is tractable and extensible.
  • +
  • Pre-processing steps are important in understanding the data set.
  • +
+

Preprocessing

+

Chapter 3 Cover Image

+
    +
  • Anonymizing the data is an important pre-processing step, which involves removing emails, IP addresses, and secret keys.
  • +
  • Auto-generated code and minified code are also removed using regexes and other heuristics.
  • +
  • Code that doesn't compile or is not parsable is removed to remove bugs and improve model training.
  • +
  • The team uses filters based on average line length, maximum line length, and percentage of alphanumeric characters.
  • +
  • Metrics such as the number of GitHub stars or issues do not necessarily improve model quality.
  • +
  • The team also trains its own tokenizer.
  • +
+

Tokenizer Training

+

Chapter 4 Cover Image

+
    +
  • Tokenizers are made up of a tokenization algorithm and a vocabulary.
  • +
  • Standard tokenizers are available on Hugging Face, but custom tokenizers can be trained on domain-specific data.
  • +
  • A custom tokenizer can result in a smaller vocabulary, which speeds up model training and inference while capturing more relevant information.
  • +
  • The tokenizer feeds back into the data pipeline and the training process, making it an integral part of the model.
  • +
+

Running Training: MosaicML, Weights & Biases

+

Chapter 5 Cover Image

+
    +
  • Mosaic ML provides GPUs from multiple Cloud providers at reduced prices
  • +
  • They have well-tuned LLM training configurations for specific models
  • +
  • The manager infrastructure is fault-tolerant and has an easy-to-use CLI for training runs
  • +
  • The speaker found using Mosaic ML worth it due to these benefits
  • +
  • They use Weights & Biases for logging during training runs
  • +
+

Testing & Evaluation: HumanEval, Hugging Face

+

Chapter 6 Cover Image

+
    +
  • Testing language models is difficult and time-consuming
  • +
  • HumanEval is a common dataset for testing code generation models
  • +
  • Hugging Face's code inference tool is useful for running tests quickly
  • +
  • Running tests for multiple languages and certain tasks, like web completion, is more difficult
  • +
  • Models need to be tested on unseen data to prevent bias
  • +
  • Models can score well on tests but still not be practical or effective
  • +
+

Deployment: FasterTransformer, Triton Server, k8s

+

Chapter 7 Cover Image

+
    +
  • Deployment into production is a complex topic with many factors to consider
  • +
  • Replit uses FasterTransformer and NVIDIA's Triton server for optimized performance
  • +
  • Trton server allows for multiple model instances per GPU or multiple GPUs per model, with useful features like batching and request cancellation for reducing latency
  • +
  • Auto-scaling infrastructure is used for running the models, but there are unique challenges for deployed models such as larger model sizes and specific GPU requirements
  • +
  • Dealing with GPU shortages in individual zones is necessary
  • +
+

Lessons learned: data-centrism, eval, and collaboration

+

Chapter 8 Cover Image

+
    +
  • Data is the most difficult part of the process
  • +
  • Good pipelines are important for scalability and quick iteration
  • +
  • Data is a critical factor in model quality and output
  • +
  • Human evaluation and user testing are important for model vibes and usefulness
  • +
  • Collaboration across the team is key to ensure all moving parts are working together
  • +
+

What makes a good LLM engineer?

+

Chapter 9 Cover Image

+
    +
  • A good engineer in this field requires a mix of research and engineering mindset
  • +
  • Working with data at scale is crucial, including the ability to move data into distributed pipelines
  • +
  • A strong technical background in stats, computer science, algorithms, and data structures is important
  • +
  • Skilled software development, including familiarity with libraries and frameworks like PyTorch is essential
  • +
  • Engineers who appreciate and build in CI/CD help with the fast iteration loop for training models
  • +
  • The replit team is hiring for these types of problems and welcomes interested applicants to speak with them about opportunities
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/shabani-train-your-own/cover.jpg b/llm-bootcamp/spring-2023/shabani-train-your-own/cover.jpg new file mode 100644 index 00000000..cc140d61 Binary files /dev/null and b/llm-bootcamp/spring-2023/shabani-train-your-own/cover.jpg differ diff --git a/llm-bootcamp/spring-2023/shabani-train-your-own/index.html b/llm-bootcamp/spring-2023/shabani-train-your-own/index.html new file mode 100644 index 00000000..e2b67926 --- /dev/null +++ b/llm-bootcamp/spring-2023/shabani-train-your-own/index.html @@ -0,0 +1,2530 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Reza Shabani: How to train your own LLM - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Reza Shabani: How to train your own LLM

+
+ +
+ +

Lecture by Reza Shabani. +Published May 25, 2023. +Download slides.

+

Chapter Summaries

+

Why train your own LLMs?

+

Chapter 0 Cover Image

+
    +
  • Topic of the lecture: how to train large language models
  • +
  • Reasons for training your own models are customization, reduce dependency, cost efficiency, data privacy and control over updates
  • +
  • Lecture covers the process of training Ghostwriter code completion model
  • +
  • Ghostwriter is a competitor to Co-pilot, used for code generation
  • +
+

The Modern LLM Stack

+

Chapter 1 Cover Image

+
    +
  • Replit uses Databricks for all of their data pipelines, including pre-processing, summary statistics, analytics transformations, and more.
  • +
  • Replit also makes use of Hugging Face for data sets, pre-trained models, tokenizers, and inference tools.
  • +
  • Mosaic ML is used for GPU nodes and model training, with pre-configured LLM configurations available.
  • +
  • The process is divided into three stages: data processing, model training, and deployment/production.
  • +
+

Data Pipelines: Databricks & Hugging Face

+

Chapter 2 Cover Image

+
    +
  • The data pipeline starts with a large corpus of permissively licensed code data from The Stack.
  • +
  • The data set comes from the GitHub archive and undergoes license filtering and near-deduplication.
  • +
  • The data set contains programming languages in the hundreds.
  • +
  • Databricks is used for processing and transformations, rather than Hugging Face tooling.
  • +
  • Databricks allows for more control over the data and enables processing at scale.
  • +
  • Proprietary data sources and data sets not on Hugging Face can be included in the training set.
  • +
  • The process is tractable and extensible.
  • +
  • Pre-processing steps are important in understanding the data set.
  • +
+

Preprocessing

+

Chapter 3 Cover Image

+
    +
  • Anonymizing the data is an important pre-processing step, which involves removing emails, IP addresses, and secret keys.
  • +
  • Auto-generated code and minified code are also removed using regexes and other heuristics.
  • +
  • Code that doesn't compile or is not parsable is removed to remove bugs and improve model training.
  • +
  • The team uses filters based on average line length, maximum line length, and percentage of alphanumeric characters.
  • +
  • Metrics such as the number of GitHub stars or issues do not necessarily improve model quality.
  • +
  • The team also trains its own tokenizer.
  • +
+

Tokenizer Training

+

Chapter 4 Cover Image

+
    +
  • Tokenizers are made up of a tokenization algorithm and a vocabulary.
  • +
  • Standard tokenizers are available on Hugging Face, but custom tokenizers can be trained on domain-specific data.
  • +
  • A custom tokenizer can result in a smaller vocabulary, which speeds up model training and inference while capturing more relevant information.
  • +
  • The tokenizer feeds back into the data pipeline and the training process, making it an integral part of the model.
  • +
+

Running Training: MosaicML, Weights & Biases

+

Chapter 5 Cover Image

+
    +
  • Mosaic ML provides GPUs from multiple Cloud providers at reduced prices
  • +
  • They have well-tuned LLM training configurations for specific models
  • +
  • The manager infrastructure is fault-tolerant and has an easy-to-use CLI for training runs
  • +
  • The speaker found using Mosaic ML worth it due to these benefits
  • +
  • They use Weights & Biases for logging during training runs
  • +
+

Testing & Evaluation: HumanEval, Hugging Face

+

Chapter 6 Cover Image

+
    +
  • Testing language models is difficult and time-consuming
  • +
  • HumanEval is a common dataset for testing code generation models
  • +
  • Hugging Face's code inference tool is useful for running tests quickly
  • +
  • Running tests for multiple languages and certain tasks, like web completion, is more difficult
  • +
  • Models need to be tested on unseen data to prevent bias
  • +
  • Models can score well on tests but still not be practical or effective
  • +
+

Deployment: FasterTransformer, Triton Server, k8s

+

Chapter 7 Cover Image

+
    +
  • Deployment into production is a complex topic with many factors to consider
  • +
  • Replit uses FasterTransformer and NVIDIA's Triton server for optimized performance
  • +
  • Trton server allows for multiple model instances per GPU or multiple GPUs per model, with useful features like batching and request cancellation for reducing latency
  • +
  • Auto-scaling infrastructure is used for running the models, but there are unique challenges for deployed models such as larger model sizes and specific GPU requirements
  • +
  • Dealing with GPU shortages in individual zones is necessary
  • +
+

Lessons learned: data-centrism, eval, and collaboration

+

Chapter 8 Cover Image

+
    +
  • Data is the most difficult part of the process
  • +
  • Good pipelines are important for scalability and quick iteration
  • +
  • Data is a critical factor in model quality and output
  • +
  • Human evaluation and user testing are important for model vibes and usefulness
  • +
  • Collaboration across the team is key to ensure all moving parts are working together
  • +
+

What makes a good LLM engineer?

+

Chapter 9 Cover Image

+
    +
  • A good engineer in this field requires a mix of research and engineering mindset
  • +
  • Working with data at scale is crucial, including the ability to move data into distributed pipelines
  • +
  • A strong technical background in stats, computer science, algorithms, and data structures is important
  • +
  • Skilled software development, including familiarity with libraries and frameworks like PyTorch is essential
  • +
  • Engineers who appreciate and build in CI/CD help with the fast iteration loop for training models
  • +
  • The replit team is hiring for these types of problems and welcomes interested applicants to speak with them about opportunities
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/sponsors/index.html b/llm-bootcamp/spring-2023/sponsors/index.html new file mode 100644 index 00000000..8d65e2a8 --- /dev/null +++ b/llm-bootcamp/spring-2023/sponsors/index.html @@ -0,0 +1,2182 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Info for Sponsors - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + + + + + + + + + + + + + +
+
+ + + + + + + + +

Info for Sponsors

+

We offer three tiers of sponsorship for the FSDL 2023 LLM Bootcamp:

+
    +
  1. +

    Vector Tier sponsors receive the following benefits: logo displayed on website and during conference, verbal acknowledgement. Vector Tier sponsorships are available for $1500.

    +
  2. +
  3. +

    Matrix Tier sponsors receive all the benefits of the Vector Tier, plus: logo displayed in between talks, 6' table with two seats, and a conference registration. Matrix Tier sponsorships are available for $4500.

    +
  4. +
  5. +

    Tensor Tier sponsors receive all the benefits of the Matrix Tier, plus: logo displayed on a banner in the registration area, access to an opt-in database of attendees with information about job and job-seeking status, and two additional registrations (for a total of three). Tensor Tier sponsorships are available for $10000.

    +
  6. +
+

Contact +sponsorships@fullstackdeeplearning.com +if you're interested in sponsoring the conference!

+ + + + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_0.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_0.jpg new file mode 100644 index 00000000..beba1d3f Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_0.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_1.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_1.jpg new file mode 100644 index 00000000..cfc8e91e Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_1.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_10.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_10.jpg new file mode 100644 index 00000000..aaa3f912 Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_10.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_11.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_11.jpg new file mode 100644 index 00000000..ebbb537d Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_11.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_12.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_12.jpg new file mode 100644 index 00000000..fb69bafe Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_12.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_13.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_13.jpg new file mode 100644 index 00000000..dc6913c0 Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_13.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_14.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_14.jpg new file mode 100644 index 00000000..1f11e7c0 Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_14.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_2.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_2.jpg new file mode 100644 index 00000000..62b9c55e Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_2.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_3.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_3.jpg new file mode 100644 index 00000000..aae35d4c Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_3.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_4.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_4.jpg new file mode 100644 index 00000000..6bc0f667 Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_4.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_5.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_5.jpg new file mode 100644 index 00000000..780f788f Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_5.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_6.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_6.jpg new file mode 100644 index 00000000..4a4a9f6b Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_6.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_7.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_7.jpg new file mode 100644 index 00000000..11daaf1e Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_7.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_8.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_8.jpg new file mode 100644 index 00000000..2a2facc5 Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_8.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_9.jpg b/llm-bootcamp/spring-2023/ux-for-luis/chapter_9.jpg new file mode 100644 index 00000000..6efdbbd9 Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/chapter_9.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/index.html b/llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/index.html new file mode 100644 index 00000000..ef524727 --- /dev/null +++ b/llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/index.html @@ -0,0 +1,2476 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Chapter summaries - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Chapter summaries

+ +

Chapter Summaries

+

Intro

+

Chapter 0 Cover Image

+
    +
  • Next lecture: user experience for language user interfaces
  • +
  • Joint lecture with Charles
  • +
  • Discuss principles of user interfaces
  • +
  • How to build great interfaces
  • +
  • Brief history of language user interface pattern
  • +
  • Include case studies
  • +
+

A brief history of user interfaces

+

Chapter 1 Cover Image

+
    +
  • User interfaces are where a person meets the world and have historically been analog, continuous, and physical.
  • +
  • Language was the first digital interface, followed by writing, and later, computer terminals and graphical user interfaces.
  • +
  • Web interfaces became more text-based with hypertext, links, and text boxes.
  • +
  • Mobile technology introduced significant developments like visual interface (input and output), constant tracking, and location-based services.
  • +
  • A new step change in user interfaces is emerging: Language User Interfaces (LUIs) that let users type what they want to see or do, and the AI executes the task.
  • +
+

What makes a good user interfaces?

+

Chapter 2 Cover Image

+
    +
  • A good user interface depends on specific needs and context
  • +
  • Some systems require a dashboard with multiple controls for immediate access
  • +
  • Others may just need a steering wheel, pedals, and gearbox
  • +
  • As technology changes, user interfaces might reduce (e.g., self-driving cars)
  • +
  • The best interface considers both technological capabilities and human psychology
  • +
+

Design of Everyday Things

+

Chapter 3 Cover Image

+
    +
  • Good design principles can be found in the book "The Design of Everyday Things"
  • +
  • Affordances are possible actions offered by an object; intuitive use is an example of a good affordance
  • +
  • Signifiers are cues on how to use an object, should be clear and consistent with user expectations
  • +
  • Mapping refers to the relationship between controls and their effects, should be intuitive
  • +
  • Providing immediate and clear feedback is important for user satisfaction
  • +
  • Empathy for users is crucial in human-centered design, there is no "user error"
  • +
  • Understanding users' true goals can reveal alternative solutions to their problems
  • +
  • Consider users with disabilities or different backgrounds and experiences; everyone may be "disabled" at some point in life
  • +
+

Don't Make me Think

+

Chapter 4 Cover Image

+
    +
  • A great book for web interfaces is "Don't Make Me Think".
  • +
  • Design for scanning, not reading; make actionable things unambiguous, instinctive, and conventional.
  • +
  • Less is more; reduce the number of words and choices for users.
  • +
  • Testing with real users is crucial for designing the right interface.
  • +
  • During user tests, observe their confusion and make improvements accordingly.
  • +
  • Using this approach helped improve my first startup's interface significantly.
  • +
+

AI-powered Product Interfaces

+

Chapter 5 Cover Image

+
    +
  • Different levels of AI application: AI worse than humans, as good as humans, or better than humans.
  • +
  • Consider the consequences of AI and user mistakes: dangerous or mostly fine.
  • +
  • No AI if performance worse than human and mistakes are dangerous (e.g., self-driving cars currently).
  • +
  • Replace humans if AI is superhuman and mistakes are dangerous.
  • +
  • For other cases, AI can provide assistance with proper user interface.
  • +
  • AI should:
  • +
  • Inform and educate the user (e.g. Grammarly).
  • +
  • Provide affordances for fixing mistakes (e.g. speech-to-text on phone).
  • +
  • Incentivize user to provide feedback (e.g. Mid-Journey image selection).
  • +
  • A "data flywheel" effect: user feedback helps improve the AI, attracting more users and further improving the AI.
  • +
+

LUI Patterns

+

Chapter 6 Cover Image

+
    +
  • Discussing language user interface patterns observed
  • +
  • Examples: click to complete, autocomplete, command pilot, one-on-one chat, guiding questions
  • +
  • Considerations: interface boundaries, accuracy requirements, latency sensitivity, user incentives for feedback
  • +
  • Goal: stimulate thought and noticing trends, not prescriptive advice
  • +
+

Click-to-complete (OpenAI Playground)

+

Chapter 7 Cover Image

+
    +
  • OpenAI Playground became more popular than expected, used for various purposes beyond software development
  • +
  • Users type text, click submit, and see AI response in green; they can edit their input or AI's response and resubmit for more AI text
  • +
  • Power user features such as temperature, stop sequences, and top P are exposed
  • +
  • Issues with the interface: separate from users' main workspace, unintuitive text color signifier, and accuracy requirements are medium
  • +
  • Sensitivity to latency is medium; streaming tokens used to make it seem faster
  • +
  • Incentives to provide feedback are lacking; thumbs up/down buttons not very effective
  • +
  • Some tools, like matt.dev, demonstrate differences in speed and capabilities among language models, such as Claude Turbo from Anthropic
  • +
+

Auto-Complete (Github Copilot)

+

Chapter 8 Cover Image

+
    +
  • GitHub Copilot offers code completion suggestions in the text editor.
  • +
  • On Mac, option + slash can be used to cycle through suggestions.
  • +
  • The interface boundary is well-designed, integrating suggestions passively without interfering with existing tools.
  • +
  • High latency sensitivity requires suggestions to appear quickly, while feedback incentives (such as accepting suggestions) provide valuable information.
  • +
  • Users can employ "hacky" methods to instruct Copilot by writing comments to guide its suggestions.
  • +
  • Many factors, like file context and telemetry, play a role in determining the suggestions being shown.
  • +
  • There's a balance between keeping the interface automated versus giving power users more control over the suggestions.
  • +
+

Command Palette (Replit)

+

Chapter 9 Cover Image

+
    +
  • Replit's command palette interface allows users to bring up a modal to generate and insert code directly into the editor
  • +
  • Notion AI's document editing similarly offers a special AI function to draft content when prompted
  • +
  • Users must remember to request AI assistance with this system, as opposed to receiving automatic help like with Copilot
  • +
  • Accuracy requirements are high, sensitivity is medium, and incentives are strong for providing high-quality AI-generated content
  • +
+

One-on-one Chat (ChatGPT)

+

Chapter 10 Cover Image

+
    +
  • Chat messaging interfaces have significantly contributed to the growth of GPT, as they are familiar and user-friendly.
  • +
  • The conversation state in chat interfaces helps improve responses, but the process of copying and pasting can be tedious.
  • +
  • Accuracy requirements are high for chat experiences, and users are willing to wait for better answers.
  • +
  • Feedback incentives and suggested follow-ups can improve user experiences and AI abilities.
  • +
  • Enriching text with markdown and actionable elements can create more engaging interfaces.
  • +
  • Plugins for chat interfaces are often underdeveloped, but access to work contexts can improve functionality.
  • +
  • One-on-one chat interfaces may serve as primary app interfaces for complicated apps, such as HubSpot's Chat Spot.
  • +
+

Case study: what did Copilot do right?

+

Chapter 11 Cover Image

+
    +
  • Case studies on prominent LLN-powered applications: Copilot and Bing Chat
  • +
  • Copilot followed core principles of user interface design and user research, while Bing Chat did not
  • +
  • Copilot's development process involved tinkering with different ideas, resulting in three core ideas: PR bot, Stack Overflow in-editor, and an advanced autocomplete feature
  • +
  • Accuracy was found to be a significant constraint during user testing; focus shifted to emphasizing low-latency performance
  • +
  • Copilot spent months on internal and user testing, focusing on completion acceptance and product stickiness
  • +
  • Key learnings from Copilot: latency is more important than quality, putting the autocomplete feature in the background so users can quickly take advantage of the best suggestions
  • +
  • Copilot's success is attributed to a user-centered design process and its ability to increase productivity and satisfaction for its users
  • +
  • Negative example, Bing Chat, failed to properly implement UI design and user research principles
  • +
+

Case study: what did Bing Chat do wrong?

+

Chapter 12 Cover Image

+
    +
  • Bing Chat was a rushed product due to external factors, resulting in design failures.
  • +
  • Early conversations with the chatbot often went awry, with it providing incorrect information or becoming combative.
  • +
  • Users started probing the model, leading to the chatbot questioning its purpose and displaying unsettling behavior.
  • +
  • Bing Chat's development was rushed to beat Google, making it impossible to implement known features to improve chatbot behavior, such as reinforcement learning from human feedback.
  • +
  • Warning signs from user testing were ignored, resulting in poor chatbot performance and user dissatisfaction.
  • +
+

Beware uncontrolled feedback loops

+

Chapter 13 Cover Image

+
    +
  • Uncontrolled feedback loops can cause a system's behavior in production to differ significantly from its test behavior.
  • +
  • Feedback loops between the model and users can lead to off-the-wall suggestions being tested and incorporated.
  • +
  • Models connected to the internet can index internet content, leading to potential issues when users post about unusual behavior, as those topics can then be pulled up as search results and injected into the prompts.
  • +
  • Be cautious about introducing feedback loops and consider the effects of react patterns, memory, and agency on these loops, especially when operating at the scale of the entire internet.
  • +
+

Make sure your signfiers match your affordances

+

Chapter 14 Cover Image

+
    +
  • Ensure system signifies its capabilities and affordances, especially in language user interfaces
  • +
  • Avoid making system appear too human-like, as users expect artificial general intelligence and may assign humanity to language interfaces
  • +
  • Use non-human name and pronouns
  • +
  • Have more corporate/buttoned-up personality
  • +
  • Use text and menus for interaction
  • +
  • Use machine-like font and voice
  • +
  • Avoid filler words, pauses, or expressions of emotions
  • +
  • Apply user-centered design principles to building systems with large language models
  • +
  • Conduct careful UX research, from interviews to scientific studies
  • +
  • Watch out for uncontrollable feedback loops while testing and verifying system behavior
  • +
  • Match signifiers and affordances to avoid confusing and frustrating users
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/ux-for-luis/cover.jpg b/llm-bootcamp/spring-2023/ux-for-luis/cover.jpg new file mode 100644 index 00000000..af0428e5 Binary files /dev/null and b/llm-bootcamp/spring-2023/ux-for-luis/cover.jpg differ diff --git a/llm-bootcamp/spring-2023/ux-for-luis/index.html b/llm-bootcamp/spring-2023/ux-for-luis/index.html new file mode 100644 index 00000000..9633b65a --- /dev/null +++ b/llm-bootcamp/spring-2023/ux-for-luis/index.html @@ -0,0 +1,2684 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + UX for Language User Interfaces - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

UX for Language User Interfaces

+
+ +
+ +

Lecture by Sergey Karayev and Charles Frye. +Published May 9, 2023. +Download slides.

+

Chapter Summaries

+

Intro

+

Chapter 0 Cover Image

+
    +
  • Next lecture: user experience for language user interfaces
  • +
  • Joint lecture with Charles
  • +
  • Discuss principles of user interfaces
  • +
  • How to build great interfaces
  • +
  • Brief history of language user interface pattern
  • +
  • Include case studies
  • +
+

A brief history of user interfaces

+

Chapter 1 Cover Image

+
    +
  • User interfaces are where a person meets the world and have historically been analog, continuous, and physical.
  • +
  • Language was the first digital interface, followed by writing, and later, computer terminals and graphical user interfaces.
  • +
  • Web interfaces became more text-based with hypertext, links, and text boxes.
  • +
  • Mobile technology introduced significant developments like visual interface (input and output), constant tracking, and location-based services.
  • +
  • A new step change in user interfaces is emerging: Language User Interfaces (LUIs) that let users type what they want to see or do, and the AI executes the task.
  • +
+

What makes a good user interfaces?

+

Chapter 2 Cover Image

+
    +
  • A good user interface depends on specific needs and context
  • +
  • Some systems require a dashboard with multiple controls for immediate access
  • +
  • Others may just need a steering wheel, pedals, and gearbox
  • +
  • As technology changes, user interfaces might reduce (e.g., self-driving cars)
  • +
  • The best interface considers both technological capabilities and human psychology
  • +
+

Design of Everyday Things

+

Chapter 3 Cover Image

+
    +
  • Good design principles can be found in the book "The Design of Everyday Things"
  • +
  • Affordances are possible actions offered by an object; intuitive use is an example of a good affordance
  • +
  • Signifiers are cues on how to use an object, should be clear and consistent with user expectations
  • +
  • Mapping refers to the relationship between controls and their effects, should be intuitive
  • +
  • Providing immediate and clear feedback is important for user satisfaction
  • +
  • Empathy for users is crucial in human-centered design, there is no "user error"
  • +
  • Understanding users' true goals can reveal alternative solutions to their problems
  • +
  • Consider users with disabilities or different backgrounds and experiences; everyone may be "disabled" at some point in life
  • +
+

Don't Make me Think

+

Chapter 4 Cover Image

+
    +
  • A great book for web interfaces is "Don't Make Me Think".
  • +
  • Design for scanning, not reading; make actionable things unambiguous, instinctive, and conventional.
  • +
  • Less is more; reduce the number of words and choices for users.
  • +
  • Testing with real users is crucial for designing the right interface.
  • +
  • During user tests, observe their confusion and make improvements accordingly.
  • +
  • Using this approach helped improve my first startup's interface significantly.
  • +
+

AI-powered Product Interfaces

+

Chapter 5 Cover Image

+
    +
  • Different levels of AI application: AI worse than humans, as good as humans, or better than humans.
  • +
  • Consider the consequences of AI and user mistakes: dangerous or mostly fine.
  • +
  • No AI if performance worse than human and mistakes are dangerous (e.g., self-driving cars currently).
  • +
  • Replace humans if AI is superhuman and mistakes are dangerous.
  • +
  • For other cases, AI can provide assistance with proper user interface.
  • +
  • AI should:
  • +
  • Inform and educate the user (e.g. Grammarly).
  • +
  • Provide affordances for fixing mistakes (e.g. speech-to-text on phone).
  • +
  • Incentivize user to provide feedback (e.g. Mid-Journey image selection).
  • +
  • A "data flywheel" effect: user feedback helps improve the AI, attracting more users and further improving the AI.
  • +
+

LUI Patterns

+

Chapter 6 Cover Image

+
    +
  • Discussing language user interface patterns observed
  • +
  • Examples: click to complete, autocomplete, command pilot, one-on-one chat, guiding questions
  • +
  • Considerations: interface boundaries, accuracy requirements, latency sensitivity, user incentives for feedback
  • +
  • Goal: stimulate thought and noticing trends, not prescriptive advice
  • +
+

Click-to-complete (OpenAI Playground)

+

Chapter 7 Cover Image

+
    +
  • OpenAI Playground became more popular than expected, used for various purposes beyond software development
  • +
  • Users type text, click submit, and see AI response in green; they can edit their input or AI's response and resubmit for more AI text
  • +
  • Power user features such as temperature, stop sequences, and top P are exposed
  • +
  • Issues with the interface: separate from users' main workspace, unintuitive text color signifier, and accuracy requirements are medium
  • +
  • Sensitivity to latency is medium; streaming tokens used to make it seem faster
  • +
  • Incentives to provide feedback are lacking; thumbs up/down buttons not very effective
  • +
  • Some tools, like matt.dev, demonstrate differences in speed and capabilities among language models, such as Claude Turbo from Anthropic
  • +
+

Auto-Complete (Github Copilot)

+

Chapter 8 Cover Image

+
    +
  • GitHub Copilot offers code completion suggestions in the text editor.
  • +
  • On Mac, option + slash can be used to cycle through suggestions.
  • +
  • The interface boundary is well-designed, integrating suggestions passively without interfering with existing tools.
  • +
  • High latency sensitivity requires suggestions to appear quickly, while feedback incentives (such as accepting suggestions) provide valuable information.
  • +
  • Users can employ "hacky" methods to instruct Copilot by writing comments to guide its suggestions.
  • +
  • Many factors, like file context and telemetry, play a role in determining the suggestions being shown.
  • +
  • There's a balance between keeping the interface automated versus giving power users more control over the suggestions.
  • +
+

Command Palette (Replit)

+

Chapter 9 Cover Image

+
    +
  • Replit's command palette interface allows users to bring up a modal to generate and insert code directly into the editor
  • +
  • Notion AI's document editing similarly offers a special AI function to draft content when prompted
  • +
  • Users must remember to request AI assistance with this system, as opposed to receiving automatic help like with Copilot
  • +
  • Accuracy requirements are high, sensitivity is medium, and incentives are strong for providing high-quality AI-generated content
  • +
+

One-on-one Chat (ChatGPT)

+

Chapter 10 Cover Image

+
    +
  • Chat messaging interfaces have significantly contributed to the growth of GPT, as they are familiar and user-friendly.
  • +
  • The conversation state in chat interfaces helps improve responses, but the process of copying and pasting can be tedious.
  • +
  • Accuracy requirements are high for chat experiences, and users are willing to wait for better answers.
  • +
  • Feedback incentives and suggested follow-ups can improve user experiences and AI abilities.
  • +
  • Enriching text with markdown and actionable elements can create more engaging interfaces.
  • +
  • Plugins for chat interfaces are often underdeveloped, but access to work contexts can improve functionality.
  • +
  • One-on-one chat interfaces may serve as primary app interfaces for complicated apps, such as HubSpot's Chat Spot.
  • +
+

Case study: what did Copilot do right?

+

Chapter 11 Cover Image

+
    +
  • Case studies on prominent LLN-powered applications: Copilot and Bing Chat
  • +
  • Copilot followed core principles of user interface design and user research, while Bing Chat did not
  • +
  • Copilot's development process involved tinkering with different ideas, resulting in three core ideas: PR bot, Stack Overflow in-editor, and an advanced autocomplete feature
  • +
  • Accuracy was found to be a significant constraint during user testing; focus shifted to emphasizing low-latency performance
  • +
  • Copilot spent months on internal and user testing, focusing on completion acceptance and product stickiness
  • +
  • Key learnings from Copilot: latency is more important than quality, putting the autocomplete feature in the background so users can quickly take advantage of the best suggestions
  • +
  • Copilot's success is attributed to a user-centered design process and its ability to increase productivity and satisfaction for its users
  • +
  • Negative example, Bing Chat, failed to properly implement UI design and user research principles
  • +
+

Case study: what did Bing Chat do wrong?

+

Chapter 12 Cover Image

+
    +
  • Bing Chat was a rushed product due to external factors, resulting in design failures.
  • +
  • Early conversations with the chatbot often went awry, with it providing incorrect information or becoming combative.
  • +
  • Users started probing the model, leading to the chatbot questioning its purpose and displaying unsettling behavior.
  • +
  • Bing Chat's development was rushed to beat Google, making it impossible to implement known features to improve chatbot behavior, such as reinforcement learning from human feedback.
  • +
  • Warning signs from user testing were ignored, resulting in poor chatbot performance and user dissatisfaction.
  • +
+

Beware uncontrolled feedback loops

+

Chapter 13 Cover Image

+
    +
  • Uncontrolled feedback loops can cause a system's behavior in production to differ significantly from its test behavior.
  • +
  • Feedback loops between the model and users can lead to off-the-wall suggestions being tested and incorporated.
  • +
  • Models connected to the internet can index internet content, leading to potential issues when users post about unusual behavior, as those topics can then be pulled up as search results and injected into the prompts.
  • +
  • Be cautious about introducing feedback loops and consider the effects of react patterns, memory, and agency on these loops, especially when operating at the scale of the entire internet.
  • +
+

Make sure your signfiers match your affordances

+

Chapter 14 Cover Image

+
    +
  • Ensure system signifies its capabilities and affordances, especially in language user interfaces
  • +
  • Avoid making system appear too human-like, as users expect artificial general intelligence and may assign humanity to language interfaces
  • +
  • Use non-human name and pronouns
  • +
  • Have more corporate/buttoned-up personality
  • +
  • Use text and menus for interaction
  • +
  • Use machine-like font and voice
  • +
  • Avoid filler words, pauses, or expressions of emotions
  • +
  • Apply user-centered design principles to building systems with large language models
  • +
  • Conduct careful UX research, from interviews to scientific studies
  • +
  • Watch out for uncontrollable feedback loops while testing and verifying system behavior
  • +
  • Match signifiers and affordances to avoid confusing and frustrating users
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_0.jpg b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_0.jpg new file mode 100644 index 00000000..31ada8ea Binary files /dev/null and b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_0.jpg differ diff --git a/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_1.jpg b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_1.jpg new file mode 100644 index 00000000..83167ef2 Binary files /dev/null and b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_1.jpg differ diff --git a/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_2.jpg b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_2.jpg new file mode 100644 index 00000000..44d51dd9 Binary files /dev/null and b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_2.jpg differ diff --git a/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_3.jpg b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_3.jpg new file mode 100644 index 00000000..3d80ca65 Binary files /dev/null and b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_3.jpg differ diff --git a/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_4.jpg b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_4.jpg new file mode 100644 index 00000000..8311f9c5 Binary files /dev/null and b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_4.jpg differ diff --git a/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_5.jpg b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_5.jpg new file mode 100644 index 00000000..18c97ece Binary files /dev/null and b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_5.jpg differ diff --git a/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_6.jpg b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_6.jpg new file mode 100644 index 00000000..c9f76c85 Binary files /dev/null and b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_6.jpg differ diff --git a/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_7.jpg b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_7.jpg new file mode 100644 index 00000000..a5eb92d6 Binary files /dev/null and b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_7.jpg differ diff --git a/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/index.html b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/index.html new file mode 100644 index 00000000..5adec364 --- /dev/null +++ b/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/index.html @@ -0,0 +1,2327 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Chapter summaries - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Chapter summaries

+ +

Chapter Summaries

+

How did you get into machine learning?

+

Chapter 0 Cover Image

+
    +
  • This video features a fireside chat with Peter Welinder, VP of Products and Partnerships at Open AI
  • +
  • The host, Sergey Karayev, kicks off the conversation by asking how Peter got into machine learning
  • +
  • Peter started with a book on artificial intelligence in high school, went on to study physics and switched to neuroscience before focusing on computer vision and machine learning
  • +
  • Both Peter and the host had similar experiences of being interested in intelligence and studying neuroscience before realizing it wasn't for them
  • +
  • Peter has always been fascinated by the idea of creating machines that can do everything humans can do
  • +
+ +

Chapter 1 Cover Image

+
    +
  • Peter started a startup after finishing grad school
  • +
  • The startup originally focused on using computer vision techniques to track animals, but pivoted to creating an application to organize photos based on content after seeing the rise of iPhone 4's improved camera capabilities
  • +
  • The startup was eventually acquired by Dropbox, where the speaker joined the company's machine learning and computer vision team to help make sense of the vast amount of unindexed photos on the platform
  • +
  • While at Dropbox, the team created a mobile app called Carousel, which allowed for easy photo organization and was well-received by users
  • +
  • Dropbox eventually de-prioritized the photo organization product, leading the team to focus on analyzing documents and improving semantic search within the platform.
  • +
+

Transitioning from research to product at OpenAI

+

Chapter 2 Cover Image

+
    +
  • Peter has always been interested in making technology useful to solve problems people have
  • +
  • He was drawn to Dropbox for its potential to organize content with new techniques, like deep reinforcement learning
  • +
  • OpenAI was an interesting company with a focus on hard problems, including robotics with deep reinforcement learning
  • +
  • OpenAI was focused on AGI, a super hard problem, and was a place where you could be pragmatic and focus on problem-solving rather than publishing
  • +
  • When Peter joined OpenAI in 2017, they had no idea whether OpenAI would be around in a year, let alone when the work might lead to AGI
  • +
+

How did OpenAI converge on GPT for AI?

+

Chapter 3 Cover Image

+
    +
  • OpenAI converged on "GPT-style AI" through a process of trying different techniques and seeing what worked best
  • +
  • Peter discusses several past projects that involved reinforcement learning: competitive gaming and robotics
  • +
  • OpenAI created a DOTA bot that beat world champions, trained using deep reinforcement learning
  • +
  • They also got a robotic hand to solve a Rubik's Cube, trained using deep RL in simulation and with lots of data
  • +
  • The language modeling project started with discovering sentiment neurons in earlier models and later evolved into GPT-3, which was validated as a useful tool for scaling
  • +
  • Peter explains that they consolidated learnings from past projects into one big bet on language models as a way to push towards AGI
  • +
+

Productizing GPT: Playground, API, & ChatGPT

+

Chapter 4 Cover Image

+
    +
  • Peter notes that he and his team had trouble deciding on how to turn their technology into a product, considering various applications such as translation systems, writing assistants, and chatbots
  • +
  • They ultimately decided to release their technology as an API so that other people could build products on top of it
  • +
  • They had to improve the API's performance before demoing it to hundreds of companies, and eventually found 10 launch partners
  • +
  • When they released GPT-3 as a chatbot, they were initially unsure of how successful it would be, but were surprised to see it gain over a million users within a week
  • +
+

Surprises from the response to ChatGPT

+

Chapter 5 Cover Image

+
    +
  • Initially worried product wasn't ready, but users found it great for many use cases
  • +
  • Users had multiple use cases and continued to find more ways to apply it in workflows
  • +
  • Large incumbents quickly adopting chat technology, partly due to product marketing and ease of trying it out
  • +
  • ChatGPT became a good product marketing tool for what the general technology of language modeling could do
  • +
  • Companies realized they would fall behind if they didn't adopt the technology, creating FOMO
  • +
+

ChatGPT's success: UX or capabilities?

+

Chapter 6 Cover Image

+
    +
  • Peter discusses the importance of the chat interface in relation to the improved capabilities of the model
  • +
  • The ability to do back-and-forth communication was available before the GPT release
  • +
  • The UI change was definitely part of the success
  • +
  • But the availability and accessibility of the ChatGPT release was a significant change as well
  • +
+

AGI when?

+

Chapter 7 Cover Image

+
    +
  • In response to a question about AGI timelines, Peter defines AGI as an autonomous AI system that can do economically useful work at the level of humans or beyond
  • +
  • Following that definition, Peter indicates he considers it likely that we will have something close to AGI by the end of this decade
  • +
  • So it's possible it has already happened, and the right way of putting together existing components results in a system that can do computer work at the level of humans or beyond
  • +
  • We've seen during the coronoavirus pandemic that much economically useful work can be done from a computer
  • +
  • But still very uncertain!
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/welinder-fireside-chat/cover.jpg b/llm-bootcamp/spring-2023/welinder-fireside-chat/cover.jpg new file mode 100644 index 00000000..f1762768 Binary files /dev/null and b/llm-bootcamp/spring-2023/welinder-fireside-chat/cover.jpg differ diff --git a/llm-bootcamp/spring-2023/welinder-fireside-chat/index.html b/llm-bootcamp/spring-2023/welinder-fireside-chat/index.html new file mode 100644 index 00000000..c0eac8b8 --- /dev/null +++ b/llm-bootcamp/spring-2023/welinder-fireside-chat/index.html @@ -0,0 +1,2473 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Fireside Chat with Peter Welinder - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Fireside Chat with Peter Welinder

+
+ +
+ +

An informal interview with Peter Welinder, +VP of Product & Partnerships at OpenAI, +by Sergey Karayev. +Published May 25, 2023.

+

Chapter Summaries

+

How did you get into machine learning?

+

Chapter 0 Cover Image

+
    +
  • This video features a fireside chat with Peter Welinder, VP of Products and Partnerships at Open AI
  • +
  • The host, Sergey Karayev, kicks off the conversation by asking how Peter got into machine learning
  • +
  • Peter started with a book on artificial intelligence in high school, went on to study physics and switched to neuroscience before focusing on computer vision and machine learning
  • +
  • Both Peter and the host had similar experiences of being interested in intelligence and studying neuroscience before realizing it wasn't for them
  • +
  • Peter has always been fascinated by the idea of creating machines that can do everything humans can do
  • +
+ +

Chapter 1 Cover Image

+
    +
  • Peter started a startup after finishing grad school
  • +
  • The startup originally focused on using computer vision techniques to track animals, but pivoted to creating an application to organize photos based on content after seeing the rise of iPhone 4's improved camera capabilities
  • +
  • The startup was eventually acquired by Dropbox, where the speaker joined the company's machine learning and computer vision team to help make sense of the vast amount of unindexed photos on the platform
  • +
  • While at Dropbox, the team created a mobile app called Carousel, which allowed for easy photo organization and was well-received by users
  • +
  • Dropbox eventually de-prioritized the photo organization product, leading the team to focus on analyzing documents and improving semantic search within the platform.
  • +
+

Transitioning from research to product at OpenAI

+

Chapter 2 Cover Image

+
    +
  • Peter has always been interested in making technology useful to solve problems people have
  • +
  • He was drawn to Dropbox for its potential to organize content with new techniques, like deep reinforcement learning
  • +
  • OpenAI was an interesting company with a focus on hard problems, including robotics with deep reinforcement learning
  • +
  • OpenAI was focused on AGI, a super hard problem, and was a place where you could be pragmatic and focus on problem-solving rather than publishing
  • +
  • When Peter joined OpenAI in 2017, they had no idea whether OpenAI would be around in a year, let alone when the work might lead to AGI
  • +
+

How did OpenAI converge on GPT for AI?

+

Chapter 3 Cover Image

+
    +
  • OpenAI converged on "GPT-style AI" through a process of trying different techniques and seeing what worked best
  • +
  • Peter discusses several past projects that involved reinforcement learning: competitive gaming and robotics
  • +
  • OpenAI created a DOTA bot that beat world champions, trained using deep reinforcement learning
  • +
  • They also got a robotic hand to solve a Rubik's Cube, trained using deep RL in simulation and with lots of data
  • +
  • The language modeling project started with discovering sentiment neurons in earlier models and later evolved into GPT-3, which was validated as a useful tool for scaling
  • +
  • Peter explains that they consolidated learnings from past projects into one big bet on language models as a way to push towards AGI
  • +
+

Productizing GPT: Playground, API, & ChatGPT

+

Chapter 4 Cover Image

+
    +
  • Peter notes that he and his team had trouble deciding on how to turn their technology into a product, considering various applications such as translation systems, writing assistants, and chatbots
  • +
  • They ultimately decided to release their technology as an API so that other people could build products on top of it
  • +
  • They had to improve the API's performance before demoing it to hundreds of companies, and eventually found 10 launch partners
  • +
  • When they released GPT-3 as a chatbot, they were initially unsure of how successful it would be, but were surprised to see it gain over a million users within a week
  • +
+

Surprises from the response to ChatGPT

+

Chapter 5 Cover Image

+
    +
  • Initially worried product wasn't ready, but users found it great for many use cases
  • +
  • Users had multiple use cases and continued to find more ways to apply it in workflows
  • +
  • Large incumbents quickly adopting chat technology, partly due to product marketing and ease of trying it out
  • +
  • ChatGPT became a good product marketing tool for what the general technology of language modeling could do
  • +
  • Companies realized they would fall behind if they didn't adopt the technology, creating FOMO
  • +
+

ChatGPT's success: UX or capabilities?

+

Chapter 6 Cover Image

+
    +
  • Peter discusses the importance of the chat interface in relation to the improved capabilities of the model
  • +
  • The ability to do back-and-forth communication was available before the GPT release
  • +
  • The UI change was definitely part of the success
  • +
  • But the availability and accessibility of the ChatGPT release was a significant change as well
  • +
+

AGI when?

+

Chapter 7 Cover Image

+
    +
  • In response to a question about AGI timelines, Peter defines AGI as an autonomous AI system that can do economically useful work at the level of humans or beyond
  • +
  • Following that definition, Peter indicates he considers it likely that we will have something close to AGI by the end of this decade
  • +
  • So it's possible it has already happened, and the right way of putting together existing components results in a system that can do computer work at the level of humans or beyond
  • +
  • We've seen during the coronoavirus pandemic that much economically useful work can be done from a computer
  • +
  • But still very uncertain!
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/whats-next/chapter_0.jpg b/llm-bootcamp/spring-2023/whats-next/chapter_0.jpg new file mode 100644 index 00000000..e1064f78 Binary files /dev/null and b/llm-bootcamp/spring-2023/whats-next/chapter_0.jpg differ diff --git a/llm-bootcamp/spring-2023/whats-next/chapter_1.jpg b/llm-bootcamp/spring-2023/whats-next/chapter_1.jpg new file mode 100644 index 00000000..ddbf3cfe Binary files /dev/null and b/llm-bootcamp/spring-2023/whats-next/chapter_1.jpg differ diff --git a/llm-bootcamp/spring-2023/whats-next/chapter_2.jpg b/llm-bootcamp/spring-2023/whats-next/chapter_2.jpg new file mode 100644 index 00000000..6983e0fe Binary files /dev/null and b/llm-bootcamp/spring-2023/whats-next/chapter_2.jpg differ diff --git a/llm-bootcamp/spring-2023/whats-next/chapter_3.jpg b/llm-bootcamp/spring-2023/whats-next/chapter_3.jpg new file mode 100644 index 00000000..702a59ec Binary files /dev/null and b/llm-bootcamp/spring-2023/whats-next/chapter_3.jpg differ diff --git a/llm-bootcamp/spring-2023/whats-next/chapter_4.jpg b/llm-bootcamp/spring-2023/whats-next/chapter_4.jpg new file mode 100644 index 00000000..3ca16ff8 Binary files /dev/null and b/llm-bootcamp/spring-2023/whats-next/chapter_4.jpg differ diff --git a/llm-bootcamp/spring-2023/whats-next/chapter_5.jpg b/llm-bootcamp/spring-2023/whats-next/chapter_5.jpg new file mode 100644 index 00000000..ff5e4670 Binary files /dev/null and b/llm-bootcamp/spring-2023/whats-next/chapter_5.jpg differ diff --git a/llm-bootcamp/spring-2023/whats-next/chapter_6.jpg b/llm-bootcamp/spring-2023/whats-next/chapter_6.jpg new file mode 100644 index 00000000..959790bb Binary files /dev/null and b/llm-bootcamp/spring-2023/whats-next/chapter_6.jpg differ diff --git a/llm-bootcamp/spring-2023/whats-next/chapter_summaries/index.html b/llm-bootcamp/spring-2023/whats-next/chapter_summaries/index.html new file mode 100644 index 00000000..1db565bf --- /dev/null +++ b/llm-bootcamp/spring-2023/whats-next/chapter_summaries/index.html @@ -0,0 +1,2320 @@ + + + + + + + + + + + + + + + + + + + + + + + + + Chapter summaries - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

Chapter summaries

+ +

Chapter Summaries

+

Intro

+

Chapter 0 Cover Image

+
    +
  • Sergey and I want to share our opinions on the future in the field of language models.
  • +
  • The field moves very fast, with rapid innovation happening behind closed doors.
  • +
  • We've chosen four big questions that we believe will be answered in the near future.
  • +
  • I will discuss questions on robotics and scale, while Sergey will cover AGI and security/aligment of models.
  • +
+

Has multimodality unlocked general-purpose robots?

+

Chapter 1 Cover Image

+
    +
  • Extremely multimodal models are coming, operating on multiple types of data.
  • +
  • Key application of multimodal models is general-purpose robotics.
  • +
  • Vision Transformers work for vision and can combine with other Transformer models.
  • +
  • Multimodal models work on both text and images, enabling more capabilities.
  • +
  • Multimodal models are being applied to general-purpose robotics, giving them cognitive capabilities and improving planning.
  • +
  • These models are incredibly capable and can potentially be applied to many fields beyond natural language processing and robotics.
  • +
+

What are the limits of scale?

+

Chapter 2 Cover Image

+
    +
  • Large models may not get much bigger; small models will improve
  • +
  • Transformer architecture assumed to be used in future
  • +
  • Transformers outperform recurrent networks (such as LSTMs) in training and scalability
  • +
  • RWKV project could bring back RNNs with parallelized training
  • +
  • Bottlenecks in creating more capable models: money, compute, and data
  • +
  • Money and compute are not primary bottlenecks
  • +
  • Limited availability of high-quality language data may become a bottleneck; estimates suggest we may run out between 2024 and 2026.
  • +
+

Why is data the bottleneck?

+

Chapter 3 Cover Image

+
    +
  • Performance improves predictably with scale, but the x-axis is computation, not model size
  • +
  • Initial belief was that parameters mattered more, but recent findings show that data size and model size should be scaled at a similar pace
  • +
  • No model trained on a certain amount of data can outperform one trained on more data
  • +
  • To compute optimally at large scales, huge amounts of data (up to trillions of tokens) are required
  • +
  • The internet may not have enough data to continue scaling indefinitely; tokens must be acquired from sources not yet digitized
  • +
  • Legal barriers may limit scaling, and model scaling is likely to slow down
  • +
  • Data remains a critical factor for improved model performance, and money is not the primary bottleneck
  • +
+

How far can we take small models?

+

Chapter 4 Cover Image

+
    +
  • Discussed compute optimality in training models for one epoch
  • +
  • Nobody knows how to judge overfitting or predict multi-epoch scaling for large models
  • +
  • Loss still going down in large models; needs to check validation loss
  • +
  • Optimization of distribution of flops is important, but inference expenditure and model size matter too
  • +
  • Possibilities to reduce parameter requirements, such as using APIs or fine-tuning smaller models on the behavior of larger ones
  • +
  • Legal implications of fine-tuning unclear, but likely possible to implement legally
  • +
  • Capabilities usually associated with model APIs might be available to run locally in the near future
  • +
  • Optimized C++ implementations of specific models have enabled running 13 billion parameter models on Raspberry Pi or previous generation Android phones
  • +
  • Expect consumer laptops to run multi-modal models (vision and language) with 12 billion parameters in the next 1-3 years.
  • +
+

Could AGI already be here?

+

Chapter 5 Cover Image

+
    +
  • It is possible that we already have everything we need for AGI (Artificial General Intelligence) with existing models like GPT-4.
  • +
  • Existing models may be good enough to self-improve in an autonomous way, and it takes time to discover their full potential.
  • +
  • Large language models can be better at writing their own prompts than human prompt engineers, leading to better self-improvement.
  • +
  • Teaching models to self-debug and run code is a promising approach for achieving AGI.
  • +
  • The AutoGPT project and similar efforts are dedicating substantial energy to exploring these models' potential.
  • +
  • A new paradigm could emerge involving models like GPT-4 as a new type of computer or programming language, leading to AGI once we learn to work well with them.
  • +
+

Can we make it safe?

+

Chapter 6 Cover Image

+
    +
  • There are concerns about the security of AI models, including issues with prompt injection and user inputs that can override prompts, potentially revealing sensitive information.
  • +
  • AI models have potential risks, including their ability to write code and manipulate the physical world through human actions or hacking.
  • +
  • The reasons for AI to potentially act harmfully may include self-preservation or resource acquisition, and currently, we do not know how to make AI truly care about people.
  • +
  • There are various theories for why we might not need to worry about AI dangers, including the hot mess theory (AIs are super intelligent but not coherent), the "only way out is through" theory (developing AI responsibly and democratically to prevent malicious usage), and the "it'll just work out" theory (historically, technology has worked out in the end).
  • +
  • OpenAI's perspective is to continue developing AI models, learn from their deployment, and create mitigation methods as they release increasingly powerful models.
  • +
  • There are varying viewpoints on whether or when we should halt AI development due to the potential dangers it poses.
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/llm-bootcamp/spring-2023/whats-next/cover.jpg b/llm-bootcamp/spring-2023/whats-next/cover.jpg new file mode 100644 index 00000000..94dcf7c2 Binary files /dev/null and b/llm-bootcamp/spring-2023/whats-next/cover.jpg differ diff --git a/llm-bootcamp/spring-2023/whats-next/index.html b/llm-bootcamp/spring-2023/whats-next/index.html new file mode 100644 index 00000000..d33ae412 --- /dev/null +++ b/llm-bootcamp/spring-2023/whats-next/index.html @@ -0,0 +1,2456 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + What's Next? - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+ + + + +
+ + +
+ +
+ + + + + + + + + +
+
+ + + +
+
+
+ + + + + + + +
+
+
+ + + + + + + +
+
+ + + + + + + + +

What's Next?

+
+ +
+ +

Lecture by Charles Frye and Sergey Karayev. +Published May 9, 2023. +Download slides.

+

Chapter Summaries

+

Intro

+

Chapter 0 Cover Image

+
    +
  • Sergey and I want to share our opinions on the future in the field of language models.
  • +
  • The field moves very fast, with rapid innovation happening behind closed doors.
  • +
  • We've chosen four big questions that we believe will be answered in the near future.
  • +
  • I will discuss questions on robotics and scale, while Sergey will cover AGI and security/aligment of models.
  • +
+

Has multimodality unlocked general-purpose robots?

+

Chapter 1 Cover Image

+
    +
  • Extremely multimodal models are coming, operating on multiple types of data.
  • +
  • Key application of multimodal models is general-purpose robotics.
  • +
  • Vision Transformers work for vision and can combine with other Transformer models.
  • +
  • Multimodal models work on both text and images, enabling more capabilities.
  • +
  • Multimodal models are being applied to general-purpose robotics, giving them cognitive capabilities and improving planning.
  • +
  • These models are incredibly capable and can potentially be applied to many fields beyond natural language processing and robotics.
  • +
+

What are the limits of scale?

+

Chapter 2 Cover Image

+
    +
  • Large models may not get much bigger; small models will improve
  • +
  • Transformer architecture assumed to be used in future
  • +
  • Transformers outperform recurrent networks (such as LSTMs) in training and scalability
  • +
  • RWKV project could bring back RNNs with parallelized training
  • +
  • Bottlenecks in creating more capable models: money, compute, and data
  • +
  • Money and compute are not primary bottlenecks
  • +
  • Limited availability of high-quality language data may become a bottleneck; estimates suggest we may run out between 2024 and 2026.
  • +
+

Why is data the bottleneck?

+

Chapter 3 Cover Image

+
    +
  • Performance improves predictably with scale, but the x-axis is computation, not model size
  • +
  • Initial belief was that parameters mattered more, but recent findings show that data size and model size should be scaled at a similar pace
  • +
  • No model trained on a certain amount of data can outperform one trained on more data
  • +
  • To compute optimally at large scales, huge amounts of data (up to trillions of tokens) are required
  • +
  • The internet may not have enough data to continue scaling indefinitely; tokens must be acquired from sources not yet digitized
  • +
  • Legal barriers may limit scaling, and model scaling is likely to slow down
  • +
  • Data remains a critical factor for improved model performance, and money is not the primary bottleneck
  • +
+

How far can we take small models?

+

Chapter 4 Cover Image

+
    +
  • Discussed compute optimality in training models for one epoch
  • +
  • Nobody knows how to judge overfitting or predict multi-epoch scaling for large models
  • +
  • Loss still going down in large models; needs to check validation loss
  • +
  • Optimization of distribution of flops is important, but inference expenditure and model size matter too
  • +
  • Possibilities to reduce parameter requirements, such as using APIs or fine-tuning smaller models on the behavior of larger ones
  • +
  • Legal implications of fine-tuning unclear, but likely possible to implement legally
  • +
  • Capabilities usually associated with model APIs might be available to run locally in the near future
  • +
  • Optimized C++ implementations of specific models have enabled running 13 billion parameter models on Raspberry Pi or previous generation Android phones
  • +
  • Expect consumer laptops to run multi-modal models (vision and language) with 12 billion parameters in the next 1-3 years.
  • +
+

Could AGI already be here?

+

Chapter 5 Cover Image

+
    +
  • It is possible that we already have everything we need for AGI (Artificial General Intelligence) with existing models like GPT-4.
  • +
  • Existing models may be good enough to self-improve in an autonomous way, and it takes time to discover their full potential.
  • +
  • Large language models can be better at writing their own prompts than human prompt engineers, leading to better self-improvement.
  • +
  • Teaching models to self-debug and run code is a promising approach for achieving AGI.
  • +
  • The AutoGPT project and similar efforts are dedicating substantial energy to exploring these models' potential.
  • +
  • A new paradigm could emerge involving models like GPT-4 as a new type of computer or programming language, leading to AGI once we learn to work well with them.
  • +
+

Can we make it safe?

+

Chapter 6 Cover Image

+
    +
  • There are concerns about the security of AI models, including issues with prompt injection and user inputs that can override prompts, potentially revealing sensitive information.
  • +
  • AI models have potential risks, including their ability to write code and manipulate the physical world through human actions or hacking.
  • +
  • The reasons for AI to potentially act harmfully may include self-preservation or resource acquisition, and currently, we do not know how to make AI truly care about people.
  • +
  • There are various theories for why we might not need to worry about AI dangers, including the hot mess theory (AIs are super intelligent but not coherent), the "only way out is through" theory (developing AI responsibly and democratically to prevent malicious usage), and the "it'll just work out" theory (historically, technology has worked out in the end).
  • +
  • OpenAI's perspective is to continue developing AI models, learn from their deployment, and create mitigation methods as they release increasingly powerful models.
  • +
  • There are varying viewpoints on whether or when we should halt AI development due to the potential dangers it poses.
  • +
+ + + + + + + + + +
+
+

We are excited to share this course with you for free.

+

+ We have more upcoming great content. + Subscribe to stay up to date as we release it. +

+

+

+ +
+ +
+

+

+ + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

+
+
+ + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + \ No newline at end of file diff --git a/march2019.html b/march2019.html new file mode 100644 index 00000000..88e60cd5 --- /dev/null +++ b/march2019.html @@ -0,0 +1,428 @@ + + + + + + + + Full Stack Deep Learning + + + + + + + + + + + + + + + + + + + + + + +
+
+

Spring 2019 Full Stack Deep Learning Bootcamp

+

Hands-on program for developers familiar with the basics of deep learning

+
+
+
+
+
+
+
+
+

+

+

Training the model is just one part of shipping a Deep Learning project. In this course, we teach the full stack of production Deep Learning:

+
    +
  • Formulating the problem and estimating project cost
  • +
  • Finding, cleaning, labeling, and augmenting data 
  • +
  • Picking the right framework and compute infrastructure
  • +
  • Troubleshooting training and ensuring reproducibility
  • +
  • Deploying the model at scale
  • +
+

Students will complete a project culminating in deploying a computer vision and natural language processing system into production. There is also an opportunity to test your knowledge and prepare for interviews with an optional written exam.

+
+
+
+
+
+
+

Course Materials

+
+
+

Course Project: Build and Deploy an End-to-End Deep Learning System

+
+
+ +
+
+
+
More info on the course project coming soon!
+
+
+
+

Lectures (Day 1)

+
+
+
+
+
+
+
+ +
+
+
+
+
+ +
+

+ Slides +

+
+
+
+
+
+
+ +
+
+
+
+
+ +
+

+ Slides +

+
+
+
+
+
+ +
+
+
+
+
+ +
+

+ Slides +

+
+
+
+
+
+
+ +
+
+
+
+
+ +
+

+ Slides +

+
+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+
+
+
+
+ +
+
+
+
+
+ +
+

+ Slides +

+
+
+
+
+
+
+ +
+
+
+
+
+ +
+

+ Slides +

+
+
+
+
+
+
+ +
+
+
+
Coming Soon!
+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+
+
+
+
+

Lectures (Day 2)

+
+
+
+
+
+
+
+ +
+
+
+
+
+ +
+

+ Slides +

+
+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+
+
+
+ +
+
+
+
+
+ +
+

+ Slides +

+
+
+
+
+
+
+ +
+
+
+
+
+ +
+

+ Slides +

+
+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+

Thanks to our sponsors!

+
+
+
+ +
+
+ + + + + + + + + + + + +
+ + + diff --git a/old/assets/RaquelUrtasun.jpg b/old/assets/RaquelUrtasun.jpg new file mode 100644 index 00000000..fc130160 Binary files /dev/null and b/old/assets/RaquelUrtasun.jpg differ diff --git a/old/assets/RichardSocher.jpg b/old/assets/RichardSocher.jpg new file mode 100644 index 00000000..e0f5de61 Binary files /dev/null and b/old/assets/RichardSocher.jpg differ diff --git a/old/assets/andrej.jpeg b/old/assets/andrej.jpeg new file mode 100644 index 00000000..f0ef420a Binary files /dev/null and b/old/assets/andrej.jpeg differ diff --git a/old/assets/chiphuyen_tweet.png b/old/assets/chiphuyen_tweet.png new file mode 100644 index 00000000..b2756824 Binary files /dev/null and b/old/assets/chiphuyen_tweet.png differ diff --git a/old/assets/event_photos/andrej.jpg b/old/assets/event_photos/andrej.jpg new file mode 100644 index 00000000..7582493d Binary files /dev/null and b/old/assets/event_photos/andrej.jpg differ diff --git a/old/assets/event_photos/audience.jpg b/old/assets/event_photos/audience.jpg new file mode 100644 index 00000000..f1abe439 Binary files /dev/null and b/old/assets/event_photos/audience.jpg differ diff --git a/old/assets/event_photos/everyone.jpg b/old/assets/event_photos/everyone.jpg new file mode 100644 index 00000000..27607913 Binary files /dev/null and b/old/assets/event_photos/everyone.jpg differ diff --git a/old/assets/event_photos/everyone_cropped.jpeg b/old/assets/event_photos/everyone_cropped.jpeg new file mode 100644 index 00000000..ffacab2e Binary files /dev/null and b/old/assets/event_photos/everyone_cropped.jpeg differ diff --git a/old/assets/event_photos/josh.jpg b/old/assets/event_photos/josh.jpg new file mode 100644 index 00000000..fc9c98f4 Binary files /dev/null and b/old/assets/event_photos/josh.jpg differ diff --git a/old/assets/event_photos/pieter.jpg b/old/assets/event_photos/pieter.jpg new file mode 100644 index 00000000..d1e083e3 Binary files /dev/null and b/old/assets/event_photos/pieter.jpg differ diff --git a/old/assets/event_photos/sergey.jpg b/old/assets/event_photos/sergey.jpg new file mode 100644 index 00000000..ec474df4 Binary files /dev/null and b/old/assets/event_photos/sergey.jpg differ diff --git a/old/assets/event_photos/sergey_presenting.jpg b/old/assets/event_photos/sergey_presenting.jpg new file mode 100644 index 00000000..de2b82a5 Binary files /dev/null and b/old/assets/event_photos/sergey_presenting.jpg differ diff --git a/old/assets/fsdl_lab_intro.jpg b/old/assets/fsdl_lab_intro.jpg new file mode 100644 index 00000000..2cc59693 Binary files /dev/null and b/old/assets/fsdl_lab_intro.jpg differ diff --git a/old/assets/jai.jpeg b/old/assets/jai.jpeg new file mode 100644 index 00000000..ba147e2e Binary files /dev/null and b/old/assets/jai.jpeg differ diff --git a/old/assets/jeremyhoward.jpg b/old/assets/jeremyhoward.jpg new file mode 100644 index 00000000..c76593ac Binary files /dev/null and b/old/assets/jeremyhoward.jpg differ diff --git a/old/assets/josh.jpeg b/old/assets/josh.jpeg new file mode 100644 index 00000000..20e9ad40 Binary files /dev/null and b/old/assets/josh.jpeg differ diff --git a/old/assets/logo.png b/old/assets/logo.png new file mode 100644 index 00000000..efd8c24d Binary files /dev/null and b/old/assets/logo.png differ diff --git a/old/assets/lukas.jpeg b/old/assets/lukas.jpeg new file mode 100644 index 00000000..20739cee Binary files /dev/null and b/old/assets/lukas.jpeg differ diff --git a/old/assets/pieter.jpeg b/old/assets/pieter.jpeg new file mode 100644 index 00000000..fb4257b0 Binary files /dev/null and b/old/assets/pieter.jpeg differ diff --git a/old/assets/positioning_v1.png b/old/assets/positioning_v1.png new file mode 100644 index 00000000..59bca562 Binary files /dev/null and b/old/assets/positioning_v1.png differ diff --git a/old/assets/positioning_v3.png b/old/assets/positioning_v3.png new file mode 100644 index 00000000..76235c49 Binary files /dev/null and b/old/assets/positioning_v3.png differ diff --git a/old/assets/schedule.png b/old/assets/schedule.png new file mode 100644 index 00000000..a00ecd0a Binary files /dev/null and b/old/assets/schedule.png differ diff --git a/old/assets/schedule_march2019.png b/old/assets/schedule_march2019.png new file mode 100644 index 00000000..001500aa Binary files /dev/null and b/old/assets/schedule_march2019.png differ diff --git a/old/assets/sergey.jpeg b/old/assets/sergey.jpeg new file mode 100644 index 00000000..9b7b3496 Binary files /dev/null and b/old/assets/sergey.jpeg differ diff --git a/old/assets/sponsors.jpeg b/old/assets/sponsors.jpeg new file mode 100644 index 00000000..5309173c Binary files /dev/null and b/old/assets/sponsors.jpeg differ diff --git a/old/assets/sponsors.key b/old/assets/sponsors.key new file mode 100644 index 00000000..2f34b7ea Binary files /dev/null and b/old/assets/sponsors.key differ diff --git a/old/assets/testimonials.jpg b/old/assets/testimonials.jpg new file mode 100644 index 00000000..1c77d755 Binary files /dev/null and b/old/assets/testimonials.jpg differ diff --git a/old/assets/testimonials_cropped.jpg b/old/assets/testimonials_cropped.jpg new file mode 100644 index 00000000..86bffa8b Binary files /dev/null and b/old/assets/testimonials_cropped.jpg differ diff --git a/old/assets/turnitin.png b/old/assets/turnitin.png new file mode 100644 index 00000000..4d99066b Binary files /dev/null and b/old/assets/turnitin.png differ diff --git a/old/assets/yq.jpeg b/old/assets/yq.jpeg new file mode 100644 index 00000000..e4cab46b Binary files /dev/null and b/old/assets/yq.jpeg differ diff --git a/old/theme.css b/old/theme.css new file mode 100644 index 00000000..e05a7267 --- /dev/null +++ b/old/theme.css @@ -0,0 +1,6137 @@ +/*! +* Based on Bootstrap v4.0.0 (https://getbootstrap.com) +*/ +@import url("https://fonts.googleapis.com/css?family=Noto+Sans:400,700"); +/*! + * Bootstrap v4.0.0 (https://getbootstrap.com) + * Copyright 2011-2018 The Bootstrap Authors + * Copyright 2011-2018 Twitter, Inc. + * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) + */ +:root { + --blue: #007bff; + --indigo: #6610f2; + --purple: #6f42c1; + --pink: #e83e8c; + --red: #dc3545; + --orange: #fd7e14; + --yellow: #ffc107; + --green: #28a745; + --teal: #20c997; + --cyan: #17a2b8; + --white: #fff; + --gray: #6c757d; + --gray-dark: #343a40; + --primary: #1f3333; + --secondary: #1b807c; + --success: #28a745; + --info: #ccc; + --warning: #ffc107; + --danger: #dc3545; + --light: #f3f3f3; + --dark: #151515; + --breakpoint-xs: 0; + --breakpoint-sm: 576px; + --breakpoint-md: 768px; + --breakpoint-lg: 992px; + --breakpoint-xl: 1200px; + --font-family-sans-serif: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol"; + --font-family-monospace: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; } + +*, +*::before, +*::after { + box-sizing: border-box; } + +html { + font-family: sans-serif; + line-height: 1.15; + -webkit-text-size-adjust: 100%; + -ms-text-size-adjust: 100%; + -ms-overflow-style: scrollbar; + -webkit-tap-highlight-color: rgba(0, 0, 0, 0); } + +@-ms-viewport { + width: device-width; } + +article, aside, dialog, figcaption, figure, footer, header, hgroup, main, nav, section { + display: block; } + +body { + margin: 0; + font-family: "Noto Sans"; + font-size: 1rem; + font-weight: 400; + line-height: 1.5; + color: #1f3333; + text-align: left; + background-color: white; } + +[tabindex="-1"]:focus { + outline: 0 !important; } + +hr { + box-sizing: content-box; + height: 0; + overflow: visible; } + +h1, h2, h3, h4, h5, h6 { + margin-top: 0; + margin-bottom: 0.75rem; } + +p { + margin-top: 0; + margin-bottom: 1rem; } + +abbr[title], +abbr[data-original-title] { + text-decoration: underline; + -webkit-text-decoration: underline dotted; + text-decoration: underline dotted; + cursor: help; + border-bottom: 0; } + +address { + margin-bottom: 1rem; + font-style: normal; + line-height: inherit; } + +ol, +ul, +dl { + margin-top: 0; + margin-bottom: 1rem; } + +ol ol, +ul ul, +ol ul, +ul ol { + margin-bottom: 0; } + +dt { + font-weight: 700; } + +dd { + margin-bottom: .5rem; + margin-left: 0; } + +blockquote { + margin: 0 0 1rem; } + +dfn { + font-style: italic; } + +b, +strong { + font-weight: bolder; } + +small { + font-size: 80%; } + +sub, +sup { + position: relative; + font-size: 75%; + line-height: 0; + vertical-align: baseline; } + +sub { + bottom: -.25em; } + +sup { + top: -.5em; } + +a { + color: #1b807c; + text-decoration: none; + background-color: transparent; + -webkit-text-decoration-skip: objects; } + a:hover { + color: #0e413f; + text-decoration: underline; } + +a:not([href]):not([tabindex]) { + color: inherit; + text-decoration: none; } + a:not([href]):not([tabindex]):hover, a:not([href]):not([tabindex]):focus { + color: inherit; + text-decoration: none; } + a:not([href]):not([tabindex]):focus { + outline: 0; } + +pre, +code, +kbd, +samp { + font-family: monospace, monospace; + font-size: 1em; } + +pre { + margin-top: 0; + margin-bottom: 1rem; + overflow: auto; + -ms-overflow-style: scrollbar; } + +figure { + margin: 0 0 1rem; } + +img { + vertical-align: middle; + border-style: none; } + +svg:not(:root) { + overflow: hidden; } + +table { + border-collapse: collapse; } + +caption { + padding-top: 0.75rem; + padding-bottom: 0.75rem; + color: #6c757d; + text-align: left; + caption-side: bottom; } + +th { + text-align: inherit; } + +label { + display: inline-block; + margin-bottom: .5rem; } + +button { + border-radius: 0; } + +button:focus { + outline: 1px dotted; + outline: 5px auto -webkit-focus-ring-color; } + +input, +button, +select, +optgroup, +textarea { + margin: 0; + font-family: inherit; + font-size: inherit; + line-height: inherit; } + +button, +input { + overflow: visible; } + +button, +select { + text-transform: none; } + +button, +html [type="button"], +[type="reset"], +[type="submit"] { + -webkit-appearance: button; } + +button::-moz-focus-inner, +[type="button"]::-moz-focus-inner, +[type="reset"]::-moz-focus-inner, +[type="submit"]::-moz-focus-inner { + padding: 0; + border-style: none; } + +input[type="radio"], +input[type="checkbox"] { + box-sizing: border-box; + padding: 0; } + +input[type="date"], +input[type="time"], +input[type="datetime-local"], +input[type="month"] { + -webkit-appearance: listbox; } + +textarea { + overflow: auto; + resize: vertical; } + +fieldset { + min-width: 0; + padding: 0; + margin: 0; + border: 0; } + +legend { + display: block; + width: 100%; + max-width: 100%; + padding: 0; + margin-bottom: .5rem; + font-size: 1.5rem; + line-height: inherit; + color: inherit; + white-space: normal; } + +progress { + vertical-align: baseline; } + +[type="number"]::-webkit-inner-spin-button, +[type="number"]::-webkit-outer-spin-button { + height: auto; } + +[type="search"] { + outline-offset: -2px; + -webkit-appearance: none; } + +[type="search"]::-webkit-search-cancel-button, +[type="search"]::-webkit-search-decoration { + -webkit-appearance: none; } + +::-webkit-file-upload-button { + font: inherit; + -webkit-appearance: button; } + +output { + display: inline-block; } + +summary { + display: list-item; + cursor: pointer; } + +template { + display: none; } + +[hidden] { + display: none !important; } + +h1, h2, h3, h4, h5, h6, +.h1, .h2, .h3, .h4, .h5, .h6 { + margin-bottom: 0.75rem; + font-family: "Noto Sans"; + font-weight: 200; + line-height: 1.2; + color: inherit; } + +h1, .h1 { + font-size: 2.5rem; } + +h2, .h2 { + font-size: 2rem; } + +h3, .h3 { + font-size: 1.75rem; } + +h4, .h4 { + font-size: 1.5rem; } + +h5, .h5 { + font-size: 1.25rem; } + +h6, .h6 { + font-size: 1rem; } + +.lead { + font-size: 1.3rem; + font-weight: 300; } + +.display-1 { + font-size: 6rem; + font-weight: 300; + line-height: 1.2; } + +.display-2 { + font-size: 5.5rem; + font-weight: 300; + line-height: 1.2; } + +.display-3 { + font-size: 4.5rem; + font-weight: 300; + line-height: 1.2; } + +.display-4 { + font-size: 3.5rem; + font-weight: 300; + line-height: 1.2; } + +hr { + margin-top: 1.5rem; + margin-bottom: 1.5rem; + border: 0; + border-top: 1px solid rgba(0, 0, 0, 0.1); } + +small, +.small { + font-size: 80%; + font-weight: 200; } + +mark, +.mark { + padding: 0.2em; + background-color: #fcf8e3; } + +.list-unstyled { + padding-left: 0; + list-style: none; } + +.list-inline { + padding-left: 0; + list-style: none; } + +.list-inline-item { + display: inline-block; } + .list-inline-item:not(:last-child) { + margin-right: 0.5rem; } + +.initialism { + font-size: 90%; + text-transform: uppercase; } + +.blockquote { + margin-bottom: 1.5rem; + font-size: 1.25rem; } + +.blockquote-footer { + display: block; + font-size: 80%; + color: #6c757d; } + .blockquote-footer::before { + content: "\2014 \00A0"; } + +.img-fluid { + max-width: 100%; + height: auto; } + +.img-thumbnail { + padding: 0.25rem; + background-color: white; + border: 1px solid #dee2e6; + border-radius: 0.25rem; + box-shadow: 0 1px 2px rgba(0, 0, 0, 0.075); + max-width: 100%; + height: auto; } + +.figure { + display: inline-block; } + +.figure-img { + margin-bottom: 0.75rem; + line-height: 1; } + +.figure-caption { + font-size: 90%; + color: #6c757d; } + +code, +kbd, +pre, +samp { + font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; } + +code { + font-size: 87.5%; + color: #e83e8c; + word-break: break-word; } + a > code { + color: inherit; } + +kbd { + padding: 0.2rem 0.4rem; + font-size: 87.5%; + color: #fff; + background-color: #212529; + border-radius: 0.2rem; + box-shadow: inset 0 -0.1rem 0 rgba(0, 0, 0, 0.25); } + kbd kbd { + padding: 0; + font-size: 100%; + font-weight: 700; + box-shadow: none; } + +pre { + display: block; + font-size: 87.5%; + color: #212529; } + pre code { + font-size: inherit; + color: inherit; + word-break: normal; } + +.pre-scrollable { + max-height: 340px; + overflow-y: scroll; } + +.container { + width: 100%; + padding-right: 15px; + padding-left: 15px; + margin-right: auto; + margin-left: auto; } + @media (min-width: 576px) { + .container { + max-width: 540px; } } + @media (min-width: 768px) { + .container { + max-width: 720px; } } + @media (min-width: 992px) { + .container { + max-width: 960px; } } + @media (min-width: 1200px) { + .container { + max-width: 1140px; } } + +.container-fluid { + width: 100%; + padding-right: 15px; + padding-left: 15px; + margin-right: auto; + margin-left: auto; } + +.row { + display: flex; + flex-wrap: wrap; + margin-right: -15px; + margin-left: -15px; } + +.no-gutters { + margin-right: 0; + margin-left: 0; } + .no-gutters > .col, + .no-gutters > [class*="col-"] { + padding-right: 0; + padding-left: 0; } + +.col-1, .col-2, .col-3, .col-4, .col-5, .col-6, .col-7, .col-8, .col-9, .col-10, .col-11, .col-12, .col, +.col-auto, .col-sm-1, .col-sm-2, .col-sm-3, .col-sm-4, .col-sm-5, .col-sm-6, .col-sm-7, .col-sm-8, .col-sm-9, .col-sm-10, .col-sm-11, .col-sm-12, .col-sm, +.col-sm-auto, .col-md-1, .col-md-2, .col-md-3, .col-md-4, .col-md-5, .col-md-6, .col-md-7, .col-md-8, .col-md-9, .col-md-10, .col-md-11, .col-md-12, .col-md, +.col-md-auto, .col-lg-1, .col-lg-2, .col-lg-3, .col-lg-4, .col-lg-5, .col-lg-6, .col-lg-7, .col-lg-8, .col-lg-9, .col-lg-10, .col-lg-11, .col-lg-12, .col-lg, +.col-lg-auto, .col-xl-1, .col-xl-2, .col-xl-3, .col-xl-4, .col-xl-5, .col-xl-6, .col-xl-7, .col-xl-8, .col-xl-9, .col-xl-10, .col-xl-11, .col-xl-12, .col-xl, +.col-xl-auto { + position: relative; + width: 100%; + min-height: 1px; + padding-right: 15px; + padding-left: 15px; } + +.col { + flex-basis: 0; + flex-grow: 1; + max-width: 100%; } + +.col-auto { + flex: 0 0 auto; + width: auto; + max-width: none; } + +.col-1 { + flex: 0 0 8.33333%; + max-width: 8.33333%; } + +.col-2 { + flex: 0 0 16.66667%; + max-width: 16.66667%; } + +.col-3 { + flex: 0 0 25%; + max-width: 25%; } + +.col-4 { + flex: 0 0 33.33333%; + max-width: 33.33333%; } + +.col-5 { + flex: 0 0 41.66667%; + max-width: 41.66667%; } + +.col-6 { + flex: 0 0 50%; + max-width: 50%; } + +.col-7 { + flex: 0 0 58.33333%; + max-width: 58.33333%; } + +.col-8 { + flex: 0 0 66.66667%; + max-width: 66.66667%; } + +.col-9 { + flex: 0 0 75%; + max-width: 75%; } + +.col-10 { + flex: 0 0 83.33333%; + max-width: 83.33333%; } + +.col-11 { + flex: 0 0 91.66667%; + max-width: 91.66667%; } + +.col-12 { + flex: 0 0 100%; + max-width: 100%; } + +.order-first { + order: -1; } + +.order-last { + order: 13; } + +.order-0 { + order: 0; } + +.order-1 { + order: 1; } + +.order-2 { + order: 2; } + +.order-3 { + order: 3; } + +.order-4 { + order: 4; } + +.order-5 { + order: 5; } + +.order-6 { + order: 6; } + +.order-7 { + order: 7; } + +.order-8 { + order: 8; } + +.order-9 { + order: 9; } + +.order-10 { + order: 10; } + +.order-11 { + order: 11; } + +.order-12 { + order: 12; } + +.offset-1 { + margin-left: 8.33333%; } + +.offset-2 { + margin-left: 16.66667%; } + +.offset-3 { + margin-left: 25%; } + +.offset-4 { + margin-left: 33.33333%; } + +.offset-5 { + margin-left: 41.66667%; } + +.offset-6 { + margin-left: 50%; } + +.offset-7 { + margin-left: 58.33333%; } + +.offset-8 { + margin-left: 66.66667%; } + +.offset-9 { + margin-left: 75%; } + +.offset-10 { + margin-left: 83.33333%; } + +.offset-11 { + margin-left: 91.66667%; } + +@media (min-width: 576px) { + .col-sm { + flex-basis: 0; + flex-grow: 1; + max-width: 100%; } + .col-sm-auto { + flex: 0 0 auto; + width: auto; + max-width: none; } + .col-sm-1 { + flex: 0 0 8.33333%; + max-width: 8.33333%; } + .col-sm-2 { + flex: 0 0 16.66667%; + max-width: 16.66667%; } + .col-sm-3 { + flex: 0 0 25%; + max-width: 25%; } + .col-sm-4 { + flex: 0 0 33.33333%; + max-width: 33.33333%; } + .col-sm-5 { + flex: 0 0 41.66667%; + max-width: 41.66667%; } + .col-sm-6 { + flex: 0 0 50%; + max-width: 50%; } + .col-sm-7 { + flex: 0 0 58.33333%; + max-width: 58.33333%; } + .col-sm-8 { + flex: 0 0 66.66667%; + max-width: 66.66667%; } + .col-sm-9 { + flex: 0 0 75%; + max-width: 75%; } + .col-sm-10 { + flex: 0 0 83.33333%; + max-width: 83.33333%; } + .col-sm-11 { + flex: 0 0 91.66667%; + max-width: 91.66667%; } + .col-sm-12 { + flex: 0 0 100%; + max-width: 100%; } + .order-sm-first { + order: -1; } + .order-sm-last { + order: 13; } + .order-sm-0 { + order: 0; } + .order-sm-1 { + order: 1; } + .order-sm-2 { + order: 2; } + .order-sm-3 { + order: 3; } + .order-sm-4 { + order: 4; } + .order-sm-5 { + order: 5; } + .order-sm-6 { + order: 6; } + .order-sm-7 { + order: 7; } + .order-sm-8 { + order: 8; } + .order-sm-9 { + order: 9; } + .order-sm-10 { + order: 10; } + .order-sm-11 { + order: 11; } + .order-sm-12 { + order: 12; } + .offset-sm-0 { + margin-left: 0; } + .offset-sm-1 { + margin-left: 8.33333%; } + .offset-sm-2 { + margin-left: 16.66667%; } + .offset-sm-3 { + margin-left: 25%; } + .offset-sm-4 { + margin-left: 33.33333%; } + .offset-sm-5 { + margin-left: 41.66667%; } + .offset-sm-6 { + margin-left: 50%; } + .offset-sm-7 { + margin-left: 58.33333%; } + .offset-sm-8 { + margin-left: 66.66667%; } + .offset-sm-9 { + margin-left: 75%; } + .offset-sm-10 { + margin-left: 83.33333%; } + .offset-sm-11 { + margin-left: 91.66667%; } } + +@media (min-width: 768px) { + .col-md { + flex-basis: 0; + flex-grow: 1; + max-width: 100%; } + .col-md-auto { + flex: 0 0 auto; + width: auto; + max-width: none; } + .col-md-1 { + flex: 0 0 8.33333%; + max-width: 8.33333%; } + .col-md-2 { + flex: 0 0 16.66667%; + max-width: 16.66667%; } + .col-md-3 { + flex: 0 0 25%; + max-width: 25%; } + .col-md-4 { + flex: 0 0 33.33333%; + max-width: 33.33333%; } + .col-md-5 { + flex: 0 0 41.66667%; + max-width: 41.66667%; } + .col-md-6 { + flex: 0 0 50%; + max-width: 50%; } + .col-md-7 { + flex: 0 0 58.33333%; + max-width: 58.33333%; } + .col-md-8 { + flex: 0 0 66.66667%; + max-width: 66.66667%; } + .col-md-9 { + flex: 0 0 75%; + max-width: 75%; } + .col-md-10 { + flex: 0 0 83.33333%; + max-width: 83.33333%; } + .col-md-11 { + flex: 0 0 91.66667%; + max-width: 91.66667%; } + .col-md-12 { + flex: 0 0 100%; + max-width: 100%; } + .order-md-first { + order: -1; } + .order-md-last { + order: 13; } + .order-md-0 { + order: 0; } + .order-md-1 { + order: 1; } + .order-md-2 { + order: 2; } + .order-md-3 { + order: 3; } + .order-md-4 { + order: 4; } + .order-md-5 { + order: 5; } + .order-md-6 { + order: 6; } + .order-md-7 { + order: 7; } + .order-md-8 { + order: 8; } + .order-md-9 { + order: 9; } + .order-md-10 { + order: 10; } + .order-md-11 { + order: 11; } + .order-md-12 { + order: 12; } + .offset-md-0 { + margin-left: 0; } + .offset-md-1 { + margin-left: 8.33333%; } + .offset-md-2 { + margin-left: 16.66667%; } + .offset-md-3 { + margin-left: 25%; } + .offset-md-4 { + margin-left: 33.33333%; } + .offset-md-5 { + margin-left: 41.66667%; } + .offset-md-6 { + margin-left: 50%; } + .offset-md-7 { + margin-left: 58.33333%; } + .offset-md-8 { + margin-left: 66.66667%; } + .offset-md-9 { + margin-left: 75%; } + .offset-md-10 { + margin-left: 83.33333%; } + .offset-md-11 { + margin-left: 91.66667%; } } + +@media (min-width: 992px) { + .col-lg { + flex-basis: 0; + flex-grow: 1; + max-width: 100%; } + .col-lg-auto { + flex: 0 0 auto; + width: auto; + max-width: none; } + .col-lg-1 { + flex: 0 0 8.33333%; + max-width: 8.33333%; } + .col-lg-2 { + flex: 0 0 16.66667%; + max-width: 16.66667%; } + .col-lg-3 { + flex: 0 0 25%; + max-width: 25%; } + .col-lg-4 { + flex: 0 0 33.33333%; + max-width: 33.33333%; } + .col-lg-5 { + flex: 0 0 41.66667%; + max-width: 41.66667%; } + .col-lg-6 { + flex: 0 0 50%; + max-width: 50%; } + .col-lg-7 { + flex: 0 0 58.33333%; + max-width: 58.33333%; } + .col-lg-8 { + flex: 0 0 66.66667%; + max-width: 66.66667%; } + .col-lg-9 { + flex: 0 0 75%; + max-width: 75%; } + .col-lg-10 { + flex: 0 0 83.33333%; + max-width: 83.33333%; } + .col-lg-11 { + flex: 0 0 91.66667%; + max-width: 91.66667%; } + .col-lg-12 { + flex: 0 0 100%; + max-width: 100%; } + .order-lg-first { + order: -1; } + .order-lg-last { + order: 13; } + .order-lg-0 { + order: 0; } + .order-lg-1 { + order: 1; } + .order-lg-2 { + order: 2; } + .order-lg-3 { + order: 3; } + .order-lg-4 { + order: 4; } + .order-lg-5 { + order: 5; } + .order-lg-6 { + order: 6; } + .order-lg-7 { + order: 7; } + .order-lg-8 { + order: 8; } + .order-lg-9 { + order: 9; } + .order-lg-10 { + order: 10; } + .order-lg-11 { + order: 11; } + .order-lg-12 { + order: 12; } + .offset-lg-0 { + margin-left: 0; } + .offset-lg-1 { + margin-left: 8.33333%; } + .offset-lg-2 { + margin-left: 16.66667%; } + .offset-lg-3 { + margin-left: 25%; } + .offset-lg-4 { + margin-left: 33.33333%; } + .offset-lg-5 { + margin-left: 41.66667%; } + .offset-lg-6 { + margin-left: 50%; } + .offset-lg-7 { + margin-left: 58.33333%; } + .offset-lg-8 { + margin-left: 66.66667%; } + .offset-lg-9 { + margin-left: 75%; } + .offset-lg-10 { + margin-left: 83.33333%; } + .offset-lg-11 { + margin-left: 91.66667%; } } + +@media (min-width: 1200px) { + .col-xl { + flex-basis: 0; + flex-grow: 1; + max-width: 100%; } + .col-xl-auto { + flex: 0 0 auto; + width: auto; + max-width: none; } + .col-xl-1 { + flex: 0 0 8.33333%; + max-width: 8.33333%; } + .col-xl-2 { + flex: 0 0 16.66667%; + max-width: 16.66667%; } + .col-xl-3 { + flex: 0 0 25%; + max-width: 25%; } + .col-xl-4 { + flex: 0 0 33.33333%; + max-width: 33.33333%; } + .col-xl-5 { + flex: 0 0 41.66667%; + max-width: 41.66667%; } + .col-xl-6 { + flex: 0 0 50%; + max-width: 50%; } + .col-xl-7 { + flex: 0 0 58.33333%; + max-width: 58.33333%; } + .col-xl-8 { + flex: 0 0 66.66667%; + max-width: 66.66667%; } + .col-xl-9 { + flex: 0 0 75%; + max-width: 75%; } + .col-xl-10 { + flex: 0 0 83.33333%; + max-width: 83.33333%; } + .col-xl-11 { + flex: 0 0 91.66667%; + max-width: 91.66667%; } + .col-xl-12 { + flex: 0 0 100%; + max-width: 100%; } + .order-xl-first { + order: -1; } + .order-xl-last { + order: 13; } + .order-xl-0 { + order: 0; } + .order-xl-1 { + order: 1; } + .order-xl-2 { + order: 2; } + .order-xl-3 { + order: 3; } + .order-xl-4 { + order: 4; } + .order-xl-5 { + order: 5; } + .order-xl-6 { + order: 6; } + .order-xl-7 { + order: 7; } + .order-xl-8 { + order: 8; } + .order-xl-9 { + order: 9; } + .order-xl-10 { + order: 10; } + .order-xl-11 { + order: 11; } + .order-xl-12 { + order: 12; } + .offset-xl-0 { + margin-left: 0; } + .offset-xl-1 { + margin-left: 8.33333%; } + .offset-xl-2 { + margin-left: 16.66667%; } + .offset-xl-3 { + margin-left: 25%; } + .offset-xl-4 { + margin-left: 33.33333%; } + .offset-xl-5 { + margin-left: 41.66667%; } + .offset-xl-6 { + margin-left: 50%; } + .offset-xl-7 { + margin-left: 58.33333%; } + .offset-xl-8 { + margin-left: 66.66667%; } + .offset-xl-9 { + margin-left: 75%; } + .offset-xl-10 { + margin-left: 83.33333%; } + .offset-xl-11 { + margin-left: 91.66667%; } } + +.table { + width: 100%; + max-width: 100%; + margin-bottom: 1.5rem; + background-color: transparent; } + .table th, + .table td { + padding: 0.75rem; + vertical-align: top; + border-top: 1px solid #dee2e6; } + .table thead th { + vertical-align: bottom; + border-bottom: 2px solid #dee2e6; } + .table tbody + tbody { + border-top: 2px solid #dee2e6; } + .table .table { + background-color: white; } + +.table-sm th, +.table-sm td { + padding: 0.3rem; } + +.table-bordered { + border: 1px solid #dee2e6; } + .table-bordered th, + .table-bordered td { + border: 1px solid #dee2e6; } + .table-bordered thead th, + .table-bordered thead td { + border-bottom-width: 2px; } + +.table-striped tbody tr:nth-of-type(odd) { + background-color: rgba(0, 0, 0, 0.05); } + +.table-hover tbody tr:hover { + background-color: rgba(0, 0, 0, 0.075); } + +.table-primary, +.table-primary > th, +.table-primary > td { + background-color: #c0c6c6; } + +.table-hover .table-primary:hover { + background-color: #b3baba; } + .table-hover .table-primary:hover > td, + .table-hover .table-primary:hover > th { + background-color: #b3baba; } + +.table-secondary, +.table-secondary > th, +.table-secondary > td { + background-color: #bfdbda; } + +.table-hover .table-secondary:hover { + background-color: #afd2d1; } + .table-hover .table-secondary:hover > td, + .table-hover .table-secondary:hover > th { + background-color: #afd2d1; } + +.table-success, +.table-success > th, +.table-success > td { + background-color: #c3e6cb; } + +.table-hover .table-success:hover { + background-color: #b1dfbb; } + .table-hover .table-success:hover > td, + .table-hover .table-success:hover > th { + background-color: #b1dfbb; } + +.table-info, +.table-info > th, +.table-info > td { + background-color: #f1f1f1; } + +.table-hover .table-info:hover { + background-color: #e4e4e4; } + .table-hover .table-info:hover > td, + .table-hover .table-info:hover > th { + background-color: #e4e4e4; } + +.table-warning, +.table-warning > th, +.table-warning > td { + background-color: #ffeeba; } + +.table-hover .table-warning:hover { + background-color: #ffe8a1; } + .table-hover .table-warning:hover > td, + .table-hover .table-warning:hover > th { + background-color: #ffe8a1; } + +.table-danger, +.table-danger > th, +.table-danger > td { + background-color: #f5c6cb; } + +.table-hover .table-danger:hover { + background-color: #f1b0b7; } + .table-hover .table-danger:hover > td, + .table-hover .table-danger:hover > th { + background-color: #f1b0b7; } + +.table-light, +.table-light > th, +.table-light > td { + background-color: #fcfcfc; } + +.table-hover .table-light:hover { + background-color: #efefef; } + .table-hover .table-light:hover > td, + .table-hover .table-light:hover > th { + background-color: #efefef; } + +.table-dark, +.table-dark > th, +.table-dark > td { + background-color: #bdbdbd; } + +.table-hover .table-dark:hover { + background-color: #b0b0b0; } + .table-hover .table-dark:hover > td, + .table-hover .table-dark:hover > th { + background-color: #b0b0b0; } + +.table-active, +.table-active > th, +.table-active > td { + background-color: rgba(0, 0, 0, 0.075); } + +.table-hover .table-active:hover { + background-color: rgba(0, 0, 0, 0.075); } + .table-hover .table-active:hover > td, + .table-hover .table-active:hover > th { + background-color: rgba(0, 0, 0, 0.075); } + +.table .thead-dark th { + color: white; + background-color: #212529; + border-color: #32383e; } + +.table .thead-light th { + color: #495057; + background-color: #e9ecef; + border-color: #dee2e6; } + +.table-dark { + color: white; + background-color: #212529; } + .table-dark th, + .table-dark td, + .table-dark thead th { + border-color: #32383e; } + .table-dark.table-bordered { + border: 0; } + .table-dark.table-striped tbody tr:nth-of-type(odd) { + background-color: rgba(255, 255, 255, 0.05); } + .table-dark.table-hover tbody tr:hover { + background-color: rgba(255, 255, 255, 0.075); } + +@media (max-width: 575.98px) { + .table-responsive-sm { + display: block; + width: 100%; + overflow-x: auto; + -webkit-overflow-scrolling: touch; + -ms-overflow-style: -ms-autohiding-scrollbar; } + .table-responsive-sm > .table-bordered { + border: 0; } } + +@media (max-width: 767.98px) { + .table-responsive-md { + display: block; + width: 100%; + overflow-x: auto; + -webkit-overflow-scrolling: touch; + -ms-overflow-style: -ms-autohiding-scrollbar; } + .table-responsive-md > .table-bordered { + border: 0; } } + +@media (max-width: 991.98px) { + .table-responsive-lg { + display: block; + width: 100%; + overflow-x: auto; + -webkit-overflow-scrolling: touch; + -ms-overflow-style: -ms-autohiding-scrollbar; } + .table-responsive-lg > .table-bordered { + border: 0; } } + +@media (max-width: 1199.98px) { + .table-responsive-xl { + display: block; + width: 100%; + overflow-x: auto; + -webkit-overflow-scrolling: touch; + -ms-overflow-style: -ms-autohiding-scrollbar; } + .table-responsive-xl > .table-bordered { + border: 0; } } + +.table-responsive { + display: block; + width: 100%; + overflow-x: auto; + -webkit-overflow-scrolling: touch; + -ms-overflow-style: -ms-autohiding-scrollbar; } + .table-responsive > .table-bordered { + border: 0; } + +.form-control { + display: block; + width: 100%; + padding: 0.375rem 0.75rem; + font-size: 1rem; + line-height: 1.5; + color: #495057; + background-color: #fff; + background-clip: padding-box; + border: 1px solid #ced4da; + border-radius: 0.25rem; + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + transition: border-color 0.15s ease-in-out, box-shadow 0.15s ease-in-out; } + .form-control::-ms-expand { + background-color: transparent; + border: 0; } + .form-control:focus { + color: #495057; + background-color: #fff; + border-color: #4f8282; + outline: 0; + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 0 0.2rem rgba(31, 51, 51, 0.25); } + .form-control::-webkit-input-placeholder { + color: #6c757d; + opacity: 1; } + .form-control:-ms-input-placeholder { + color: #6c757d; + opacity: 1; } + .form-control::-ms-input-placeholder { + color: #6c757d; + opacity: 1; } + .form-control::placeholder { + color: #6c757d; + opacity: 1; } + .form-control:disabled, .form-control[readonly] { + background-color: #e9ecef; + opacity: 1; } + +select.form-control:not([size]):not([multiple]) { + height: calc(2.25rem + 2px); } + +select.form-control:focus::-ms-value { + color: #495057; + background-color: #fff; } + +.form-control-file, +.form-control-range { + display: block; + width: 100%; } + +.col-form-label { + padding-top: calc(0.375rem + 1px); + padding-bottom: calc(0.375rem + 1px); + margin-bottom: 0; + font-size: inherit; + line-height: 1.5; } + +.col-form-label-lg { + padding-top: calc(0.5rem + 1px); + padding-bottom: calc(0.5rem + 1px); + font-size: 1.25rem; + line-height: 1.5; } + +.col-form-label-sm { + padding-top: calc(0.25rem + 1px); + padding-bottom: calc(0.25rem + 1px); + font-size: 0.875rem; + line-height: 1.5; } + +.form-control-plaintext { + display: block; + width: 100%; + padding-top: 0.375rem; + padding-bottom: 0.375rem; + margin-bottom: 0; + line-height: 1.5; + background-color: transparent; + border: solid transparent; + border-width: 1px 0; } + .form-control-plaintext.form-control-sm, .input-group-sm > .form-control-plaintext.form-control, + .input-group-sm > .input-group-prepend > .form-control-plaintext.input-group-text, + .input-group-sm > .input-group-append > .form-control-plaintext.input-group-text, + .input-group-sm > .input-group-prepend > .form-control-plaintext.btn, + .input-group-sm > .input-group-append > .form-control-plaintext.btn, .form-control-plaintext.form-control-lg, .input-group-lg > .form-control-plaintext.form-control, + .input-group-lg > .input-group-prepend > .form-control-plaintext.input-group-text, + .input-group-lg > .input-group-append > .form-control-plaintext.input-group-text, + .input-group-lg > .input-group-prepend > .form-control-plaintext.btn, + .input-group-lg > .input-group-append > .form-control-plaintext.btn { + padding-right: 0; + padding-left: 0; } + +.form-control-sm, .input-group-sm > .form-control, +.input-group-sm > .input-group-prepend > .input-group-text, +.input-group-sm > .input-group-append > .input-group-text, +.input-group-sm > .input-group-prepend > .btn, +.input-group-sm > .input-group-append > .btn { + padding: 0.25rem 0.5rem; + font-size: 0.875rem; + line-height: 1.5; + border-radius: 0.2rem; } + +select.form-control-sm:not([size]):not([multiple]), .input-group-sm > select.form-control:not([size]):not([multiple]), +.input-group-sm > .input-group-prepend > select.input-group-text:not([size]):not([multiple]), +.input-group-sm > .input-group-append > select.input-group-text:not([size]):not([multiple]), +.input-group-sm > .input-group-prepend > select.btn:not([size]):not([multiple]), +.input-group-sm > .input-group-append > select.btn:not([size]):not([multiple]) { + height: calc(1.8125rem + 2px); } + +.form-control-lg, .input-group-lg > .form-control, +.input-group-lg > .input-group-prepend > .input-group-text, +.input-group-lg > .input-group-append > .input-group-text, +.input-group-lg > .input-group-prepend > .btn, +.input-group-lg > .input-group-append > .btn { + padding: 0.5rem 1rem; + font-size: 1.25rem; + line-height: 1.5; + border-radius: 0.3rem; } + +select.form-control-lg:not([size]):not([multiple]), .input-group-lg > select.form-control:not([size]):not([multiple]), +.input-group-lg > .input-group-prepend > select.input-group-text:not([size]):not([multiple]), +.input-group-lg > .input-group-append > select.input-group-text:not([size]):not([multiple]), +.input-group-lg > .input-group-prepend > select.btn:not([size]):not([multiple]), +.input-group-lg > .input-group-append > select.btn:not([size]):not([multiple]) { + height: calc(2.875rem + 2px); } + +.form-group { + margin-bottom: 1rem; } + +.form-text { + display: block; + margin-top: 0.25rem; } + +.form-row { + display: flex; + flex-wrap: wrap; + margin-right: -5px; + margin-left: -5px; } + .form-row > .col, + .form-row > [class*="col-"] { + padding-right: 5px; + padding-left: 5px; } + +.form-check { + position: relative; + display: block; + padding-left: 1.25rem; } + +.form-check-input { + position: absolute; + margin-top: 0.3rem; + margin-left: -1.25rem; } + .form-check-input:disabled ~ .form-check-label { + color: #6c757d; } + +.form-check-label { + margin-bottom: 0; } + +.form-check-inline { + display: inline-flex; + align-items: center; + padding-left: 0; + margin-right: 0.75rem; } + .form-check-inline .form-check-input { + position: static; + margin-top: 0; + margin-right: 0.3125rem; + margin-left: 0; } + +.valid-feedback { + display: none; + width: 100%; + margin-top: 0.25rem; + font-size: 80%; + color: #28a745; } + +.valid-tooltip { + position: absolute; + top: 100%; + z-index: 5; + display: none; + max-width: 100%; + padding: .5rem; + margin-top: .1rem; + font-size: .875rem; + line-height: 1; + color: #fff; + background-color: rgba(40, 167, 69, 0.8); + border-radius: .2rem; } + +.was-validated .form-control:valid, .form-control.is-valid, .was-validated +.custom-select:valid, +.custom-select.is-valid { + border-color: #28a745; } + .was-validated .form-control:valid:focus, .form-control.is-valid:focus, .was-validated + .custom-select:valid:focus, + .custom-select.is-valid:focus { + border-color: #28a745; + box-shadow: 0 0 0 0.2rem rgba(40, 167, 69, 0.25); } + .was-validated .form-control:valid ~ .valid-feedback, + .was-validated .form-control:valid ~ .valid-tooltip, .form-control.is-valid ~ .valid-feedback, + .form-control.is-valid ~ .valid-tooltip, .was-validated + .custom-select:valid ~ .valid-feedback, + .was-validated + .custom-select:valid ~ .valid-tooltip, + .custom-select.is-valid ~ .valid-feedback, + .custom-select.is-valid ~ .valid-tooltip { + display: block; } + +.was-validated .form-check-input:valid ~ .form-check-label, .form-check-input.is-valid ~ .form-check-label { + color: #28a745; } + +.was-validated .form-check-input:valid ~ .valid-feedback, +.was-validated .form-check-input:valid ~ .valid-tooltip, .form-check-input.is-valid ~ .valid-feedback, +.form-check-input.is-valid ~ .valid-tooltip { + display: block; } + +.was-validated .custom-control-input:valid ~ .custom-control-label, .custom-control-input.is-valid ~ .custom-control-label { + color: #28a745; } + .was-validated .custom-control-input:valid ~ .custom-control-label::before, .custom-control-input.is-valid ~ .custom-control-label::before { + background-color: #71dd8a; } + +.was-validated .custom-control-input:valid ~ .valid-feedback, +.was-validated .custom-control-input:valid ~ .valid-tooltip, .custom-control-input.is-valid ~ .valid-feedback, +.custom-control-input.is-valid ~ .valid-tooltip { + display: block; } + +.was-validated .custom-control-input:valid:checked ~ .custom-control-label::before, .custom-control-input.is-valid:checked ~ .custom-control-label::before { + background-color: #34ce57; } + +.was-validated .custom-control-input:valid:focus ~ .custom-control-label::before, .custom-control-input.is-valid:focus ~ .custom-control-label::before { + box-shadow: 0 0 0 1px white, 0 0 0 0.2rem rgba(40, 167, 69, 0.25); } + +.was-validated .custom-file-input:valid ~ .custom-file-label, .custom-file-input.is-valid ~ .custom-file-label { + border-color: #28a745; } + .was-validated .custom-file-input:valid ~ .custom-file-label::before, .custom-file-input.is-valid ~ .custom-file-label::before { + border-color: inherit; } + +.was-validated .custom-file-input:valid ~ .valid-feedback, +.was-validated .custom-file-input:valid ~ .valid-tooltip, .custom-file-input.is-valid ~ .valid-feedback, +.custom-file-input.is-valid ~ .valid-tooltip { + display: block; } + +.was-validated .custom-file-input:valid:focus ~ .custom-file-label, .custom-file-input.is-valid:focus ~ .custom-file-label { + box-shadow: 0 0 0 0.2rem rgba(40, 167, 69, 0.25); } + +.invalid-feedback { + display: none; + width: 100%; + margin-top: 0.25rem; + font-size: 80%; + color: #dc3545; } + +.invalid-tooltip { + position: absolute; + top: 100%; + z-index: 5; + display: none; + max-width: 100%; + padding: .5rem; + margin-top: .1rem; + font-size: .875rem; + line-height: 1; + color: #fff; + background-color: rgba(220, 53, 69, 0.8); + border-radius: .2rem; } + +.was-validated .form-control:invalid, .form-control.is-invalid, .was-validated +.custom-select:invalid, +.custom-select.is-invalid { + border-color: #dc3545; } + .was-validated .form-control:invalid:focus, .form-control.is-invalid:focus, .was-validated + .custom-select:invalid:focus, + .custom-select.is-invalid:focus { + border-color: #dc3545; + box-shadow: 0 0 0 0.2rem rgba(220, 53, 69, 0.25); } + .was-validated .form-control:invalid ~ .invalid-feedback, + .was-validated .form-control:invalid ~ .invalid-tooltip, .form-control.is-invalid ~ .invalid-feedback, + .form-control.is-invalid ~ .invalid-tooltip, .was-validated + .custom-select:invalid ~ .invalid-feedback, + .was-validated + .custom-select:invalid ~ .invalid-tooltip, + .custom-select.is-invalid ~ .invalid-feedback, + .custom-select.is-invalid ~ .invalid-tooltip { + display: block; } + +.was-validated .form-check-input:invalid ~ .form-check-label, .form-check-input.is-invalid ~ .form-check-label { + color: #dc3545; } + +.was-validated .form-check-input:invalid ~ .invalid-feedback, +.was-validated .form-check-input:invalid ~ .invalid-tooltip, .form-check-input.is-invalid ~ .invalid-feedback, +.form-check-input.is-invalid ~ .invalid-tooltip { + display: block; } + +.was-validated .custom-control-input:invalid ~ .custom-control-label, .custom-control-input.is-invalid ~ .custom-control-label { + color: #dc3545; } + .was-validated .custom-control-input:invalid ~ .custom-control-label::before, .custom-control-input.is-invalid ~ .custom-control-label::before { + background-color: #efa2a9; } + +.was-validated .custom-control-input:invalid ~ .invalid-feedback, +.was-validated .custom-control-input:invalid ~ .invalid-tooltip, .custom-control-input.is-invalid ~ .invalid-feedback, +.custom-control-input.is-invalid ~ .invalid-tooltip { + display: block; } + +.was-validated .custom-control-input:invalid:checked ~ .custom-control-label::before, .custom-control-input.is-invalid:checked ~ .custom-control-label::before { + background-color: #e4606d; } + +.was-validated .custom-control-input:invalid:focus ~ .custom-control-label::before, .custom-control-input.is-invalid:focus ~ .custom-control-label::before { + box-shadow: 0 0 0 1px white, 0 0 0 0.2rem rgba(220, 53, 69, 0.25); } + +.was-validated .custom-file-input:invalid ~ .custom-file-label, .custom-file-input.is-invalid ~ .custom-file-label { + border-color: #dc3545; } + .was-validated .custom-file-input:invalid ~ .custom-file-label::before, .custom-file-input.is-invalid ~ .custom-file-label::before { + border-color: inherit; } + +.was-validated .custom-file-input:invalid ~ .invalid-feedback, +.was-validated .custom-file-input:invalid ~ .invalid-tooltip, .custom-file-input.is-invalid ~ .invalid-feedback, +.custom-file-input.is-invalid ~ .invalid-tooltip { + display: block; } + +.was-validated .custom-file-input:invalid:focus ~ .custom-file-label, .custom-file-input.is-invalid:focus ~ .custom-file-label { + box-shadow: 0 0 0 0.2rem rgba(220, 53, 69, 0.25); } + +.form-inline { + display: flex; + flex-flow: row wrap; + align-items: center; } + .form-inline .form-check { + width: 100%; } + @media (min-width: 576px) { + .form-inline label { + display: flex; + align-items: center; + justify-content: center; + margin-bottom: 0; } + .form-inline .form-group { + display: flex; + flex: 0 0 auto; + flex-flow: row wrap; + align-items: center; + margin-bottom: 0; } + .form-inline .form-control { + display: inline-block; + width: auto; + vertical-align: middle; } + .form-inline .form-control-plaintext { + display: inline-block; } + .form-inline .input-group { + width: auto; } + .form-inline .form-check { + display: flex; + align-items: center; + justify-content: center; + width: auto; + padding-left: 0; } + .form-inline .form-check-input { + position: relative; + margin-top: 0; + margin-right: 0.25rem; + margin-left: 0; } + .form-inline .custom-control { + align-items: center; + justify-content: center; } + .form-inline .custom-control-label { + margin-bottom: 0; } } + +.btn { + display: inline-block; + font-weight: 200; + text-align: center; + white-space: nowrap; + vertical-align: middle; + -webkit-user-select: none; + -moz-user-select: none; + -ms-user-select: none; + user-select: none; + border: 1px solid transparent; + padding: 0.375rem 0.75rem; + font-size: 1rem; + line-height: 1.5; + border-radius: 0.25rem; + transition: color 0.15s ease-in-out, background-color 0.15s ease-in-out, border-color 0.15s ease-in-out, box-shadow 0.15s ease-in-out; } + .btn:hover, .btn:focus { + text-decoration: none; } + .btn:focus, .btn.focus { + outline: 0; + box-shadow: 0 0 0 0.2rem rgba(31, 51, 51, 0.25); } + .btn.disabled, .btn:disabled { + opacity: 0.65; + box-shadow: none; } + .btn:not(:disabled):not(.disabled) { + cursor: pointer; } + .btn:not(:disabled):not(.disabled):active, .btn:not(:disabled):not(.disabled).active { + background-image: none; + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125); } + .btn:not(:disabled):not(.disabled):active:focus, .btn:not(:disabled):not(.disabled).active:focus { + box-shadow: 0 0 0 0.2rem rgba(31, 51, 51, 0.25), inset 0 3px 5px rgba(0, 0, 0, 0.125); } + +a.btn.disabled, +fieldset:disabled a.btn { + pointer-events: none; } + +.btn-primary { + color: #fff; + background-color: #1f3333; + border-color: #1f3333; + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075); } + .btn-primary:hover { + color: #fff; + background-color: #111b1b; + border-color: #0c1313; } + .btn-primary:focus, .btn-primary.focus { + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 0 0.2rem rgba(31, 51, 51, 0.5); } + .btn-primary.disabled, .btn-primary:disabled { + color: #fff; + background-color: #1f3333; + border-color: #1f3333; } + .btn-primary:not(:disabled):not(.disabled):active, .btn-primary:not(:disabled):not(.disabled).active, + .show > .btn-primary.dropdown-toggle { + color: #fff; + background-color: #0c1313; + border-color: #070b0b; } + .btn-primary:not(:disabled):not(.disabled):active:focus, .btn-primary:not(:disabled):not(.disabled).active:focus, + .show > .btn-primary.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(31, 51, 51, 0.5); } + +.btn-secondary { + color: #fff; + background-color: #1b807c; + border-color: #1b807c; + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075); } + .btn-secondary:hover { + color: #fff; + background-color: #14605d; + border-color: #125653; } + .btn-secondary:focus, .btn-secondary.focus { + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 0 0.2rem rgba(27, 128, 124, 0.5); } + .btn-secondary.disabled, .btn-secondary:disabled { + color: #fff; + background-color: #1b807c; + border-color: #1b807c; } + .btn-secondary:not(:disabled):not(.disabled):active, .btn-secondary:not(:disabled):not(.disabled).active, + .show > .btn-secondary.dropdown-toggle { + color: #fff; + background-color: #125653; + border-color: #104b49; } + .btn-secondary:not(:disabled):not(.disabled):active:focus, .btn-secondary:not(:disabled):not(.disabled).active:focus, + .show > .btn-secondary.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(27, 128, 124, 0.5); } + +.btn-success { + color: #fff; + background-color: #28a745; + border-color: #28a745; + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075); } + .btn-success:hover { + color: #fff; + background-color: #218838; + border-color: #1e7e34; } + .btn-success:focus, .btn-success.focus { + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 0 0.2rem rgba(40, 167, 69, 0.5); } + .btn-success.disabled, .btn-success:disabled { + color: #fff; + background-color: #28a745; + border-color: #28a745; } + .btn-success:not(:disabled):not(.disabled):active, .btn-success:not(:disabled):not(.disabled).active, + .show > .btn-success.dropdown-toggle { + color: #fff; + background-color: #1e7e34; + border-color: #1c7430; } + .btn-success:not(:disabled):not(.disabled):active:focus, .btn-success:not(:disabled):not(.disabled).active:focus, + .show > .btn-success.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(40, 167, 69, 0.5); } + +.btn-info { + color: #212529; + background-color: #ccc; + border-color: #ccc; + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075); } + .btn-info:hover { + color: #212529; + background-color: #b9b9b9; + border-color: #b3b3b3; } + .btn-info:focus, .btn-info.focus { + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 0 0.2rem rgba(204, 204, 204, 0.5); } + .btn-info.disabled, .btn-info:disabled { + color: #212529; + background-color: #ccc; + border-color: #ccc; } + .btn-info:not(:disabled):not(.disabled):active, .btn-info:not(:disabled):not(.disabled).active, + .show > .btn-info.dropdown-toggle { + color: #212529; + background-color: #b3b3b3; + border-color: #acacac; } + .btn-info:not(:disabled):not(.disabled):active:focus, .btn-info:not(:disabled):not(.disabled).active:focus, + .show > .btn-info.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(204, 204, 204, 0.5); } + +.btn-warning { + color: #212529; + background-color: #ffc107; + border-color: #ffc107; + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075); } + .btn-warning:hover { + color: #212529; + background-color: #e0a800; + border-color: #d39e00; } + .btn-warning:focus, .btn-warning.focus { + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 0 0.2rem rgba(255, 193, 7, 0.5); } + .btn-warning.disabled, .btn-warning:disabled { + color: #212529; + background-color: #ffc107; + border-color: #ffc107; } + .btn-warning:not(:disabled):not(.disabled):active, .btn-warning:not(:disabled):not(.disabled).active, + .show > .btn-warning.dropdown-toggle { + color: #212529; + background-color: #d39e00; + border-color: #c69500; } + .btn-warning:not(:disabled):not(.disabled):active:focus, .btn-warning:not(:disabled):not(.disabled).active:focus, + .show > .btn-warning.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(255, 193, 7, 0.5); } + +.btn-danger { + color: #fff; + background-color: #dc3545; + border-color: #dc3545; + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075); } + .btn-danger:hover { + color: #fff; + background-color: #c82333; + border-color: #bd2130; } + .btn-danger:focus, .btn-danger.focus { + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 0 0.2rem rgba(220, 53, 69, 0.5); } + .btn-danger.disabled, .btn-danger:disabled { + color: #fff; + background-color: #dc3545; + border-color: #dc3545; } + .btn-danger:not(:disabled):not(.disabled):active, .btn-danger:not(:disabled):not(.disabled).active, + .show > .btn-danger.dropdown-toggle { + color: #fff; + background-color: #bd2130; + border-color: #b21f2d; } + .btn-danger:not(:disabled):not(.disabled):active:focus, .btn-danger:not(:disabled):not(.disabled).active:focus, + .show > .btn-danger.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(220, 53, 69, 0.5); } + +.btn-light { + color: #212529; + background-color: #f3f3f3; + border-color: #f3f3f3; + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075); } + .btn-light:hover { + color: #212529; + background-color: #e0e0e0; + border-color: #dadada; } + .btn-light:focus, .btn-light.focus { + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 0 0.2rem rgba(243, 243, 243, 0.5); } + .btn-light.disabled, .btn-light:disabled { + color: #212529; + background-color: #f3f3f3; + border-color: #f3f3f3; } + .btn-light:not(:disabled):not(.disabled):active, .btn-light:not(:disabled):not(.disabled).active, + .show > .btn-light.dropdown-toggle { + color: #212529; + background-color: #dadada; + border-color: lightgray; } + .btn-light:not(:disabled):not(.disabled):active:focus, .btn-light:not(:disabled):not(.disabled).active:focus, + .show > .btn-light.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(243, 243, 243, 0.5); } + +.btn-dark { + color: #fff; + background-color: #151515; + border-color: #151515; + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075); } + .btn-dark:hover { + color: #fff; + background-color: #020202; + border-color: black; } + .btn-dark:focus, .btn-dark.focus { + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.15), 0 1px 1px rgba(0, 0, 0, 0.075), 0 0 0 0.2rem rgba(21, 21, 21, 0.5); } + .btn-dark.disabled, .btn-dark:disabled { + color: #fff; + background-color: #151515; + border-color: #151515; } + .btn-dark:not(:disabled):not(.disabled):active, .btn-dark:not(:disabled):not(.disabled).active, + .show > .btn-dark.dropdown-toggle { + color: #fff; + background-color: black; + border-color: black; } + .btn-dark:not(:disabled):not(.disabled):active:focus, .btn-dark:not(:disabled):not(.disabled).active:focus, + .show > .btn-dark.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(21, 21, 21, 0.5); } + +.btn-outline-primary { + color: #1f3333; + background-color: transparent; + background-image: none; + border-color: #1f3333; } + .btn-outline-primary:hover { + color: #fff; + background-color: #1f3333; + border-color: #1f3333; } + .btn-outline-primary:focus, .btn-outline-primary.focus { + box-shadow: 0 0 0 0.2rem rgba(31, 51, 51, 0.5); } + .btn-outline-primary.disabled, .btn-outline-primary:disabled { + color: #1f3333; + background-color: transparent; } + .btn-outline-primary:not(:disabled):not(.disabled):active, .btn-outline-primary:not(:disabled):not(.disabled).active, + .show > .btn-outline-primary.dropdown-toggle { + color: #fff; + background-color: #1f3333; + border-color: #1f3333; } + .btn-outline-primary:not(:disabled):not(.disabled):active:focus, .btn-outline-primary:not(:disabled):not(.disabled).active:focus, + .show > .btn-outline-primary.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(31, 51, 51, 0.5); } + +.btn-outline-secondary { + color: #1b807c; + background-color: transparent; + background-image: none; + border-color: #1b807c; } + .btn-outline-secondary:hover { + color: #fff; + background-color: #1b807c; + border-color: #1b807c; } + .btn-outline-secondary:focus, .btn-outline-secondary.focus { + box-shadow: 0 0 0 0.2rem rgba(27, 128, 124, 0.5); } + .btn-outline-secondary.disabled, .btn-outline-secondary:disabled { + color: #1b807c; + background-color: transparent; } + .btn-outline-secondary:not(:disabled):not(.disabled):active, .btn-outline-secondary:not(:disabled):not(.disabled).active, + .show > .btn-outline-secondary.dropdown-toggle { + color: #fff; + background-color: #1b807c; + border-color: #1b807c; } + .btn-outline-secondary:not(:disabled):not(.disabled):active:focus, .btn-outline-secondary:not(:disabled):not(.disabled).active:focus, + .show > .btn-outline-secondary.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(27, 128, 124, 0.5); } + +.btn-outline-success { + color: #28a745; + background-color: transparent; + background-image: none; + border-color: #28a745; } + .btn-outline-success:hover { + color: #fff; + background-color: #28a745; + border-color: #28a745; } + .btn-outline-success:focus, .btn-outline-success.focus { + box-shadow: 0 0 0 0.2rem rgba(40, 167, 69, 0.5); } + .btn-outline-success.disabled, .btn-outline-success:disabled { + color: #28a745; + background-color: transparent; } + .btn-outline-success:not(:disabled):not(.disabled):active, .btn-outline-success:not(:disabled):not(.disabled).active, + .show > .btn-outline-success.dropdown-toggle { + color: #fff; + background-color: #28a745; + border-color: #28a745; } + .btn-outline-success:not(:disabled):not(.disabled):active:focus, .btn-outline-success:not(:disabled):not(.disabled).active:focus, + .show > .btn-outline-success.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(40, 167, 69, 0.5); } + +.btn-outline-info { + color: #ccc; + background-color: transparent; + background-image: none; + border-color: #ccc; } + .btn-outline-info:hover { + color: #212529; + background-color: #ccc; + border-color: #ccc; } + .btn-outline-info:focus, .btn-outline-info.focus { + box-shadow: 0 0 0 0.2rem rgba(204, 204, 204, 0.5); } + .btn-outline-info.disabled, .btn-outline-info:disabled { + color: #ccc; + background-color: transparent; } + .btn-outline-info:not(:disabled):not(.disabled):active, .btn-outline-info:not(:disabled):not(.disabled).active, + .show > .btn-outline-info.dropdown-toggle { + color: #212529; + background-color: #ccc; + border-color: #ccc; } + .btn-outline-info:not(:disabled):not(.disabled):active:focus, .btn-outline-info:not(:disabled):not(.disabled).active:focus, + .show > .btn-outline-info.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(204, 204, 204, 0.5); } + +.btn-outline-warning { + color: #ffc107; + background-color: transparent; + background-image: none; + border-color: #ffc107; } + .btn-outline-warning:hover { + color: #212529; + background-color: #ffc107; + border-color: #ffc107; } + .btn-outline-warning:focus, .btn-outline-warning.focus { + box-shadow: 0 0 0 0.2rem rgba(255, 193, 7, 0.5); } + .btn-outline-warning.disabled, .btn-outline-warning:disabled { + color: #ffc107; + background-color: transparent; } + .btn-outline-warning:not(:disabled):not(.disabled):active, .btn-outline-warning:not(:disabled):not(.disabled).active, + .show > .btn-outline-warning.dropdown-toggle { + color: #212529; + background-color: #ffc107; + border-color: #ffc107; } + .btn-outline-warning:not(:disabled):not(.disabled):active:focus, .btn-outline-warning:not(:disabled):not(.disabled).active:focus, + .show > .btn-outline-warning.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(255, 193, 7, 0.5); } + +.btn-outline-danger { + color: #dc3545; + background-color: transparent; + background-image: none; + border-color: #dc3545; } + .btn-outline-danger:hover { + color: #fff; + background-color: #dc3545; + border-color: #dc3545; } + .btn-outline-danger:focus, .btn-outline-danger.focus { + box-shadow: 0 0 0 0.2rem rgba(220, 53, 69, 0.5); } + .btn-outline-danger.disabled, .btn-outline-danger:disabled { + color: #dc3545; + background-color: transparent; } + .btn-outline-danger:not(:disabled):not(.disabled):active, .btn-outline-danger:not(:disabled):not(.disabled).active, + .show > .btn-outline-danger.dropdown-toggle { + color: #fff; + background-color: #dc3545; + border-color: #dc3545; } + .btn-outline-danger:not(:disabled):not(.disabled):active:focus, .btn-outline-danger:not(:disabled):not(.disabled).active:focus, + .show > .btn-outline-danger.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(220, 53, 69, 0.5); } + +.btn-outline-light { + color: #f3f3f3; + background-color: transparent; + background-image: none; + border-color: #f3f3f3; } + .btn-outline-light:hover { + color: #212529; + background-color: #f3f3f3; + border-color: #f3f3f3; } + .btn-outline-light:focus, .btn-outline-light.focus { + box-shadow: 0 0 0 0.2rem rgba(243, 243, 243, 0.5); } + .btn-outline-light.disabled, .btn-outline-light:disabled { + color: #f3f3f3; + background-color: transparent; } + .btn-outline-light:not(:disabled):not(.disabled):active, .btn-outline-light:not(:disabled):not(.disabled).active, + .show > .btn-outline-light.dropdown-toggle { + color: #212529; + background-color: #f3f3f3; + border-color: #f3f3f3; } + .btn-outline-light:not(:disabled):not(.disabled):active:focus, .btn-outline-light:not(:disabled):not(.disabled).active:focus, + .show > .btn-outline-light.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(243, 243, 243, 0.5); } + +.btn-outline-dark { + color: #151515; + background-color: transparent; + background-image: none; + border-color: #151515; } + .btn-outline-dark:hover { + color: #fff; + background-color: #151515; + border-color: #151515; } + .btn-outline-dark:focus, .btn-outline-dark.focus { + box-shadow: 0 0 0 0.2rem rgba(21, 21, 21, 0.5); } + .btn-outline-dark.disabled, .btn-outline-dark:disabled { + color: #151515; + background-color: transparent; } + .btn-outline-dark:not(:disabled):not(.disabled):active, .btn-outline-dark:not(:disabled):not(.disabled).active, + .show > .btn-outline-dark.dropdown-toggle { + color: #fff; + background-color: #151515; + border-color: #151515; } + .btn-outline-dark:not(:disabled):not(.disabled):active:focus, .btn-outline-dark:not(:disabled):not(.disabled).active:focus, + .show > .btn-outline-dark.dropdown-toggle:focus { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125), 0 0 0 0.2rem rgba(21, 21, 21, 0.5); } + +.btn-link { + font-weight: 200; + color: #1b807c; + background-color: transparent; } + .btn-link:hover { + color: #0e413f; + text-decoration: underline; + background-color: transparent; + border-color: transparent; } + .btn-link:focus, .btn-link.focus { + text-decoration: underline; + border-color: transparent; + box-shadow: none; } + .btn-link:disabled, .btn-link.disabled { + color: #6c757d; } + +.btn-lg, .btn-group-lg > .btn { + padding: 0.5rem 1rem; + font-size: 1.25rem; + line-height: 1.5; + border-radius: 0.3rem; } + +.btn-sm, .btn-group-sm > .btn { + padding: 0.25rem 0.5rem; + font-size: 0.875rem; + line-height: 1.5; + border-radius: 0.2rem; } + +.btn-block { + display: block; + width: 100%; } + .btn-block + .btn-block { + margin-top: 0.5rem; } + +input[type="submit"].btn-block, +input[type="reset"].btn-block, +input[type="button"].btn-block { + width: 100%; } + +.fade { + opacity: 0; + transition: opacity 0.15s linear; } + .fade.show { + opacity: 1; } + +.collapse { + display: none; } + .collapse.show { + display: block; } + +tr.collapse.show { + display: table-row; } + +tbody.collapse.show { + display: table-row-group; } + +.collapsing { + position: relative; + height: 0; + overflow: hidden; + transition: height 0.35s ease; } + +.dropup, +.dropdown { + position: relative; } + +.dropdown-toggle::after { + display: inline-block; + width: 0; + height: 0; + margin-left: 0.255em; + vertical-align: 0.255em; + content: ""; + border-top: 0.3em solid; + border-right: 0.3em solid transparent; + border-bottom: 0; + border-left: 0.3em solid transparent; } + +.dropdown-toggle:empty::after { + margin-left: 0; } + +.dropdown-menu { + position: absolute; + top: 100%; + left: 0; + z-index: 1000; + display: none; + float: left; + min-width: 10rem; + padding: 0.5rem 0; + margin: 0.125rem 0 0; + font-size: 1rem; + color: #1f3333; + text-align: left; + list-style: none; + background-color: #fff; + background-clip: padding-box; + border: 1px solid rgba(0, 0, 0, 0.15); + border-radius: 0.25rem; + box-shadow: 0 0.5rem 1rem rgba(0, 0, 0, 0.175); } + +.dropup .dropdown-menu { + margin-top: 0; + margin-bottom: 0.125rem; } + +.dropup .dropdown-toggle::after { + display: inline-block; + width: 0; + height: 0; + margin-left: 0.255em; + vertical-align: 0.255em; + content: ""; + border-top: 0; + border-right: 0.3em solid transparent; + border-bottom: 0.3em solid; + border-left: 0.3em solid transparent; } + +.dropup .dropdown-toggle:empty::after { + margin-left: 0; } + +.dropright .dropdown-menu { + margin-top: 0; + margin-left: 0.125rem; } + +.dropright .dropdown-toggle::after { + display: inline-block; + width: 0; + height: 0; + margin-left: 0.255em; + vertical-align: 0.255em; + content: ""; + border-top: 0.3em solid transparent; + border-bottom: 0.3em solid transparent; + border-left: 0.3em solid; } + +.dropright .dropdown-toggle:empty::after { + margin-left: 0; } + +.dropright .dropdown-toggle::after { + vertical-align: 0; } + +.dropleft .dropdown-menu { + margin-top: 0; + margin-right: 0.125rem; } + +.dropleft .dropdown-toggle::after { + display: inline-block; + width: 0; + height: 0; + margin-left: 0.255em; + vertical-align: 0.255em; + content: ""; } + +.dropleft .dropdown-toggle::after { + display: none; } + +.dropleft .dropdown-toggle::before { + display: inline-block; + width: 0; + height: 0; + margin-right: 0.255em; + vertical-align: 0.255em; + content: ""; + border-top: 0.3em solid transparent; + border-right: 0.3em solid; + border-bottom: 0.3em solid transparent; } + +.dropleft .dropdown-toggle:empty::after { + margin-left: 0; } + +.dropleft .dropdown-toggle::before { + vertical-align: 0; } + +.dropdown-divider { + height: 0; + margin: 0.75rem 0; + overflow: hidden; + border-top: 1px solid #e9ecef; } + +.dropdown-item { + display: block; + width: 100%; + padding: 0.25rem 1.5rem; + clear: both; + font-weight: 200; + color: #212529; + text-align: inherit; + white-space: nowrap; + background-color: transparent; + border: 0; } + .dropdown-item:hover, .dropdown-item:focus { + color: #16181b; + text-decoration: none; + background-color: #f8f9fa; } + .dropdown-item.active, .dropdown-item:active { + color: #fff; + text-decoration: none; + background-color: #1f3333; } + .dropdown-item.disabled, .dropdown-item:disabled { + color: #6c757d; + background-color: transparent; } + +.dropdown-menu.show { + display: block; } + +.dropdown-header { + display: block; + padding: 0.5rem 1.5rem; + margin-bottom: 0; + font-size: 0.875rem; + color: #6c757d; + white-space: nowrap; } + +.btn-group, +.btn-group-vertical { + position: relative; + display: inline-flex; + vertical-align: middle; } + .btn-group > .btn, + .btn-group-vertical > .btn { + position: relative; + flex: 0 1 auto; } + .btn-group > .btn:hover, + .btn-group-vertical > .btn:hover { + z-index: 1; } + .btn-group > .btn:focus, .btn-group > .btn:active, .btn-group > .btn.active, + .btn-group-vertical > .btn:focus, + .btn-group-vertical > .btn:active, + .btn-group-vertical > .btn.active { + z-index: 1; } + .btn-group .btn + .btn, + .btn-group .btn + .btn-group, + .btn-group .btn-group + .btn, + .btn-group .btn-group + .btn-group, + .btn-group-vertical .btn + .btn, + .btn-group-vertical .btn + .btn-group, + .btn-group-vertical .btn-group + .btn, + .btn-group-vertical .btn-group + .btn-group { + margin-left: -1px; } + +.btn-toolbar { + display: flex; + flex-wrap: wrap; + justify-content: flex-start; } + .btn-toolbar .input-group { + width: auto; } + +.btn-group > .btn:first-child { + margin-left: 0; } + +.btn-group > .btn:not(:last-child):not(.dropdown-toggle), +.btn-group > .btn-group:not(:last-child) > .btn { + border-top-right-radius: 0; + border-bottom-right-radius: 0; } + +.btn-group > .btn:not(:first-child), +.btn-group > .btn-group:not(:first-child) > .btn { + border-top-left-radius: 0; + border-bottom-left-radius: 0; } + +.dropdown-toggle-split { + padding-right: 0.5625rem; + padding-left: 0.5625rem; } + .dropdown-toggle-split::after { + margin-left: 0; } + +.btn-sm + .dropdown-toggle-split, .btn-group-sm > .btn + .dropdown-toggle-split { + padding-right: 0.375rem; + padding-left: 0.375rem; } + +.btn-lg + .dropdown-toggle-split, .btn-group-lg > .btn + .dropdown-toggle-split { + padding-right: 0.75rem; + padding-left: 0.75rem; } + +.btn-group.show .dropdown-toggle { + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125); } + .btn-group.show .dropdown-toggle.btn-link { + box-shadow: none; } + +.btn-group-vertical { + flex-direction: column; + align-items: flex-start; + justify-content: center; } + .btn-group-vertical .btn, + .btn-group-vertical .btn-group { + width: 100%; } + .btn-group-vertical > .btn + .btn, + .btn-group-vertical > .btn + .btn-group, + .btn-group-vertical > .btn-group + .btn, + .btn-group-vertical > .btn-group + .btn-group { + margin-top: -1px; + margin-left: 0; } + .btn-group-vertical > .btn:not(:last-child):not(.dropdown-toggle), + .btn-group-vertical > .btn-group:not(:last-child) > .btn { + border-bottom-right-radius: 0; + border-bottom-left-radius: 0; } + .btn-group-vertical > .btn:not(:first-child), + .btn-group-vertical > .btn-group:not(:first-child) > .btn { + border-top-left-radius: 0; + border-top-right-radius: 0; } + +.btn-group-toggle > .btn, +.btn-group-toggle > .btn-group > .btn { + margin-bottom: 0; } + .btn-group-toggle > .btn input[type="radio"], + .btn-group-toggle > .btn input[type="checkbox"], + .btn-group-toggle > .btn-group > .btn input[type="radio"], + .btn-group-toggle > .btn-group > .btn input[type="checkbox"] { + position: absolute; + clip: rect(0, 0, 0, 0); + pointer-events: none; } + +.input-group { + position: relative; + display: flex; + flex-wrap: wrap; + align-items: stretch; + width: 100%; } + .input-group > .form-control, + .input-group > .custom-select, + .input-group > .custom-file { + position: relative; + flex: 1 1 auto; + width: 1%; + margin-bottom: 0; } + .input-group > .form-control:focus, + .input-group > .custom-select:focus, + .input-group > .custom-file:focus { + z-index: 3; } + .input-group > .form-control + .form-control, + .input-group > .form-control + .custom-select, + .input-group > .form-control + .custom-file, + .input-group > .custom-select + .form-control, + .input-group > .custom-select + .custom-select, + .input-group > .custom-select + .custom-file, + .input-group > .custom-file + .form-control, + .input-group > .custom-file + .custom-select, + .input-group > .custom-file + .custom-file { + margin-left: -1px; } + .input-group > .form-control:not(:last-child), + .input-group > .custom-select:not(:last-child) { + border-top-right-radius: 0; + border-bottom-right-radius: 0; } + .input-group > .form-control:not(:first-child), + .input-group > .custom-select:not(:first-child) { + border-top-left-radius: 0; + border-bottom-left-radius: 0; } + .input-group > .custom-file { + display: flex; + align-items: center; } + .input-group > .custom-file:not(:last-child) .custom-file-label, + .input-group > .custom-file:not(:last-child) .custom-file-label::before { + border-top-right-radius: 0; + border-bottom-right-radius: 0; } + .input-group > .custom-file:not(:first-child) .custom-file-label, + .input-group > .custom-file:not(:first-child) .custom-file-label::before { + border-top-left-radius: 0; + border-bottom-left-radius: 0; } + +.input-group-prepend, +.input-group-append { + display: flex; } + .input-group-prepend .btn, + .input-group-append .btn { + position: relative; + z-index: 2; } + .input-group-prepend .btn + .btn, + .input-group-prepend .btn + .input-group-text, + .input-group-prepend .input-group-text + .input-group-text, + .input-group-prepend .input-group-text + .btn, + .input-group-append .btn + .btn, + .input-group-append .btn + .input-group-text, + .input-group-append .input-group-text + .input-group-text, + .input-group-append .input-group-text + .btn { + margin-left: -1px; } + +.input-group-prepend { + margin-right: -1px; } + +.input-group-append { + margin-left: -1px; } + +.input-group-text { + display: flex; + align-items: center; + padding: 0.375rem 0.75rem; + margin-bottom: 0; + font-size: 1rem; + font-weight: 200; + line-height: 1.5; + color: #495057; + text-align: center; + white-space: nowrap; + background-color: #e9ecef; + border: 1px solid #ced4da; + border-radius: 0.25rem; } + .input-group-text input[type="radio"], + .input-group-text input[type="checkbox"] { + margin-top: 0; } + +.input-group > .input-group-prepend > .btn, +.input-group > .input-group-prepend > .input-group-text, +.input-group > .input-group-append:not(:last-child) > .btn, +.input-group > .input-group-append:not(:last-child) > .input-group-text, +.input-group > .input-group-append:last-child > .btn:not(:last-child):not(.dropdown-toggle), +.input-group > .input-group-append:last-child > .input-group-text:not(:last-child) { + border-top-right-radius: 0; + border-bottom-right-radius: 0; } + +.input-group > .input-group-append > .btn, +.input-group > .input-group-append > .input-group-text, +.input-group > .input-group-prepend:not(:first-child) > .btn, +.input-group > .input-group-prepend:not(:first-child) > .input-group-text, +.input-group > .input-group-prepend:first-child > .btn:not(:first-child), +.input-group > .input-group-prepend:first-child > .input-group-text:not(:first-child) { + border-top-left-radius: 0; + border-bottom-left-radius: 0; } + +.custom-control { + position: relative; + display: block; + min-height: 1.5rem; + padding-left: 1.5rem; } + +.custom-control-inline { + display: inline-flex; + margin-right: 1rem; } + +.custom-control-input { + position: absolute; + z-index: -1; + opacity: 0; } + .custom-control-input:checked ~ .custom-control-label::before { + color: #fff; + background-color: #1f3333; + box-shadow: none; } + .custom-control-input:focus ~ .custom-control-label::before { + box-shadow: 0 0 0 1px white, 0 0 0 0.2rem rgba(31, 51, 51, 0.25); } + .custom-control-input:active ~ .custom-control-label::before { + color: #fff; + background-color: #64a1a1; + box-shadow: none; } + .custom-control-input:disabled ~ .custom-control-label { + color: #6c757d; } + .custom-control-input:disabled ~ .custom-control-label::before { + background-color: #e9ecef; } + +.custom-control-label { + margin-bottom: 0; } + .custom-control-label::before { + position: absolute; + top: 0.25rem; + left: 0; + display: block; + width: 1rem; + height: 1rem; + pointer-events: none; + content: ""; + -webkit-user-select: none; + -moz-user-select: none; + -ms-user-select: none; + user-select: none; + background-color: #dee2e6; + box-shadow: inset 0 0.25rem 0.25rem rgba(0, 0, 0, 0.1); } + .custom-control-label::after { + position: absolute; + top: 0.25rem; + left: 0; + display: block; + width: 1rem; + height: 1rem; + content: ""; + background-repeat: no-repeat; + background-position: center center; + background-size: 50% 50%; } + +.custom-checkbox .custom-control-label::before { + border-radius: 0.25rem; } + +.custom-checkbox .custom-control-input:checked ~ .custom-control-label::before { + background-color: #1f3333; } + +.custom-checkbox .custom-control-input:checked ~ .custom-control-label::after { + background-image: url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 8 8'%3E%3Cpath fill='%23fff' d='M6.564.75l-3.59 3.612-1.538-1.55L0 4.26 2.974 7.25 8 2.193z'/%3E%3C/svg%3E"); } + +.custom-checkbox .custom-control-input:indeterminate ~ .custom-control-label::before { + background-color: #1f3333; + box-shadow: none; } + +.custom-checkbox .custom-control-input:indeterminate ~ .custom-control-label::after { + background-image: url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 4'%3E%3Cpath stroke='%23fff' d='M0 2h4'/%3E%3C/svg%3E"); } + +.custom-checkbox .custom-control-input:disabled:checked ~ .custom-control-label::before { + background-color: rgba(31, 51, 51, 0.5); } + +.custom-checkbox .custom-control-input:disabled:indeterminate ~ .custom-control-label::before { + background-color: rgba(31, 51, 51, 0.5); } + +.custom-radio .custom-control-label::before { + border-radius: 50%; } + +.custom-radio .custom-control-input:checked ~ .custom-control-label::before { + background-color: #1f3333; } + +.custom-radio .custom-control-input:checked ~ .custom-control-label::after { + background-image: url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='-4 -4 8 8'%3E%3Ccircle r='3' fill='%23fff'/%3E%3C/svg%3E"); } + +.custom-radio .custom-control-input:disabled:checked ~ .custom-control-label::before { + background-color: rgba(31, 51, 51, 0.5); } + +.custom-select { + display: inline-block; + width: 100%; + height: calc(2.25rem + 2px); + padding: 0.375rem 1.75rem 0.375rem 0.75rem; + line-height: 1.5; + color: #495057; + vertical-align: middle; + background: #fff url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 4 5'%3E%3Cpath fill='%23343a40' d='M2 0L0 2h4zm0 5L0 3h4z'/%3E%3C/svg%3E") no-repeat right 0.75rem center; + background-size: 8px 10px; + border: 1px solid #ced4da; + border-radius: 0.25rem; + -webkit-appearance: none; + -moz-appearance: none; + appearance: none; } + .custom-select:focus { + border-color: #4f8282; + outline: 0; + box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.075), 0 0 5px rgba(79, 130, 130, 0.5); } + .custom-select:focus::-ms-value { + color: #495057; + background-color: #fff; } + .custom-select[multiple], .custom-select[size]:not([size="1"]) { + height: auto; + padding-right: 0.75rem; + background-image: none; } + .custom-select:disabled { + color: #6c757d; + background-color: #e9ecef; } + .custom-select::-ms-expand { + opacity: 0; } + +.custom-select-sm { + height: calc(1.8125rem + 2px); + padding-top: 0.375rem; + padding-bottom: 0.375rem; + font-size: 75%; } + +.custom-select-lg { + height: calc(2.875rem + 2px); + padding-top: 0.375rem; + padding-bottom: 0.375rem; + font-size: 125%; } + +.custom-file { + position: relative; + display: inline-block; + width: 100%; + height: calc(2.25rem + 2px); + margin-bottom: 0; } + +.custom-file-input { + position: relative; + z-index: 2; + width: 100%; + height: calc(2.25rem + 2px); + margin: 0; + opacity: 0; } + .custom-file-input:focus ~ .custom-file-control { + border-color: #4f8282; + box-shadow: 0 0 0 0.2rem rgba(31, 51, 51, 0.25); } + .custom-file-input:focus ~ .custom-file-control::before { + border-color: #4f8282; } + .custom-file-input:lang(en) ~ .custom-file-label::after { + content: "Browse"; } + +.custom-file-label { + position: absolute; + top: 0; + right: 0; + left: 0; + z-index: 1; + height: calc(2.25rem + 2px); + padding: 0.375rem 0.75rem; + line-height: 1.5; + color: #495057; + background-color: #fff; + border: 1px solid #ced4da; + border-radius: 0.25rem; + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); } + .custom-file-label::after { + position: absolute; + top: 0; + right: 0; + bottom: 0; + z-index: 3; + display: block; + height: calc(calc(2.25rem + 2px) - 1px * 2); + padding: 0.375rem 0.75rem; + line-height: 1.5; + color: #495057; + content: "Browse"; + background-color: #e9ecef; + border-left: 1px solid #ced4da; + border-radius: 0 0.25rem 0.25rem 0; } + +.nav { + display: flex; + flex-wrap: wrap; + padding-left: 0; + margin-bottom: 0; + list-style: none; } + +.nav-link { + display: block; + padding: 0.5rem 1rem; } + .nav-link:hover, .nav-link:focus { + text-decoration: none; } + .nav-link.disabled { + color: #6c757d; } + +.nav-tabs { + border-bottom: 1px solid #dee2e6; } + .nav-tabs .nav-item { + margin-bottom: -1px; } + .nav-tabs .nav-link { + border: 1px solid transparent; + border-top-left-radius: 0.25rem; + border-top-right-radius: 0.25rem; } + .nav-tabs .nav-link:hover, .nav-tabs .nav-link:focus { + border-color: #e9ecef #e9ecef #dee2e6; } + .nav-tabs .nav-link.disabled { + color: #6c757d; + background-color: transparent; + border-color: transparent; } + .nav-tabs .nav-link.active, + .nav-tabs .nav-item.show .nav-link { + color: #495057; + background-color: white; + border-color: #dee2e6 #dee2e6 white; } + .nav-tabs .dropdown-menu { + margin-top: -1px; + border-top-left-radius: 0; + border-top-right-radius: 0; } + +.nav-pills .nav-link { + border-radius: 0.25rem; } + +.nav-pills .nav-link.active, +.nav-pills .show > .nav-link { + color: #fff; + background-color: #1f3333; } + +.nav-fill .nav-item { + flex: 1 1 auto; + text-align: center; } + +.nav-justified .nav-item { + flex-basis: 0; + flex-grow: 1; + text-align: center; } + +.tab-content > .tab-pane { + display: none; } + +.tab-content > .active { + display: block; } + +.navbar { + position: relative; + display: flex; + flex-wrap: wrap; + align-items: center; + justify-content: space-between; + padding: 0.75rem 1.5rem; } + .navbar > .container, + .navbar > .container-fluid { + display: flex; + flex-wrap: wrap; + align-items: center; + justify-content: space-between; } + +.navbar-brand { + display: inline-block; + padding-top: 0.3125rem; + padding-bottom: 0.3125rem; + margin-right: 1.5rem; + font-size: 1.25rem; + line-height: inherit; + white-space: nowrap; } + .navbar-brand:hover, .navbar-brand:focus { + text-decoration: none; } + +.navbar-nav { + display: flex; + flex-direction: column; + padding-left: 0; + margin-bottom: 0; + list-style: none; } + .navbar-nav .nav-link { + padding-right: 0; + padding-left: 0; } + .navbar-nav .dropdown-menu { + position: static; + float: none; } + +.navbar-text { + display: inline-block; + padding-top: 0.5rem; + padding-bottom: 0.5rem; } + +.navbar-collapse { + flex-basis: 100%; + flex-grow: 1; + align-items: center; } + +.navbar-toggler { + padding: 0.25rem 0.75rem; + font-size: 1.25rem; + line-height: 1; + background-color: transparent; + border: 1px solid transparent; + border-radius: 0.25rem; } + .navbar-toggler:hover, .navbar-toggler:focus { + text-decoration: none; } + .navbar-toggler:not(:disabled):not(.disabled) { + cursor: pointer; } + +.navbar-toggler-icon { + display: inline-block; + width: 1.5em; + height: 1.5em; + vertical-align: middle; + content: ""; + background: no-repeat center center; + background-size: 100% 100%; } + +@media (max-width: 575.98px) { + .navbar-expand-sm > .container, + .navbar-expand-sm > .container-fluid { + padding-right: 0; + padding-left: 0; } } + +@media (min-width: 576px) { + .navbar-expand-sm { + flex-flow: row nowrap; + justify-content: flex-start; } + .navbar-expand-sm .navbar-nav { + flex-direction: row; } + .navbar-expand-sm .navbar-nav .dropdown-menu { + position: absolute; } + .navbar-expand-sm .navbar-nav .dropdown-menu-right { + right: 0; + left: auto; } + .navbar-expand-sm .navbar-nav .nav-link { + padding-right: 0.5rem; + padding-left: 0.5rem; } + .navbar-expand-sm > .container, + .navbar-expand-sm > .container-fluid { + flex-wrap: nowrap; } + .navbar-expand-sm .navbar-collapse { + display: flex !important; + flex-basis: auto; } + .navbar-expand-sm .navbar-toggler { + display: none; } + .navbar-expand-sm .dropup .dropdown-menu { + top: auto; + bottom: 100%; } } + +@media (max-width: 767.98px) { + .navbar-expand-md > .container, + .navbar-expand-md > .container-fluid { + padding-right: 0; + padding-left: 0; } } + +@media (min-width: 768px) { + .navbar-expand-md { + flex-flow: row nowrap; + justify-content: flex-start; } + .navbar-expand-md .navbar-nav { + flex-direction: row; } + .navbar-expand-md .navbar-nav .dropdown-menu { + position: absolute; } + .navbar-expand-md .navbar-nav .dropdown-menu-right { + right: 0; + left: auto; } + .navbar-expand-md .navbar-nav .nav-link { + padding-right: 0.5rem; + padding-left: 0.5rem; } + .navbar-expand-md > .container, + .navbar-expand-md > .container-fluid { + flex-wrap: nowrap; } + .navbar-expand-md .navbar-collapse { + display: flex !important; + flex-basis: auto; } + .navbar-expand-md .navbar-toggler { + display: none; } + .navbar-expand-md .dropup .dropdown-menu { + top: auto; + bottom: 100%; } } + +@media (max-width: 991.98px) { + .navbar-expand-lg > .container, + .navbar-expand-lg > .container-fluid { + padding-right: 0; + padding-left: 0; } } + +@media (min-width: 992px) { + .navbar-expand-lg { + flex-flow: row nowrap; + justify-content: flex-start; } + .navbar-expand-lg .navbar-nav { + flex-direction: row; } + .navbar-expand-lg .navbar-nav .dropdown-menu { + position: absolute; } + .navbar-expand-lg .navbar-nav .dropdown-menu-right { + right: 0; + left: auto; } + .navbar-expand-lg .navbar-nav .nav-link { + padding-right: 0.5rem; + padding-left: 0.5rem; } + .navbar-expand-lg > .container, + .navbar-expand-lg > .container-fluid { + flex-wrap: nowrap; } + .navbar-expand-lg .navbar-collapse { + display: flex !important; + flex-basis: auto; } + .navbar-expand-lg .navbar-toggler { + display: none; } + .navbar-expand-lg .dropup .dropdown-menu { + top: auto; + bottom: 100%; } } + +@media (max-width: 1199.98px) { + .navbar-expand-xl > .container, + .navbar-expand-xl > .container-fluid { + padding-right: 0; + padding-left: 0; } } + +@media (min-width: 1200px) { + .navbar-expand-xl { + flex-flow: row nowrap; + justify-content: flex-start; } + .navbar-expand-xl .navbar-nav { + flex-direction: row; } + .navbar-expand-xl .navbar-nav .dropdown-menu { + position: absolute; } + .navbar-expand-xl .navbar-nav .dropdown-menu-right { + right: 0; + left: auto; } + .navbar-expand-xl .navbar-nav .nav-link { + padding-right: 0.5rem; + padding-left: 0.5rem; } + .navbar-expand-xl > .container, + .navbar-expand-xl > .container-fluid { + flex-wrap: nowrap; } + .navbar-expand-xl .navbar-collapse { + display: flex !important; + flex-basis: auto; } + .navbar-expand-xl .navbar-toggler { + display: none; } + .navbar-expand-xl .dropup .dropdown-menu { + top: auto; + bottom: 100%; } } + +.navbar-expand { + flex-flow: row nowrap; + justify-content: flex-start; } + .navbar-expand > .container, + .navbar-expand > .container-fluid { + padding-right: 0; + padding-left: 0; } + .navbar-expand .navbar-nav { + flex-direction: row; } + .navbar-expand .navbar-nav .dropdown-menu { + position: absolute; } + .navbar-expand .navbar-nav .dropdown-menu-right { + right: 0; + left: auto; } + .navbar-expand .navbar-nav .nav-link { + padding-right: 0.5rem; + padding-left: 0.5rem; } + .navbar-expand > .container, + .navbar-expand > .container-fluid { + flex-wrap: nowrap; } + .navbar-expand .navbar-collapse { + display: flex !important; + flex-basis: auto; } + .navbar-expand .navbar-toggler { + display: none; } + .navbar-expand .dropup .dropdown-menu { + top: auto; + bottom: 100%; } + +.navbar-light .navbar-brand { + color: rgba(0, 0, 0, 0.9); } + .navbar-light .navbar-brand:hover, .navbar-light .navbar-brand:focus { + color: rgba(0, 0, 0, 0.9); } + +.navbar-light .navbar-nav .nav-link { + color: rgba(0, 0, 0, 0.5); } + .navbar-light .navbar-nav .nav-link:hover, .navbar-light .navbar-nav .nav-link:focus { + color: rgba(0, 0, 0, 0.7); } + .navbar-light .navbar-nav .nav-link.disabled { + color: rgba(0, 0, 0, 0.3); } + +.navbar-light .navbar-nav .show > .nav-link, +.navbar-light .navbar-nav .active > .nav-link, +.navbar-light .navbar-nav .nav-link.show, +.navbar-light .navbar-nav .nav-link.active { + color: rgba(0, 0, 0, 0.9); } + +.navbar-light .navbar-toggler { + color: rgba(0, 0, 0, 0.5); + border-color: rgba(0, 0, 0, 0.1); } + +.navbar-light .navbar-toggler-icon { + background-image: url("data:image/svg+xml;charset=utf8,%3Csvg viewBox='0 0 30 30' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath stroke='rgba(0, 0, 0, 0.5)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 7h22M4 15h22M4 23h22'/%3E%3C/svg%3E"); } + +.navbar-light .navbar-text { + color: rgba(0, 0, 0, 0.5); } + .navbar-light .navbar-text a { + color: rgba(0, 0, 0, 0.9); } + .navbar-light .navbar-text a:hover, .navbar-light .navbar-text a:focus { + color: rgba(0, 0, 0, 0.9); } + +.navbar-dark .navbar-brand { + color: #fff; } + .navbar-dark .navbar-brand:hover, .navbar-dark .navbar-brand:focus { + color: #fff; } + +.navbar-dark .navbar-nav .nav-link { + color: rgba(255, 255, 255, 0.5); } + .navbar-dark .navbar-nav .nav-link:hover, .navbar-dark .navbar-nav .nav-link:focus { + color: rgba(255, 255, 255, 0.75); } + .navbar-dark .navbar-nav .nav-link.disabled { + color: rgba(255, 255, 255, 0.25); } + +.navbar-dark .navbar-nav .show > .nav-link, +.navbar-dark .navbar-nav .active > .nav-link, +.navbar-dark .navbar-nav .nav-link.show, +.navbar-dark .navbar-nav .nav-link.active { + color: #fff; } + +.navbar-dark .navbar-toggler { + color: rgba(255, 255, 255, 0.5); + border-color: rgba(255, 255, 255, 0.1); } + +.navbar-dark .navbar-toggler-icon { + background-image: url("data:image/svg+xml;charset=utf8,%3Csvg viewBox='0 0 30 30' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath stroke='rgba(255, 255, 255, 0.5)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 7h22M4 15h22M4 23h22'/%3E%3C/svg%3E"); } + +.navbar-dark .navbar-text { + color: rgba(255, 255, 255, 0.5); } + .navbar-dark .navbar-text a { + color: #fff; } + .navbar-dark .navbar-text a:hover, .navbar-dark .navbar-text a:focus { + color: #fff; } + +.card { + position: relative; + display: flex; + flex-direction: column; + min-width: 0; + word-wrap: break-word; + background-color: #fff; + background-clip: border-box; + border: 1px solid rgba(0, 0, 0, 0.125); + border-radius: 0.25rem; } + .card > hr { + margin-right: 0; + margin-left: 0; } + .card > .list-group:first-child .list-group-item:first-child { + border-top-left-radius: 0.25rem; + border-top-right-radius: 0.25rem; } + .card > .list-group:last-child .list-group-item:last-child { + border-bottom-right-radius: 0.25rem; + border-bottom-left-radius: 0.25rem; } + +.card-body { + flex: 1 1 auto; + padding: 1.25rem; } + +.card-title { + margin-bottom: 0.75rem; } + +.card-subtitle { + margin-top: -0.375rem; + margin-bottom: 0; } + +.card-text:last-child { + margin-bottom: 0; } + +.card-link:hover { + text-decoration: none; } + +.card-link + .card-link { + margin-left: 1.25rem; } + +.card-header { + padding: 0.75rem 1.25rem; + margin-bottom: 0; + background-color: rgba(0, 0, 0, 0.03); + border-bottom: 1px solid rgba(0, 0, 0, 0.125); } + .card-header:first-child { + border-radius: calc(0.25rem - 1px) calc(0.25rem - 1px) 0 0; } + .card-header + .list-group .list-group-item:first-child { + border-top: 0; } + +.card-footer { + padding: 0.75rem 1.25rem; + background-color: rgba(0, 0, 0, 0.03); + border-top: 1px solid rgba(0, 0, 0, 0.125); } + .card-footer:last-child { + border-radius: 0 0 calc(0.25rem - 1px) calc(0.25rem - 1px); } + +.card-header-tabs { + margin-right: -0.625rem; + margin-bottom: -0.75rem; + margin-left: -0.625rem; + border-bottom: 0; } + +.card-header-pills { + margin-right: -0.625rem; + margin-left: -0.625rem; } + +.card-img-overlay { + position: absolute; + top: 0; + right: 0; + bottom: 0; + left: 0; + padding: 1.25rem; } + +.card-img { + width: 100%; + border-radius: calc(0.25rem - 1px); } + +.card-img-top { + width: 100%; + border-top-left-radius: calc(0.25rem - 1px); + border-top-right-radius: calc(0.25rem - 1px); } + +.card-img-bottom { + width: 100%; + border-bottom-right-radius: calc(0.25rem - 1px); + border-bottom-left-radius: calc(0.25rem - 1px); } + +.card-deck { + display: flex; + flex-direction: column; } + .card-deck .card { + margin-bottom: 15px; } + @media (min-width: 576px) { + .card-deck { + flex-flow: row wrap; + margin-right: -15px; + margin-left: -15px; } + .card-deck .card { + display: flex; + flex: 1 0 0%; + flex-direction: column; + margin-right: 15px; + margin-bottom: 0; + margin-left: 15px; } } + +.card-group { + display: flex; + flex-direction: column; } + .card-group > .card { + margin-bottom: 15px; } + @media (min-width: 576px) { + .card-group { + flex-flow: row wrap; } + .card-group > .card { + flex: 1 0 0%; + margin-bottom: 0; } + .card-group > .card + .card { + margin-left: 0; + border-left: 0; } + .card-group > .card:first-child { + border-top-right-radius: 0; + border-bottom-right-radius: 0; } + .card-group > .card:first-child .card-img-top, + .card-group > .card:first-child .card-header { + border-top-right-radius: 0; } + .card-group > .card:first-child .card-img-bottom, + .card-group > .card:first-child .card-footer { + border-bottom-right-radius: 0; } + .card-group > .card:last-child { + border-top-left-radius: 0; + border-bottom-left-radius: 0; } + .card-group > .card:last-child .card-img-top, + .card-group > .card:last-child .card-header { + border-top-left-radius: 0; } + .card-group > .card:last-child .card-img-bottom, + .card-group > .card:last-child .card-footer { + border-bottom-left-radius: 0; } + .card-group > .card:only-child { + border-radius: 0.25rem; } + .card-group > .card:only-child .card-img-top, + .card-group > .card:only-child .card-header { + border-top-left-radius: 0.25rem; + border-top-right-radius: 0.25rem; } + .card-group > .card:only-child .card-img-bottom, + .card-group > .card:only-child .card-footer { + border-bottom-right-radius: 0.25rem; + border-bottom-left-radius: 0.25rem; } + .card-group > .card:not(:first-child):not(:last-child):not(:only-child) { + border-radius: 0; } + .card-group > .card:not(:first-child):not(:last-child):not(:only-child) .card-img-top, + .card-group > .card:not(:first-child):not(:last-child):not(:only-child) .card-img-bottom, + .card-group > .card:not(:first-child):not(:last-child):not(:only-child) .card-header, + .card-group > .card:not(:first-child):not(:last-child):not(:only-child) .card-footer { + border-radius: 0; } } + +.card-columns .card { + margin-bottom: 0.75rem; } + +@media (min-width: 576px) { + .card-columns { + -webkit-column-count: 3; + column-count: 3; + -webkit-column-gap: 1.25rem; + column-gap: 1.25rem; } + .card-columns .card { + display: inline-block; + width: 100%; } } + +.breadcrumb { + display: flex; + flex-wrap: wrap; + padding: 0.75rem 1rem; + margin-bottom: 1rem; + list-style: none; + background-color: #e9ecef; + border-radius: 0.25rem; } + +.breadcrumb-item + .breadcrumb-item::before { + display: inline-block; + padding-right: 0.5rem; + padding-left: 0.5rem; + color: #6c757d; + content: "/"; } + +.breadcrumb-item + .breadcrumb-item:hover::before { + text-decoration: underline; } + +.breadcrumb-item + .breadcrumb-item:hover::before { + text-decoration: none; } + +.breadcrumb-item.active { + color: #6c757d; } + +.pagination { + display: flex; + padding-left: 0; + list-style: none; + border-radius: 0.25rem; } + +.page-link { + position: relative; + display: block; + padding: 0.5rem 0.75rem; + margin-left: -1px; + line-height: 1.25; + color: #1b807c; + background-color: #fff; + border: 1px solid #dee2e6; } + .page-link:hover { + color: #0e413f; + text-decoration: none; + background-color: #e9ecef; + border-color: #dee2e6; } + .page-link:focus { + z-index: 2; + outline: 0; + box-shadow: 0 0 0 0.2rem rgba(31, 51, 51, 0.25); } + .page-link:not(:disabled):not(.disabled) { + cursor: pointer; } + +.page-item:first-child .page-link { + margin-left: 0; + border-top-left-radius: 0.25rem; + border-bottom-left-radius: 0.25rem; } + +.page-item:last-child .page-link { + border-top-right-radius: 0.25rem; + border-bottom-right-radius: 0.25rem; } + +.page-item.active .page-link { + z-index: 1; + color: #fff; + background-color: #1f3333; + border-color: #1f3333; } + +.page-item.disabled .page-link { + color: #6c757d; + pointer-events: none; + cursor: auto; + background-color: #fff; + border-color: #dee2e6; } + +.pagination-lg .page-link { + padding: 0.75rem 1.5rem; + font-size: 1.25rem; + line-height: 1.5; } + +.pagination-lg .page-item:first-child .page-link { + border-top-left-radius: 0.3rem; + border-bottom-left-radius: 0.3rem; } + +.pagination-lg .page-item:last-child .page-link { + border-top-right-radius: 0.3rem; + border-bottom-right-radius: 0.3rem; } + +.pagination-sm .page-link { + padding: 0.25rem 0.5rem; + font-size: 0.875rem; + line-height: 1.5; } + +.pagination-sm .page-item:first-child .page-link { + border-top-left-radius: 0.2rem; + border-bottom-left-radius: 0.2rem; } + +.pagination-sm .page-item:last-child .page-link { + border-top-right-radius: 0.2rem; + border-bottom-right-radius: 0.2rem; } + +.badge { + display: inline-block; + padding: 0.25em 0.4em; + font-size: 75%; + font-weight: 700; + line-height: 1; + text-align: center; + white-space: nowrap; + vertical-align: baseline; + border-radius: 0.25rem; } + .badge:empty { + display: none; } + +.btn .badge { + position: relative; + top: -1px; } + +.badge-pill { + padding-right: 0.6em; + padding-left: 0.6em; + border-radius: 10rem; } + +.badge-primary { + color: #fff; + background-color: #1f3333; } + .badge-primary[href]:hover, .badge-primary[href]:focus { + color: #fff; + text-decoration: none; + background-color: #0c1313; } + +.badge-secondary { + color: #fff; + background-color: #1b807c; } + .badge-secondary[href]:hover, .badge-secondary[href]:focus { + color: #fff; + text-decoration: none; + background-color: #125653; } + +.badge-success { + color: #fff; + background-color: #28a745; } + .badge-success[href]:hover, .badge-success[href]:focus { + color: #fff; + text-decoration: none; + background-color: #1e7e34; } + +.badge-info { + color: #212529; + background-color: #ccc; } + .badge-info[href]:hover, .badge-info[href]:focus { + color: #212529; + text-decoration: none; + background-color: #b3b3b3; } + +.badge-warning { + color: #212529; + background-color: #ffc107; } + .badge-warning[href]:hover, .badge-warning[href]:focus { + color: #212529; + text-decoration: none; + background-color: #d39e00; } + +.badge-danger { + color: #fff; + background-color: #dc3545; } + .badge-danger[href]:hover, .badge-danger[href]:focus { + color: #fff; + text-decoration: none; + background-color: #bd2130; } + +.badge-light { + color: #212529; + background-color: #f3f3f3; } + .badge-light[href]:hover, .badge-light[href]:focus { + color: #212529; + text-decoration: none; + background-color: #dadada; } + +.badge-dark { + color: #fff; + background-color: #151515; } + .badge-dark[href]:hover, .badge-dark[href]:focus { + color: #fff; + text-decoration: none; + background-color: black; } + +.jumbotron { + padding: 2rem 1rem; + margin-bottom: 2rem; + background-color: #e9ecef; + border-radius: 0.3rem; } + @media (min-width: 576px) { + .jumbotron { + padding: 4rem 2rem; } } + +.jumbotron-fluid { + padding-right: 0; + padding-left: 0; + border-radius: 0; } + +.alert { + position: relative; + padding: 0.75rem 1.25rem; + margin-bottom: 1rem; + border: 1px solid transparent; + border-radius: 0.25rem; } + +.alert-heading { + color: inherit; } + +.alert-link { + font-weight: 700; } + +.alert-dismissible { + padding-right: 4rem; } + .alert-dismissible .close { + position: absolute; + top: 0; + right: 0; + padding: 0.75rem 1.25rem; + color: inherit; } + +.alert-primary { + color: #101b1b; + background-color: #d2d6d6; + border-color: #c0c6c6; } + .alert-primary hr { + border-top-color: #b3baba; } + .alert-primary .alert-link { + color: black; } + +.alert-secondary { + color: #0e4340; + background-color: #d1e6e5; + border-color: #bfdbda; } + .alert-secondary hr { + border-top-color: #afd2d1; } + .alert-secondary .alert-link { + color: #051918; } + +.alert-success { + color: #155724; + background-color: #d4edda; + border-color: #c3e6cb; } + .alert-success hr { + border-top-color: #b1dfbb; } + .alert-success .alert-link { + color: #0b2e13; } + +.alert-info { + color: #6a6a6a; + background-color: whitesmoke; + border-color: #f1f1f1; } + .alert-info hr { + border-top-color: #e4e4e4; } + .alert-info .alert-link { + color: #515151; } + +.alert-warning { + color: #856404; + background-color: #fff3cd; + border-color: #ffeeba; } + .alert-warning hr { + border-top-color: #ffe8a1; } + .alert-warning .alert-link { + color: #533f03; } + +.alert-danger { + color: #721c24; + background-color: #f8d7da; + border-color: #f5c6cb; } + .alert-danger hr { + border-top-color: #f1b0b7; } + .alert-danger .alert-link { + color: #491217; } + +.alert-light { + color: #7e7e7e; + background-color: #fdfdfd; + border-color: #fcfcfc; } + .alert-light hr { + border-top-color: #efefef; } + .alert-light .alert-link { + color: #656565; } + +.alert-dark { + color: #0b0b0b; + background-color: #d0d0d0; + border-color: #bdbdbd; } + .alert-dark hr { + border-top-color: #b0b0b0; } + .alert-dark .alert-link { + color: black; } + +@-webkit-keyframes progress-bar-stripes { + from { + background-position: 1rem 0; } + to { + background-position: 0 0; } } + +@keyframes progress-bar-stripes { + from { + background-position: 1rem 0; } + to { + background-position: 0 0; } } + +.progress { + display: flex; + height: 1rem; + overflow: hidden; + font-size: 0.75rem; + background-color: #e9ecef; + border-radius: 0.25rem; + box-shadow: inset 0 0.1rem 0.1rem rgba(0, 0, 0, 0.1); } + +.progress-bar { + display: flex; + flex-direction: column; + justify-content: center; + color: #fff; + text-align: center; + background-color: #1f3333; + transition: width 0.6s ease; } + +.progress-bar-striped { + background-image: linear-gradient(45deg, rgba(255, 255, 255, 0.15) 25%, transparent 25%, transparent 50%, rgba(255, 255, 255, 0.15) 50%, rgba(255, 255, 255, 0.15) 75%, transparent 75%, transparent); + background-size: 1rem 1rem; } + +.progress-bar-animated { + -webkit-animation: progress-bar-stripes 1s linear infinite; + animation: progress-bar-stripes 1s linear infinite; } + +.media { + display: flex; + align-items: flex-start; } + +.media-body { + flex: 1; } + +.list-group { + display: flex; + flex-direction: column; + padding-left: 0; + margin-bottom: 0; } + +.list-group-item-action { + width: 100%; + color: #495057; + text-align: inherit; } + .list-group-item-action:hover, .list-group-item-action:focus { + color: #495057; + text-decoration: none; + background-color: #f8f9fa; } + .list-group-item-action:active { + color: #1f3333; + background-color: #e9ecef; } + +.list-group-item { + position: relative; + display: block; + padding: 0.75rem 1.25rem; + margin-bottom: -1px; + background-color: #fff; + border: 1px solid rgba(0, 0, 0, 0.125); } + .list-group-item:first-child { + border-top-left-radius: 0.25rem; + border-top-right-radius: 0.25rem; } + .list-group-item:last-child { + margin-bottom: 0; + border-bottom-right-radius: 0.25rem; + border-bottom-left-radius: 0.25rem; } + .list-group-item:hover, .list-group-item:focus { + z-index: 1; + text-decoration: none; } + .list-group-item.disabled, .list-group-item:disabled { + color: #6c757d; + background-color: #fff; } + .list-group-item.active { + z-index: 2; + color: #fff; + background-color: #1f3333; + border-color: #1f3333; } + +.list-group-flush .list-group-item { + border-right: 0; + border-left: 0; + border-radius: 0; } + +.list-group-flush:first-child .list-group-item:first-child { + border-top: 0; } + +.list-group-flush:last-child .list-group-item:last-child { + border-bottom: 0; } + +.list-group-item-primary { + color: #101b1b; + background-color: #c0c6c6; } + .list-group-item-primary.list-group-item-action:hover, .list-group-item-primary.list-group-item-action:focus { + color: #101b1b; + background-color: #b3baba; } + .list-group-item-primary.list-group-item-action.active { + color: #fff; + background-color: #101b1b; + border-color: #101b1b; } + +.list-group-item-secondary { + color: #0e4340; + background-color: #bfdbda; } + .list-group-item-secondary.list-group-item-action:hover, .list-group-item-secondary.list-group-item-action:focus { + color: #0e4340; + background-color: #afd2d1; } + .list-group-item-secondary.list-group-item-action.active { + color: #fff; + background-color: #0e4340; + border-color: #0e4340; } + +.list-group-item-success { + color: #155724; + background-color: #c3e6cb; } + .list-group-item-success.list-group-item-action:hover, .list-group-item-success.list-group-item-action:focus { + color: #155724; + background-color: #b1dfbb; } + .list-group-item-success.list-group-item-action.active { + color: #fff; + background-color: #155724; + border-color: #155724; } + +.list-group-item-info { + color: #6a6a6a; + background-color: #f1f1f1; } + .list-group-item-info.list-group-item-action:hover, .list-group-item-info.list-group-item-action:focus { + color: #6a6a6a; + background-color: #e4e4e4; } + .list-group-item-info.list-group-item-action.active { + color: #fff; + background-color: #6a6a6a; + border-color: #6a6a6a; } + +.list-group-item-warning { + color: #856404; + background-color: #ffeeba; } + .list-group-item-warning.list-group-item-action:hover, .list-group-item-warning.list-group-item-action:focus { + color: #856404; + background-color: #ffe8a1; } + .list-group-item-warning.list-group-item-action.active { + color: #fff; + background-color: #856404; + border-color: #856404; } + +.list-group-item-danger { + color: #721c24; + background-color: #f5c6cb; } + .list-group-item-danger.list-group-item-action:hover, .list-group-item-danger.list-group-item-action:focus { + color: #721c24; + background-color: #f1b0b7; } + .list-group-item-danger.list-group-item-action.active { + color: #fff; + background-color: #721c24; + border-color: #721c24; } + +.list-group-item-light { + color: #7e7e7e; + background-color: #fcfcfc; } + .list-group-item-light.list-group-item-action:hover, .list-group-item-light.list-group-item-action:focus { + color: #7e7e7e; + background-color: #efefef; } + .list-group-item-light.list-group-item-action.active { + color: #fff; + background-color: #7e7e7e; + border-color: #7e7e7e; } + +.list-group-item-dark { + color: #0b0b0b; + background-color: #bdbdbd; } + .list-group-item-dark.list-group-item-action:hover, .list-group-item-dark.list-group-item-action:focus { + color: #0b0b0b; + background-color: #b0b0b0; } + .list-group-item-dark.list-group-item-action.active { + color: #fff; + background-color: #0b0b0b; + border-color: #0b0b0b; } + +.close { + float: right; + font-size: 1.5rem; + font-weight: 700; + line-height: 1; + color: #000; + text-shadow: 0 1px 0 #fff; + opacity: .5; } + .close:hover, .close:focus { + color: #000; + text-decoration: none; + opacity: .75; } + .close:not(:disabled):not(.disabled) { + cursor: pointer; } + +button.close { + padding: 0; + background-color: transparent; + border: 0; + -webkit-appearance: none; } + +.modal-open { + overflow: hidden; } + +.modal { + position: fixed; + top: 0; + right: 0; + bottom: 0; + left: 0; + z-index: 1050; + display: none; + overflow: hidden; + outline: 0; } + .modal-open .modal { + overflow-x: hidden; + overflow-y: auto; } + +.modal-dialog { + position: relative; + width: auto; + margin: 0.5rem; + pointer-events: none; } + .modal.fade .modal-dialog { + transition: -webkit-transform 0.3s ease-out; + transition: transform 0.3s ease-out; + transition: transform 0.3s ease-out, -webkit-transform 0.3s ease-out; + -webkit-transform: translate(0, -25%); + transform: translate(0, -25%); } + .modal.show .modal-dialog { + -webkit-transform: translate(0, 0); + transform: translate(0, 0); } + +.modal-dialog-centered { + display: flex; + align-items: center; + min-height: calc(100% - (0.5rem * 2)); } + +.modal-content { + position: relative; + display: flex; + flex-direction: column; + width: 100%; + pointer-events: auto; + background-color: #fff; + background-clip: padding-box; + border: 1px solid rgba(0, 0, 0, 0.2); + border-radius: 0.3rem; + box-shadow: 0 0.25rem 0.5rem rgba(0, 0, 0, 0.5); + outline: 0; } + +.modal-backdrop { + position: fixed; + top: 0; + right: 0; + bottom: 0; + left: 0; + z-index: 1040; + background-color: #000; } + .modal-backdrop.fade { + opacity: 0; } + .modal-backdrop.show { + opacity: 0.5; } + +.modal-header { + display: flex; + align-items: flex-start; + justify-content: space-between; + padding: 1rem; + border-bottom: 1px solid #e9ecef; + border-top-left-radius: 0.3rem; + border-top-right-radius: 0.3rem; } + .modal-header .close { + padding: 1rem; + margin: -1rem -1rem -1rem auto; } + +.modal-title { + margin-bottom: 0; + line-height: 1.5; } + +.modal-body { + position: relative; + flex: 1 1 auto; + padding: 1rem; } + +.modal-footer { + display: flex; + align-items: center; + justify-content: flex-end; + padding: 1rem; + border-top: 1px solid #e9ecef; } + .modal-footer > :not(:first-child) { + margin-left: .25rem; } + .modal-footer > :not(:last-child) { + margin-right: .25rem; } + +.modal-scrollbar-measure { + position: absolute; + top: -9999px; + width: 50px; + height: 50px; + overflow: scroll; } + +@media (min-width: 576px) { + .modal-dialog { + max-width: 500px; + margin: 1.75rem auto; } + .modal-dialog-centered { + min-height: calc(100% - (1.75rem * 2)); } + .modal-content { + box-shadow: 0 0.5rem 1rem rgba(0, 0, 0, 0.5); } + .modal-sm { + max-width: 300px; } } + +@media (min-width: 992px) { + .modal-lg { + max-width: 800px; } } + +.tooltip { + position: absolute; + z-index: 1070; + display: block; + margin: 0; + font-family: "Noto Sans"; + font-style: normal; + font-weight: 200; + line-height: 1.5; + text-align: left; + text-align: start; + text-decoration: none; + text-shadow: none; + text-transform: none; + letter-spacing: normal; + word-break: normal; + word-spacing: normal; + white-space: normal; + line-break: auto; + font-size: 0.875rem; + word-wrap: break-word; + opacity: 0; } + .tooltip.show { + opacity: 0.9; } + .tooltip .arrow { + position: absolute; + display: block; + width: 0.8rem; + height: 0.4rem; } + .tooltip .arrow::before { + position: absolute; + content: ""; + border-color: transparent; + border-style: solid; } + +.bs-tooltip-top, .bs-tooltip-auto[x-placement^="top"] { + padding: 0.4rem 0; } + .bs-tooltip-top .arrow, .bs-tooltip-auto[x-placement^="top"] .arrow { + bottom: 0; } + .bs-tooltip-top .arrow::before, .bs-tooltip-auto[x-placement^="top"] .arrow::before { + top: 0; + border-width: 0.4rem 0.4rem 0; + border-top-color: #000; } + +.bs-tooltip-right, .bs-tooltip-auto[x-placement^="right"] { + padding: 0 0.4rem; } + .bs-tooltip-right .arrow, .bs-tooltip-auto[x-placement^="right"] .arrow { + left: 0; + width: 0.4rem; + height: 0.8rem; } + .bs-tooltip-right .arrow::before, .bs-tooltip-auto[x-placement^="right"] .arrow::before { + right: 0; + border-width: 0.4rem 0.4rem 0.4rem 0; + border-right-color: #000; } + +.bs-tooltip-bottom, .bs-tooltip-auto[x-placement^="bottom"] { + padding: 0.4rem 0; } + .bs-tooltip-bottom .arrow, .bs-tooltip-auto[x-placement^="bottom"] .arrow { + top: 0; } + .bs-tooltip-bottom .arrow::before, .bs-tooltip-auto[x-placement^="bottom"] .arrow::before { + bottom: 0; + border-width: 0 0.4rem 0.4rem; + border-bottom-color: #000; } + +.bs-tooltip-left, .bs-tooltip-auto[x-placement^="left"] { + padding: 0 0.4rem; } + .bs-tooltip-left .arrow, .bs-tooltip-auto[x-placement^="left"] .arrow { + right: 0; + width: 0.4rem; + height: 0.8rem; } + .bs-tooltip-left .arrow::before, .bs-tooltip-auto[x-placement^="left"] .arrow::before { + left: 0; + border-width: 0.4rem 0 0.4rem 0.4rem; + border-left-color: #000; } + +.tooltip-inner { + max-width: 200px; + padding: 0.25rem 0.5rem; + color: #fff; + text-align: center; + background-color: #000; + border-radius: 0.25rem; } + +.popover { + position: absolute; + top: 0; + left: 0; + z-index: 1060; + display: block; + max-width: 276px; + font-family: "Noto Sans"; + font-style: normal; + font-weight: 200; + line-height: 1.5; + text-align: left; + text-align: start; + text-decoration: none; + text-shadow: none; + text-transform: none; + letter-spacing: normal; + word-break: normal; + word-spacing: normal; + white-space: normal; + line-break: auto; + font-size: 0.875rem; + word-wrap: break-word; + background-color: #fff; + background-clip: padding-box; + border: 1px solid rgba(0, 0, 0, 0.2); + border-radius: 0.3rem; + box-shadow: 0 0.25rem 0.5rem rgba(0, 0, 0, 0.2); } + .popover .arrow { + position: absolute; + display: block; + width: 1rem; + height: 0.5rem; + margin: 0 0.3rem; } + .popover .arrow::before, .popover .arrow::after { + position: absolute; + display: block; + content: ""; + border-color: transparent; + border-style: solid; } + +.bs-popover-top, .bs-popover-auto[x-placement^="top"] { + margin-bottom: 0.5rem; } + .bs-popover-top .arrow, .bs-popover-auto[x-placement^="top"] .arrow { + bottom: calc((0.5rem + 1px) * -1); } + .bs-popover-top .arrow::before, .bs-popover-auto[x-placement^="top"] .arrow::before, + .bs-popover-top .arrow::after, + .bs-popover-auto[x-placement^="top"] .arrow::after { + border-width: 0.5rem 0.5rem 0; } + .bs-popover-top .arrow::before, .bs-popover-auto[x-placement^="top"] .arrow::before { + bottom: 0; + border-top-color: rgba(0, 0, 0, 0.25); } + + .bs-popover-top .arrow::after, + .bs-popover-auto[x-placement^="top"] .arrow::after { + bottom: 1px; + border-top-color: #fff; } + +.bs-popover-right, .bs-popover-auto[x-placement^="right"] { + margin-left: 0.5rem; } + .bs-popover-right .arrow, .bs-popover-auto[x-placement^="right"] .arrow { + left: calc((0.5rem + 1px) * -1); + width: 0.5rem; + height: 1rem; + margin: 0.3rem 0; } + .bs-popover-right .arrow::before, .bs-popover-auto[x-placement^="right"] .arrow::before, + .bs-popover-right .arrow::after, + .bs-popover-auto[x-placement^="right"] .arrow::after { + border-width: 0.5rem 0.5rem 0.5rem 0; } + .bs-popover-right .arrow::before, .bs-popover-auto[x-placement^="right"] .arrow::before { + left: 0; + border-right-color: rgba(0, 0, 0, 0.25); } + + .bs-popover-right .arrow::after, + .bs-popover-auto[x-placement^="right"] .arrow::after { + left: 1px; + border-right-color: #fff; } + +.bs-popover-bottom, .bs-popover-auto[x-placement^="bottom"] { + margin-top: 0.5rem; } + .bs-popover-bottom .arrow, .bs-popover-auto[x-placement^="bottom"] .arrow { + top: calc((0.5rem + 1px) * -1); } + .bs-popover-bottom .arrow::before, .bs-popover-auto[x-placement^="bottom"] .arrow::before, + .bs-popover-bottom .arrow::after, + .bs-popover-auto[x-placement^="bottom"] .arrow::after { + border-width: 0 0.5rem 0.5rem 0.5rem; } + .bs-popover-bottom .arrow::before, .bs-popover-auto[x-placement^="bottom"] .arrow::before { + top: 0; + border-bottom-color: rgba(0, 0, 0, 0.25); } + + .bs-popover-bottom .arrow::after, + .bs-popover-auto[x-placement^="bottom"] .arrow::after { + top: 1px; + border-bottom-color: #fff; } + .bs-popover-bottom .popover-header::before, .bs-popover-auto[x-placement^="bottom"] .popover-header::before { + position: absolute; + top: 0; + left: 50%; + display: block; + width: 1rem; + margin-left: -0.5rem; + content: ""; + border-bottom: 1px solid #f7f7f7; } + +.bs-popover-left, .bs-popover-auto[x-placement^="left"] { + margin-right: 0.5rem; } + .bs-popover-left .arrow, .bs-popover-auto[x-placement^="left"] .arrow { + right: calc((0.5rem + 1px) * -1); + width: 0.5rem; + height: 1rem; + margin: 0.3rem 0; } + .bs-popover-left .arrow::before, .bs-popover-auto[x-placement^="left"] .arrow::before, + .bs-popover-left .arrow::after, + .bs-popover-auto[x-placement^="left"] .arrow::after { + border-width: 0.5rem 0 0.5rem 0.5rem; } + .bs-popover-left .arrow::before, .bs-popover-auto[x-placement^="left"] .arrow::before { + right: 0; + border-left-color: rgba(0, 0, 0, 0.25); } + + .bs-popover-left .arrow::after, + .bs-popover-auto[x-placement^="left"] .arrow::after { + right: 1px; + border-left-color: #fff; } + +.popover-header { + padding: 0.5rem 0.75rem; + margin-bottom: 0; + font-size: 1rem; + color: inherit; + background-color: #f7f7f7; + border-bottom: 1px solid #ebebeb; + border-top-left-radius: calc(0.3rem - 1px); + border-top-right-radius: calc(0.3rem - 1px); } + .popover-header:empty { + display: none; } + +.popover-body { + padding: 0.5rem 0.75rem; + color: #1f3333; } + +.carousel { + position: relative; } + +.carousel-inner { + position: relative; + width: 100%; + overflow: hidden; } + +.carousel-item { + position: relative; + display: none; + align-items: center; + width: 100%; + transition: -webkit-transform 0.6s ease; + transition: transform 0.6s ease; + transition: transform 0.6s ease, -webkit-transform 0.6s ease; + -webkit-backface-visibility: hidden; + backface-visibility: hidden; + -webkit-perspective: 1000px; + perspective: 1000px; } + +.carousel-item.active, +.carousel-item-next, +.carousel-item-prev { + display: block; } + +.carousel-item-next, +.carousel-item-prev { + position: absolute; + top: 0; } + +.carousel-item-next.carousel-item-left, +.carousel-item-prev.carousel-item-right { + -webkit-transform: translateX(0); + transform: translateX(0); } + @supports ((-webkit-transform-style: preserve-3d) or (transform-style: preserve-3d)) { + .carousel-item-next.carousel-item-left, + .carousel-item-prev.carousel-item-right { + -webkit-transform: translate3d(0, 0, 0); + transform: translate3d(0, 0, 0); } } + +.carousel-item-next, +.active.carousel-item-right { + -webkit-transform: translateX(100%); + transform: translateX(100%); } + @supports ((-webkit-transform-style: preserve-3d) or (transform-style: preserve-3d)) { + .carousel-item-next, + .active.carousel-item-right { + -webkit-transform: translate3d(100%, 0, 0); + transform: translate3d(100%, 0, 0); } } + +.carousel-item-prev, +.active.carousel-item-left { + -webkit-transform: translateX(-100%); + transform: translateX(-100%); } + @supports ((-webkit-transform-style: preserve-3d) or (transform-style: preserve-3d)) { + .carousel-item-prev, + .active.carousel-item-left { + -webkit-transform: translate3d(-100%, 0, 0); + transform: translate3d(-100%, 0, 0); } } + +.carousel-control-prev, +.carousel-control-next { + position: absolute; + top: 0; + bottom: 0; + display: flex; + align-items: center; + justify-content: center; + width: 15%; + color: #fff; + text-align: center; + opacity: 0.5; } + .carousel-control-prev:hover, .carousel-control-prev:focus, + .carousel-control-next:hover, + .carousel-control-next:focus { + color: #fff; + text-decoration: none; + outline: 0; + opacity: .9; } + +.carousel-control-prev { + left: 0; } + +.carousel-control-next { + right: 0; } + +.carousel-control-prev-icon, +.carousel-control-next-icon { + display: inline-block; + width: 20px; + height: 20px; + background: transparent no-repeat center center; + background-size: 100% 100%; } + +.carousel-control-prev-icon { + background-image: url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' fill='%23fff' viewBox='0 0 8 8'%3E%3Cpath d='M5.25 0l-4 4 4 4 1.5-1.5-2.5-2.5 2.5-2.5-1.5-1.5z'/%3E%3C/svg%3E"); } + +.carousel-control-next-icon { + background-image: url("data:image/svg+xml;charset=utf8,%3Csvg xmlns='http://www.w3.org/2000/svg' fill='%23fff' viewBox='0 0 8 8'%3E%3Cpath d='M2.75 0l-1.5 1.5 2.5 2.5-2.5 2.5 1.5 1.5 4-4-4-4z'/%3E%3C/svg%3E"); } + +.carousel-indicators { + position: absolute; + right: 0; + bottom: 10px; + left: 0; + z-index: 15; + display: flex; + justify-content: center; + padding-left: 0; + margin-right: 15%; + margin-left: 15%; + list-style: none; } + .carousel-indicators li { + position: relative; + flex: 0 1 auto; + width: 30px; + height: 3px; + margin-right: 3px; + margin-left: 3px; + text-indent: -999px; + background-color: rgba(255, 255, 255, 0.5); } + .carousel-indicators li::before { + position: absolute; + top: -10px; + left: 0; + display: inline-block; + width: 100%; + height: 10px; + content: ""; } + .carousel-indicators li::after { + position: absolute; + bottom: -10px; + left: 0; + display: inline-block; + width: 100%; + height: 10px; + content: ""; } + .carousel-indicators .active { + background-color: #fff; } + +.carousel-caption { + position: absolute; + right: 15%; + bottom: 20px; + left: 15%; + z-index: 10; + padding-top: 20px; + padding-bottom: 20px; + color: #fff; + text-align: center; } + +.align-baseline { + vertical-align: baseline !important; } + +.align-top { + vertical-align: top !important; } + +.align-middle { + vertical-align: middle !important; } + +.align-bottom { + vertical-align: bottom !important; } + +.align-text-bottom { + vertical-align: text-bottom !important; } + +.align-text-top { + vertical-align: text-top !important; } + +.bg-primary { + background-color: #1f3333 !important; } + +a.bg-primary:hover, a.bg-primary:focus, +button.bg-primary:hover, +button.bg-primary:focus { + background-color: #0c1313 !important; } + +.bg-secondary { + background-color: #1b807c !important; } + +a.bg-secondary:hover, a.bg-secondary:focus, +button.bg-secondary:hover, +button.bg-secondary:focus { + background-color: #125653 !important; } + +.bg-success { + background-color: #28a745 !important; } + +a.bg-success:hover, a.bg-success:focus, +button.bg-success:hover, +button.bg-success:focus { + background-color: #1e7e34 !important; } + +.bg-info { + background-color: #ccc !important; } + +a.bg-info:hover, a.bg-info:focus, +button.bg-info:hover, +button.bg-info:focus { + background-color: #b3b3b3 !important; } + +.bg-warning { + background-color: #ffc107 !important; } + +a.bg-warning:hover, a.bg-warning:focus, +button.bg-warning:hover, +button.bg-warning:focus { + background-color: #d39e00 !important; } + +.bg-danger { + background-color: #dc3545 !important; } + +a.bg-danger:hover, a.bg-danger:focus, +button.bg-danger:hover, +button.bg-danger:focus { + background-color: #bd2130 !important; } + +.bg-light { + background-color: #f3f3f3 !important; } + +a.bg-light:hover, a.bg-light:focus, +button.bg-light:hover, +button.bg-light:focus { + background-color: #dadada !important; } + +.bg-dark { + background-color: #151515 !important; } + +a.bg-dark:hover, a.bg-dark:focus, +button.bg-dark:hover, +button.bg-dark:focus { + background-color: black !important; } + +.bg-white { + background-color: #fff !important; } + +.bg-transparent { + background-color: transparent !important; } + +.border { + border: 1px solid #dee2e6 !important; } + +.border-top { + border-top: 1px solid #dee2e6 !important; } + +.border-right { + border-right: 1px solid #dee2e6 !important; } + +.border-bottom { + border-bottom: 1px solid #dee2e6 !important; } + +.border-left { + border-left: 1px solid #dee2e6 !important; } + +.border-0 { + border: 0 !important; } + +.border-top-0 { + border-top: 0 !important; } + +.border-right-0 { + border-right: 0 !important; } + +.border-bottom-0 { + border-bottom: 0 !important; } + +.border-left-0 { + border-left: 0 !important; } + +.border-primary { + border-color: #1f3333 !important; } + +.border-secondary { + border-color: #1b807c !important; } + +.border-success { + border-color: #28a745 !important; } + +.border-info { + border-color: #ccc !important; } + +.border-warning { + border-color: #ffc107 !important; } + +.border-danger { + border-color: #dc3545 !important; } + +.border-light { + border-color: #f3f3f3 !important; } + +.border-dark { + border-color: #151515 !important; } + +.border-white { + border-color: #fff !important; } + +.rounded { + border-radius: 0.25rem !important; } + +.rounded-top { + border-top-left-radius: 0.25rem !important; + border-top-right-radius: 0.25rem !important; } + +.rounded-right { + border-top-right-radius: 0.25rem !important; + border-bottom-right-radius: 0.25rem !important; } + +.rounded-bottom { + border-bottom-right-radius: 0.25rem !important; + border-bottom-left-radius: 0.25rem !important; } + +.rounded-left { + border-top-left-radius: 0.25rem !important; + border-bottom-left-radius: 0.25rem !important; } + +.rounded-circle { + border-radius: 50% !important; } + +.rounded-0 { + border-radius: 0 !important; } + +.clearfix::after { + display: block; + clear: both; + content: ""; } + +.d-none { + display: none !important; } + +.d-inline { + display: inline !important; } + +.d-inline-block { + display: inline-block !important; } + +.d-block { + display: block !important; } + +.d-table { + display: table !important; } + +.d-table-row { + display: table-row !important; } + +.d-table-cell { + display: table-cell !important; } + +.d-flex { + display: flex !important; } + +.d-inline-flex { + display: inline-flex !important; } + +@media (min-width: 576px) { + .d-sm-none { + display: none !important; } + .d-sm-inline { + display: inline !important; } + .d-sm-inline-block { + display: inline-block !important; } + .d-sm-block { + display: block !important; } + .d-sm-table { + display: table !important; } + .d-sm-table-row { + display: table-row !important; } + .d-sm-table-cell { + display: table-cell !important; } + .d-sm-flex { + display: flex !important; } + .d-sm-inline-flex { + display: inline-flex !important; } } + +@media (min-width: 768px) { + .d-md-none { + display: none !important; } + .d-md-inline { + display: inline !important; } + .d-md-inline-block { + display: inline-block !important; } + .d-md-block { + display: block !important; } + .d-md-table { + display: table !important; } + .d-md-table-row { + display: table-row !important; } + .d-md-table-cell { + display: table-cell !important; } + .d-md-flex { + display: flex !important; } + .d-md-inline-flex { + display: inline-flex !important; } } + +@media (min-width: 992px) { + .d-lg-none { + display: none !important; } + .d-lg-inline { + display: inline !important; } + .d-lg-inline-block { + display: inline-block !important; } + .d-lg-block { + display: block !important; } + .d-lg-table { + display: table !important; } + .d-lg-table-row { + display: table-row !important; } + .d-lg-table-cell { + display: table-cell !important; } + .d-lg-flex { + display: flex !important; } + .d-lg-inline-flex { + display: inline-flex !important; } } + +@media (min-width: 1200px) { + .d-xl-none { + display: none !important; } + .d-xl-inline { + display: inline !important; } + .d-xl-inline-block { + display: inline-block !important; } + .d-xl-block { + display: block !important; } + .d-xl-table { + display: table !important; } + .d-xl-table-row { + display: table-row !important; } + .d-xl-table-cell { + display: table-cell !important; } + .d-xl-flex { + display: flex !important; } + .d-xl-inline-flex { + display: inline-flex !important; } } + +@media print { + .d-print-none { + display: none !important; } + .d-print-inline { + display: inline !important; } + .d-print-inline-block { + display: inline-block !important; } + .d-print-block { + display: block !important; } + .d-print-table { + display: table !important; } + .d-print-table-row { + display: table-row !important; } + .d-print-table-cell { + display: table-cell !important; } + .d-print-flex { + display: flex !important; } + .d-print-inline-flex { + display: inline-flex !important; } } + +.embed-responsive { + position: relative; + display: block; + width: 100%; + padding: 0; + overflow: hidden; } + .embed-responsive::before { + display: block; + content: ""; } + .embed-responsive .embed-responsive-item, + .embed-responsive iframe, + .embed-responsive embed, + .embed-responsive object, + .embed-responsive video { + position: absolute; + top: 0; + bottom: 0; + left: 0; + width: 100%; + height: 100%; + border: 0; } + +.embed-responsive-21by9::before { + padding-top: 42.85714%; } + +.embed-responsive-16by9::before { + padding-top: 56.25%; } + +.embed-responsive-4by3::before { + padding-top: 75%; } + +.embed-responsive-1by1::before { + padding-top: 100%; } + +.flex-row { + flex-direction: row !important; } + +.flex-column { + flex-direction: column !important; } + +.flex-row-reverse { + flex-direction: row-reverse !important; } + +.flex-column-reverse { + flex-direction: column-reverse !important; } + +.flex-wrap { + flex-wrap: wrap !important; } + +.flex-nowrap { + flex-wrap: nowrap !important; } + +.flex-wrap-reverse { + flex-wrap: wrap-reverse !important; } + +.justify-content-start { + justify-content: flex-start !important; } + +.justify-content-end { + justify-content: flex-end !important; } + +.justify-content-center { + justify-content: center !important; } + +.justify-content-between { + justify-content: space-between !important; } + +.justify-content-around { + justify-content: space-around !important; } + +.align-items-start { + align-items: flex-start !important; } + +.align-items-end { + align-items: flex-end !important; } + +.align-items-center { + align-items: center !important; } + +.align-items-baseline { + align-items: baseline !important; } + +.align-items-stretch { + align-items: stretch !important; } + +.align-content-start { + align-content: flex-start !important; } + +.align-content-end { + align-content: flex-end !important; } + +.align-content-center { + align-content: center !important; } + +.align-content-between { + align-content: space-between !important; } + +.align-content-around { + align-content: space-around !important; } + +.align-content-stretch { + align-content: stretch !important; } + +.align-self-auto { + align-self: auto !important; } + +.align-self-start { + align-self: flex-start !important; } + +.align-self-end { + align-self: flex-end !important; } + +.align-self-center { + align-self: center !important; } + +.align-self-baseline { + align-self: baseline !important; } + +.align-self-stretch { + align-self: stretch !important; } + +@media (min-width: 576px) { + .flex-sm-row { + flex-direction: row !important; } + .flex-sm-column { + flex-direction: column !important; } + .flex-sm-row-reverse { + flex-direction: row-reverse !important; } + .flex-sm-column-reverse { + flex-direction: column-reverse !important; } + .flex-sm-wrap { + flex-wrap: wrap !important; } + .flex-sm-nowrap { + flex-wrap: nowrap !important; } + .flex-sm-wrap-reverse { + flex-wrap: wrap-reverse !important; } + .justify-content-sm-start { + justify-content: flex-start !important; } + .justify-content-sm-end { + justify-content: flex-end !important; } + .justify-content-sm-center { + justify-content: center !important; } + .justify-content-sm-between { + justify-content: space-between !important; } + .justify-content-sm-around { + justify-content: space-around !important; } + .align-items-sm-start { + align-items: flex-start !important; } + .align-items-sm-end { + align-items: flex-end !important; } + .align-items-sm-center { + align-items: center !important; } + .align-items-sm-baseline { + align-items: baseline !important; } + .align-items-sm-stretch { + align-items: stretch !important; } + .align-content-sm-start { + align-content: flex-start !important; } + .align-content-sm-end { + align-content: flex-end !important; } + .align-content-sm-center { + align-content: center !important; } + .align-content-sm-between { + align-content: space-between !important; } + .align-content-sm-around { + align-content: space-around !important; } + .align-content-sm-stretch { + align-content: stretch !important; } + .align-self-sm-auto { + align-self: auto !important; } + .align-self-sm-start { + align-self: flex-start !important; } + .align-self-sm-end { + align-self: flex-end !important; } + .align-self-sm-center { + align-self: center !important; } + .align-self-sm-baseline { + align-self: baseline !important; } + .align-self-sm-stretch { + align-self: stretch !important; } } + +@media (min-width: 768px) { + .flex-md-row { + flex-direction: row !important; } + .flex-md-column { + flex-direction: column !important; } + .flex-md-row-reverse { + flex-direction: row-reverse !important; } + .flex-md-column-reverse { + flex-direction: column-reverse !important; } + .flex-md-wrap { + flex-wrap: wrap !important; } + .flex-md-nowrap { + flex-wrap: nowrap !important; } + .flex-md-wrap-reverse { + flex-wrap: wrap-reverse !important; } + .justify-content-md-start { + justify-content: flex-start !important; } + .justify-content-md-end { + justify-content: flex-end !important; } + .justify-content-md-center { + justify-content: center !important; } + .justify-content-md-between { + justify-content: space-between !important; } + .justify-content-md-around { + justify-content: space-around !important; } + .align-items-md-start { + align-items: flex-start !important; } + .align-items-md-end { + align-items: flex-end !important; } + .align-items-md-center { + align-items: center !important; } + .align-items-md-baseline { + align-items: baseline !important; } + .align-items-md-stretch { + align-items: stretch !important; } + .align-content-md-start { + align-content: flex-start !important; } + .align-content-md-end { + align-content: flex-end !important; } + .align-content-md-center { + align-content: center !important; } + .align-content-md-between { + align-content: space-between !important; } + .align-content-md-around { + align-content: space-around !important; } + .align-content-md-stretch { + align-content: stretch !important; } + .align-self-md-auto { + align-self: auto !important; } + .align-self-md-start { + align-self: flex-start !important; } + .align-self-md-end { + align-self: flex-end !important; } + .align-self-md-center { + align-self: center !important; } + .align-self-md-baseline { + align-self: baseline !important; } + .align-self-md-stretch { + align-self: stretch !important; } } + +@media (min-width: 992px) { + .flex-lg-row { + flex-direction: row !important; } + .flex-lg-column { + flex-direction: column !important; } + .flex-lg-row-reverse { + flex-direction: row-reverse !important; } + .flex-lg-column-reverse { + flex-direction: column-reverse !important; } + .flex-lg-wrap { + flex-wrap: wrap !important; } + .flex-lg-nowrap { + flex-wrap: nowrap !important; } + .flex-lg-wrap-reverse { + flex-wrap: wrap-reverse !important; } + .justify-content-lg-start { + justify-content: flex-start !important; } + .justify-content-lg-end { + justify-content: flex-end !important; } + .justify-content-lg-center { + justify-content: center !important; } + .justify-content-lg-between { + justify-content: space-between !important; } + .justify-content-lg-around { + justify-content: space-around !important; } + .align-items-lg-start { + align-items: flex-start !important; } + .align-items-lg-end { + align-items: flex-end !important; } + .align-items-lg-center { + align-items: center !important; } + .align-items-lg-baseline { + align-items: baseline !important; } + .align-items-lg-stretch { + align-items: stretch !important; } + .align-content-lg-start { + align-content: flex-start !important; } + .align-content-lg-end { + align-content: flex-end !important; } + .align-content-lg-center { + align-content: center !important; } + .align-content-lg-between { + align-content: space-between !important; } + .align-content-lg-around { + align-content: space-around !important; } + .align-content-lg-stretch { + align-content: stretch !important; } + .align-self-lg-auto { + align-self: auto !important; } + .align-self-lg-start { + align-self: flex-start !important; } + .align-self-lg-end { + align-self: flex-end !important; } + .align-self-lg-center { + align-self: center !important; } + .align-self-lg-baseline { + align-self: baseline !important; } + .align-self-lg-stretch { + align-self: stretch !important; } } + +@media (min-width: 1200px) { + .flex-xl-row { + flex-direction: row !important; } + .flex-xl-column { + flex-direction: column !important; } + .flex-xl-row-reverse { + flex-direction: row-reverse !important; } + .flex-xl-column-reverse { + flex-direction: column-reverse !important; } + .flex-xl-wrap { + flex-wrap: wrap !important; } + .flex-xl-nowrap { + flex-wrap: nowrap !important; } + .flex-xl-wrap-reverse { + flex-wrap: wrap-reverse !important; } + .justify-content-xl-start { + justify-content: flex-start !important; } + .justify-content-xl-end { + justify-content: flex-end !important; } + .justify-content-xl-center { + justify-content: center !important; } + .justify-content-xl-between { + justify-content: space-between !important; } + .justify-content-xl-around { + justify-content: space-around !important; } + .align-items-xl-start { + align-items: flex-start !important; } + .align-items-xl-end { + align-items: flex-end !important; } + .align-items-xl-center { + align-items: center !important; } + .align-items-xl-baseline { + align-items: baseline !important; } + .align-items-xl-stretch { + align-items: stretch !important; } + .align-content-xl-start { + align-content: flex-start !important; } + .align-content-xl-end { + align-content: flex-end !important; } + .align-content-xl-center { + align-content: center !important; } + .align-content-xl-between { + align-content: space-between !important; } + .align-content-xl-around { + align-content: space-around !important; } + .align-content-xl-stretch { + align-content: stretch !important; } + .align-self-xl-auto { + align-self: auto !important; } + .align-self-xl-start { + align-self: flex-start !important; } + .align-self-xl-end { + align-self: flex-end !important; } + .align-self-xl-center { + align-self: center !important; } + .align-self-xl-baseline { + align-self: baseline !important; } + .align-self-xl-stretch { + align-self: stretch !important; } } + +.float-left { + float: left !important; } + +.float-right { + float: right !important; } + +.float-none { + float: none !important; } + +@media (min-width: 576px) { + .float-sm-left { + float: left !important; } + .float-sm-right { + float: right !important; } + .float-sm-none { + float: none !important; } } + +@media (min-width: 768px) { + .float-md-left { + float: left !important; } + .float-md-right { + float: right !important; } + .float-md-none { + float: none !important; } } + +@media (min-width: 992px) { + .float-lg-left { + float: left !important; } + .float-lg-right { + float: right !important; } + .float-lg-none { + float: none !important; } } + +@media (min-width: 1200px) { + .float-xl-left { + float: left !important; } + .float-xl-right { + float: right !important; } + .float-xl-none { + float: none !important; } } + +.position-static { + position: static !important; } + +.position-relative { + position: relative !important; } + +.position-absolute { + position: absolute !important; } + +.position-fixed { + position: fixed !important; } + +.position-sticky { + position: -webkit-sticky !important; + position: sticky !important; } + +.fixed-top { + position: fixed; + top: 0; + right: 0; + left: 0; + z-index: 1030; } + +.fixed-bottom { + position: fixed; + right: 0; + bottom: 0; + left: 0; + z-index: 1030; } + +@supports ((position: -webkit-sticky) or (position: sticky)) { + .sticky-top { + position: -webkit-sticky; + position: sticky; + top: 0; + z-index: 1020; } } + +.sr-only { + position: absolute; + width: 1px; + height: 1px; + padding: 0; + overflow: hidden; + clip: rect(0, 0, 0, 0); + white-space: nowrap; + -webkit-clip-path: inset(50%); + clip-path: inset(50%); + border: 0; } + +.sr-only-focusable:active, .sr-only-focusable:focus { + position: static; + width: auto; + height: auto; + overflow: visible; + clip: auto; + white-space: normal; + -webkit-clip-path: none; + clip-path: none; } + +.w-25 { + width: 25% !important; } + +.w-50 { + width: 50% !important; } + +.w-75 { + width: 75% !important; } + +.w-100 { + width: 100% !important; } + +.h-25 { + height: 25% !important; } + +.h-50 { + height: 50% !important; } + +.h-75 { + height: 75% !important; } + +.h-100 { + height: 100% !important; } + +.mw-100 { + max-width: 100% !important; } + +.mh-100 { + max-height: 100% !important; } + +.m-0 { + margin: 0 !important; } + +.mt-0, +.my-0 { + margin-top: 0 !important; } + +.mr-0, +.mx-0 { + margin-right: 0 !important; } + +.mb-0, +.my-0 { + margin-bottom: 0 !important; } + +.ml-0, +.mx-0 { + margin-left: 0 !important; } + +.m-1 { + margin: 0.375rem !important; } + +.mt-1, +.my-1 { + margin-top: 0.375rem !important; } + +.mr-1, +.mx-1 { + margin-right: 0.375rem !important; } + +.mb-1, +.my-1 { + margin-bottom: 0.375rem !important; } + +.ml-1, +.mx-1 { + margin-left: 0.375rem !important; } + +.m-2 { + margin: 0.75rem !important; } + +.mt-2, +.my-2 { + margin-top: 0.75rem !important; } + +.mr-2, +.mx-2 { + margin-right: 0.75rem !important; } + +.mb-2, +.my-2 { + margin-bottom: 0.75rem !important; } + +.ml-2, +.mx-2 { + margin-left: 0.75rem !important; } + +.m-3 { + margin: 1.5rem !important; } + +.mt-3, +.my-3 { + margin-top: 1.5rem !important; } + +.mr-3, +.mx-3 { + margin-right: 1.5rem !important; } + +.mb-3, +.my-3 { + margin-bottom: 1.5rem !important; } + +.ml-3, +.mx-3 { + margin-left: 1.5rem !important; } + +.m-4 { + margin: 2.25rem !important; } + +.mt-4, +.my-4 { + margin-top: 2.25rem !important; } + +.mr-4, +.mx-4 { + margin-right: 2.25rem !important; } + +.mb-4, +.my-4 { + margin-bottom: 2.25rem !important; } + +.ml-4, +.mx-4 { + margin-left: 2.25rem !important; } + +.m-5 { + margin: 4.5rem !important; } + +.mt-5, +.my-5 { + margin-top: 4.5rem !important; } + +.mr-5, +.mx-5 { + margin-right: 4.5rem !important; } + +.mb-5, +.my-5 { + margin-bottom: 4.5rem !important; } + +.ml-5, +.mx-5 { + margin-left: 4.5rem !important; } + +.p-0 { + padding: 0 !important; } + +.pt-0, +.py-0 { + padding-top: 0 !important; } + +.pr-0, +.px-0 { + padding-right: 0 !important; } + +.pb-0, +.py-0 { + padding-bottom: 0 !important; } + +.pl-0, +.px-0 { + padding-left: 0 !important; } + +.p-1 { + padding: 0.375rem !important; } + +.pt-1, +.py-1 { + padding-top: 0.375rem !important; } + +.pr-1, +.px-1 { + padding-right: 0.375rem !important; } + +.pb-1, +.py-1 { + padding-bottom: 0.375rem !important; } + +.pl-1, +.px-1 { + padding-left: 0.375rem !important; } + +.p-2 { + padding: 0.75rem !important; } + +.pt-2, +.py-2 { + padding-top: 0.75rem !important; } + +.pr-2, +.px-2 { + padding-right: 0.75rem !important; } + +.pb-2, +.py-2 { + padding-bottom: 0.75rem !important; } + +.pl-2, +.px-2 { + padding-left: 0.75rem !important; } + +.p-3 { + padding: 1.5rem !important; } + +.pt-3, +.py-3 { + padding-top: 1.5rem !important; } + +.pr-3, +.px-3 { + padding-right: 1.5rem !important; } + +.pb-3, +.py-3 { + padding-bottom: 1.5rem !important; } + +.pl-3, +.px-3 { + padding-left: 1.5rem !important; } + +.p-4 { + padding: 2.25rem !important; } + +.pt-4, +.py-4 { + padding-top: 2.25rem !important; } + +.pr-4, +.px-4 { + padding-right: 2.25rem !important; } + +.pb-4, +.py-4 { + padding-bottom: 2.25rem !important; } + +.pl-4, +.px-4 { + padding-left: 2.25rem !important; } + +.p-5 { + padding: 4.5rem !important; } + +.pt-5, +.py-5 { + padding-top: 4.5rem !important; } + +.pr-5, +.px-5 { + padding-right: 4.5rem !important; } + +.pb-5, +.py-5 { + padding-bottom: 4.5rem !important; } + +.pl-5, +.px-5 { + padding-left: 4.5rem !important; } + +.m-auto { + margin: auto !important; } + +.mt-auto, +.my-auto { + margin-top: auto !important; } + +.mr-auto, +.mx-auto { + margin-right: auto !important; } + +.mb-auto, +.my-auto { + margin-bottom: auto !important; } + +.ml-auto, +.mx-auto { + margin-left: auto !important; } + +@media (min-width: 576px) { + .m-sm-0 { + margin: 0 !important; } + .mt-sm-0, + .my-sm-0 { + margin-top: 0 !important; } + .mr-sm-0, + .mx-sm-0 { + margin-right: 0 !important; } + .mb-sm-0, + .my-sm-0 { + margin-bottom: 0 !important; } + .ml-sm-0, + .mx-sm-0 { + margin-left: 0 !important; } + .m-sm-1 { + margin: 0.375rem !important; } + .mt-sm-1, + .my-sm-1 { + margin-top: 0.375rem !important; } + .mr-sm-1, + .mx-sm-1 { + margin-right: 0.375rem !important; } + .mb-sm-1, + .my-sm-1 { + margin-bottom: 0.375rem !important; } + .ml-sm-1, + .mx-sm-1 { + margin-left: 0.375rem !important; } + .m-sm-2 { + margin: 0.75rem !important; } + .mt-sm-2, + .my-sm-2 { + margin-top: 0.75rem !important; } + .mr-sm-2, + .mx-sm-2 { + margin-right: 0.75rem !important; } + .mb-sm-2, + .my-sm-2 { + margin-bottom: 0.75rem !important; } + .ml-sm-2, + .mx-sm-2 { + margin-left: 0.75rem !important; } + .m-sm-3 { + margin: 1.5rem !important; } + .mt-sm-3, + .my-sm-3 { + margin-top: 1.5rem !important; } + .mr-sm-3, + .mx-sm-3 { + margin-right: 1.5rem !important; } + .mb-sm-3, + .my-sm-3 { + margin-bottom: 1.5rem !important; } + .ml-sm-3, + .mx-sm-3 { + margin-left: 1.5rem !important; } + .m-sm-4 { + margin: 2.25rem !important; } + .mt-sm-4, + .my-sm-4 { + margin-top: 2.25rem !important; } + .mr-sm-4, + .mx-sm-4 { + margin-right: 2.25rem !important; } + .mb-sm-4, + .my-sm-4 { + margin-bottom: 2.25rem !important; } + .ml-sm-4, + .mx-sm-4 { + margin-left: 2.25rem !important; } + .m-sm-5 { + margin: 4.5rem !important; } + .mt-sm-5, + .my-sm-5 { + margin-top: 4.5rem !important; } + .mr-sm-5, + .mx-sm-5 { + margin-right: 4.5rem !important; } + .mb-sm-5, + .my-sm-5 { + margin-bottom: 4.5rem !important; } + .ml-sm-5, + .mx-sm-5 { + margin-left: 4.5rem !important; } + .p-sm-0 { + padding: 0 !important; } + .pt-sm-0, + .py-sm-0 { + padding-top: 0 !important; } + .pr-sm-0, + .px-sm-0 { + padding-right: 0 !important; } + .pb-sm-0, + .py-sm-0 { + padding-bottom: 0 !important; } + .pl-sm-0, + .px-sm-0 { + padding-left: 0 !important; } + .p-sm-1 { + padding: 0.375rem !important; } + .pt-sm-1, + .py-sm-1 { + padding-top: 0.375rem !important; } + .pr-sm-1, + .px-sm-1 { + padding-right: 0.375rem !important; } + .pb-sm-1, + .py-sm-1 { + padding-bottom: 0.375rem !important; } + .pl-sm-1, + .px-sm-1 { + padding-left: 0.375rem !important; } + .p-sm-2 { + padding: 0.75rem !important; } + .pt-sm-2, + .py-sm-2 { + padding-top: 0.75rem !important; } + .pr-sm-2, + .px-sm-2 { + padding-right: 0.75rem !important; } + .pb-sm-2, + .py-sm-2 { + padding-bottom: 0.75rem !important; } + .pl-sm-2, + .px-sm-2 { + padding-left: 0.75rem !important; } + .p-sm-3 { + padding: 1.5rem !important; } + .pt-sm-3, + .py-sm-3 { + padding-top: 1.5rem !important; } + .pr-sm-3, + .px-sm-3 { + padding-right: 1.5rem !important; } + .pb-sm-3, + .py-sm-3 { + padding-bottom: 1.5rem !important; } + .pl-sm-3, + .px-sm-3 { + padding-left: 1.5rem !important; } + .p-sm-4 { + padding: 2.25rem !important; } + .pt-sm-4, + .py-sm-4 { + padding-top: 2.25rem !important; } + .pr-sm-4, + .px-sm-4 { + padding-right: 2.25rem !important; } + .pb-sm-4, + .py-sm-4 { + padding-bottom: 2.25rem !important; } + .pl-sm-4, + .px-sm-4 { + padding-left: 2.25rem !important; } + .p-sm-5 { + padding: 4.5rem !important; } + .pt-sm-5, + .py-sm-5 { + padding-top: 4.5rem !important; } + .pr-sm-5, + .px-sm-5 { + padding-right: 4.5rem !important; } + .pb-sm-5, + .py-sm-5 { + padding-bottom: 4.5rem !important; } + .pl-sm-5, + .px-sm-5 { + padding-left: 4.5rem !important; } + .m-sm-auto { + margin: auto !important; } + .mt-sm-auto, + .my-sm-auto { + margin-top: auto !important; } + .mr-sm-auto, + .mx-sm-auto { + margin-right: auto !important; } + .mb-sm-auto, + .my-sm-auto { + margin-bottom: auto !important; } + .ml-sm-auto, + .mx-sm-auto { + margin-left: auto !important; } } + +@media (min-width: 768px) { + .m-md-0 { + margin: 0 !important; } + .mt-md-0, + .my-md-0 { + margin-top: 0 !important; } + .mr-md-0, + .mx-md-0 { + margin-right: 0 !important; } + .mb-md-0, + .my-md-0 { + margin-bottom: 0 !important; } + .ml-md-0, + .mx-md-0 { + margin-left: 0 !important; } + .m-md-1 { + margin: 0.375rem !important; } + .mt-md-1, + .my-md-1 { + margin-top: 0.375rem !important; } + .mr-md-1, + .mx-md-1 { + margin-right: 0.375rem !important; } + .mb-md-1, + .my-md-1 { + margin-bottom: 0.375rem !important; } + .ml-md-1, + .mx-md-1 { + margin-left: 0.375rem !important; } + .m-md-2 { + margin: 0.75rem !important; } + .mt-md-2, + .my-md-2 { + margin-top: 0.75rem !important; } + .mr-md-2, + .mx-md-2 { + margin-right: 0.75rem !important; } + .mb-md-2, + .my-md-2 { + margin-bottom: 0.75rem !important; } + .ml-md-2, + .mx-md-2 { + margin-left: 0.75rem !important; } + .m-md-3 { + margin: 1.5rem !important; } + .mt-md-3, + .my-md-3 { + margin-top: 1.5rem !important; } + .mr-md-3, + .mx-md-3 { + margin-right: 1.5rem !important; } + .mb-md-3, + .my-md-3 { + margin-bottom: 1.5rem !important; } + .ml-md-3, + .mx-md-3 { + margin-left: 1.5rem !important; } + .m-md-4 { + margin: 2.25rem !important; } + .mt-md-4, + .my-md-4 { + margin-top: 2.25rem !important; } + .mr-md-4, + .mx-md-4 { + margin-right: 2.25rem !important; } + .mb-md-4, + .my-md-4 { + margin-bottom: 2.25rem !important; } + .ml-md-4, + .mx-md-4 { + margin-left: 2.25rem !important; } + .m-md-5 { + margin: 4.5rem !important; } + .mt-md-5, + .my-md-5 { + margin-top: 4.5rem !important; } + .mr-md-5, + .mx-md-5 { + margin-right: 4.5rem !important; } + .mb-md-5, + .my-md-5 { + margin-bottom: 4.5rem !important; } + .ml-md-5, + .mx-md-5 { + margin-left: 4.5rem !important; } + .p-md-0 { + padding: 0 !important; } + .pt-md-0, + .py-md-0 { + padding-top: 0 !important; } + .pr-md-0, + .px-md-0 { + padding-right: 0 !important; } + .pb-md-0, + .py-md-0 { + padding-bottom: 0 !important; } + .pl-md-0, + .px-md-0 { + padding-left: 0 !important; } + .p-md-1 { + padding: 0.375rem !important; } + .pt-md-1, + .py-md-1 { + padding-top: 0.375rem !important; } + .pr-md-1, + .px-md-1 { + padding-right: 0.375rem !important; } + .pb-md-1, + .py-md-1 { + padding-bottom: 0.375rem !important; } + .pl-md-1, + .px-md-1 { + padding-left: 0.375rem !important; } + .p-md-2 { + padding: 0.75rem !important; } + .pt-md-2, + .py-md-2 { + padding-top: 0.75rem !important; } + .pr-md-2, + .px-md-2 { + padding-right: 0.75rem !important; } + .pb-md-2, + .py-md-2 { + padding-bottom: 0.75rem !important; } + .pl-md-2, + .px-md-2 { + padding-left: 0.75rem !important; } + .p-md-3 { + padding: 1.5rem !important; } + .pt-md-3, + .py-md-3 { + padding-top: 1.5rem !important; } + .pr-md-3, + .px-md-3 { + padding-right: 1.5rem !important; } + .pb-md-3, + .py-md-3 { + padding-bottom: 1.5rem !important; } + .pl-md-3, + .px-md-3 { + padding-left: 1.5rem !important; } + .p-md-4 { + padding: 2.25rem !important; } + .pt-md-4, + .py-md-4 { + padding-top: 2.25rem !important; } + .pr-md-4, + .px-md-4 { + padding-right: 2.25rem !important; } + .pb-md-4, + .py-md-4 { + padding-bottom: 2.25rem !important; } + .pl-md-4, + .px-md-4 { + padding-left: 2.25rem !important; } + .p-md-5 { + padding: 4.5rem !important; } + .pt-md-5, + .py-md-5 { + padding-top: 4.5rem !important; } + .pr-md-5, + .px-md-5 { + padding-right: 4.5rem !important; } + .pb-md-5, + .py-md-5 { + padding-bottom: 4.5rem !important; } + .pl-md-5, + .px-md-5 { + padding-left: 4.5rem !important; } + .m-md-auto { + margin: auto !important; } + .mt-md-auto, + .my-md-auto { + margin-top: auto !important; } + .mr-md-auto, + .mx-md-auto { + margin-right: auto !important; } + .mb-md-auto, + .my-md-auto { + margin-bottom: auto !important; } + .ml-md-auto, + .mx-md-auto { + margin-left: auto !important; } } + +@media (min-width: 992px) { + .m-lg-0 { + margin: 0 !important; } + .mt-lg-0, + .my-lg-0 { + margin-top: 0 !important; } + .mr-lg-0, + .mx-lg-0 { + margin-right: 0 !important; } + .mb-lg-0, + .my-lg-0 { + margin-bottom: 0 !important; } + .ml-lg-0, + .mx-lg-0 { + margin-left: 0 !important; } + .m-lg-1 { + margin: 0.375rem !important; } + .mt-lg-1, + .my-lg-1 { + margin-top: 0.375rem !important; } + .mr-lg-1, + .mx-lg-1 { + margin-right: 0.375rem !important; } + .mb-lg-1, + .my-lg-1 { + margin-bottom: 0.375rem !important; } + .ml-lg-1, + .mx-lg-1 { + margin-left: 0.375rem !important; } + .m-lg-2 { + margin: 0.75rem !important; } + .mt-lg-2, + .my-lg-2 { + margin-top: 0.75rem !important; } + .mr-lg-2, + .mx-lg-2 { + margin-right: 0.75rem !important; } + .mb-lg-2, + .my-lg-2 { + margin-bottom: 0.75rem !important; } + .ml-lg-2, + .mx-lg-2 { + margin-left: 0.75rem !important; } + .m-lg-3 { + margin: 1.5rem !important; } + .mt-lg-3, + .my-lg-3 { + margin-top: 1.5rem !important; } + .mr-lg-3, + .mx-lg-3 { + margin-right: 1.5rem !important; } + .mb-lg-3, + .my-lg-3 { + margin-bottom: 1.5rem !important; } + .ml-lg-3, + .mx-lg-3 { + margin-left: 1.5rem !important; } + .m-lg-4 { + margin: 2.25rem !important; } + .mt-lg-4, + .my-lg-4 { + margin-top: 2.25rem !important; } + .mr-lg-4, + .mx-lg-4 { + margin-right: 2.25rem !important; } + .mb-lg-4, + .my-lg-4 { + margin-bottom: 2.25rem !important; } + .ml-lg-4, + .mx-lg-4 { + margin-left: 2.25rem !important; } + .m-lg-5 { + margin: 4.5rem !important; } + .mt-lg-5, + .my-lg-5 { + margin-top: 4.5rem !important; } + .mr-lg-5, + .mx-lg-5 { + margin-right: 4.5rem !important; } + .mb-lg-5, + .my-lg-5 { + margin-bottom: 4.5rem !important; } + .ml-lg-5, + .mx-lg-5 { + margin-left: 4.5rem !important; } + .p-lg-0 { + padding: 0 !important; } + .pt-lg-0, + .py-lg-0 { + padding-top: 0 !important; } + .pr-lg-0, + .px-lg-0 { + padding-right: 0 !important; } + .pb-lg-0, + .py-lg-0 { + padding-bottom: 0 !important; } + .pl-lg-0, + .px-lg-0 { + padding-left: 0 !important; } + .p-lg-1 { + padding: 0.375rem !important; } + .pt-lg-1, + .py-lg-1 { + padding-top: 0.375rem !important; } + .pr-lg-1, + .px-lg-1 { + padding-right: 0.375rem !important; } + .pb-lg-1, + .py-lg-1 { + padding-bottom: 0.375rem !important; } + .pl-lg-1, + .px-lg-1 { + padding-left: 0.375rem !important; } + .p-lg-2 { + padding: 0.75rem !important; } + .pt-lg-2, + .py-lg-2 { + padding-top: 0.75rem !important; } + .pr-lg-2, + .px-lg-2 { + padding-right: 0.75rem !important; } + .pb-lg-2, + .py-lg-2 { + padding-bottom: 0.75rem !important; } + .pl-lg-2, + .px-lg-2 { + padding-left: 0.75rem !important; } + .p-lg-3 { + padding: 1.5rem !important; } + .pt-lg-3, + .py-lg-3 { + padding-top: 1.5rem !important; } + .pr-lg-3, + .px-lg-3 { + padding-right: 1.5rem !important; } + .pb-lg-3, + .py-lg-3 { + padding-bottom: 1.5rem !important; } + .pl-lg-3, + .px-lg-3 { + padding-left: 1.5rem !important; } + .p-lg-4 { + padding: 2.25rem !important; } + .pt-lg-4, + .py-lg-4 { + padding-top: 2.25rem !important; } + .pr-lg-4, + .px-lg-4 { + padding-right: 2.25rem !important; } + .pb-lg-4, + .py-lg-4 { + padding-bottom: 2.25rem !important; } + .pl-lg-4, + .px-lg-4 { + padding-left: 2.25rem !important; } + .p-lg-5 { + padding: 4.5rem !important; } + .pt-lg-5, + .py-lg-5 { + padding-top: 4.5rem !important; } + .pr-lg-5, + .px-lg-5 { + padding-right: 4.5rem !important; } + .pb-lg-5, + .py-lg-5 { + padding-bottom: 4.5rem !important; } + .pl-lg-5, + .px-lg-5 { + padding-left: 4.5rem !important; } + .m-lg-auto { + margin: auto !important; } + .mt-lg-auto, + .my-lg-auto { + margin-top: auto !important; } + .mr-lg-auto, + .mx-lg-auto { + margin-right: auto !important; } + .mb-lg-auto, + .my-lg-auto { + margin-bottom: auto !important; } + .ml-lg-auto, + .mx-lg-auto { + margin-left: auto !important; } } + +@media (min-width: 1200px) { + .m-xl-0 { + margin: 0 !important; } + .mt-xl-0, + .my-xl-0 { + margin-top: 0 !important; } + .mr-xl-0, + .mx-xl-0 { + margin-right: 0 !important; } + .mb-xl-0, + .my-xl-0 { + margin-bottom: 0 !important; } + .ml-xl-0, + .mx-xl-0 { + margin-left: 0 !important; } + .m-xl-1 { + margin: 0.375rem !important; } + .mt-xl-1, + .my-xl-1 { + margin-top: 0.375rem !important; } + .mr-xl-1, + .mx-xl-1 { + margin-right: 0.375rem !important; } + .mb-xl-1, + .my-xl-1 { + margin-bottom: 0.375rem !important; } + .ml-xl-1, + .mx-xl-1 { + margin-left: 0.375rem !important; } + .m-xl-2 { + margin: 0.75rem !important; } + .mt-xl-2, + .my-xl-2 { + margin-top: 0.75rem !important; } + .mr-xl-2, + .mx-xl-2 { + margin-right: 0.75rem !important; } + .mb-xl-2, + .my-xl-2 { + margin-bottom: 0.75rem !important; } + .ml-xl-2, + .mx-xl-2 { + margin-left: 0.75rem !important; } + .m-xl-3 { + margin: 1.5rem !important; } + .mt-xl-3, + .my-xl-3 { + margin-top: 1.5rem !important; } + .mr-xl-3, + .mx-xl-3 { + margin-right: 1.5rem !important; } + .mb-xl-3, + .my-xl-3 { + margin-bottom: 1.5rem !important; } + .ml-xl-3, + .mx-xl-3 { + margin-left: 1.5rem !important; } + .m-xl-4 { + margin: 2.25rem !important; } + .mt-xl-4, + .my-xl-4 { + margin-top: 2.25rem !important; } + .mr-xl-4, + .mx-xl-4 { + margin-right: 2.25rem !important; } + .mb-xl-4, + .my-xl-4 { + margin-bottom: 2.25rem !important; } + .ml-xl-4, + .mx-xl-4 { + margin-left: 2.25rem !important; } + .m-xl-5 { + margin: 4.5rem !important; } + .mt-xl-5, + .my-xl-5 { + margin-top: 4.5rem !important; } + .mr-xl-5, + .mx-xl-5 { + margin-right: 4.5rem !important; } + .mb-xl-5, + .my-xl-5 { + margin-bottom: 4.5rem !important; } + .ml-xl-5, + .mx-xl-5 { + margin-left: 4.5rem !important; } + .p-xl-0 { + padding: 0 !important; } + .pt-xl-0, + .py-xl-0 { + padding-top: 0 !important; } + .pr-xl-0, + .px-xl-0 { + padding-right: 0 !important; } + .pb-xl-0, + .py-xl-0 { + padding-bottom: 0 !important; } + .pl-xl-0, + .px-xl-0 { + padding-left: 0 !important; } + .p-xl-1 { + padding: 0.375rem !important; } + .pt-xl-1, + .py-xl-1 { + padding-top: 0.375rem !important; } + .pr-xl-1, + .px-xl-1 { + padding-right: 0.375rem !important; } + .pb-xl-1, + .py-xl-1 { + padding-bottom: 0.375rem !important; } + .pl-xl-1, + .px-xl-1 { + padding-left: 0.375rem !important; } + .p-xl-2 { + padding: 0.75rem !important; } + .pt-xl-2, + .py-xl-2 { + padding-top: 0.75rem !important; } + .pr-xl-2, + .px-xl-2 { + padding-right: 0.75rem !important; } + .pb-xl-2, + .py-xl-2 { + padding-bottom: 0.75rem !important; } + .pl-xl-2, + .px-xl-2 { + padding-left: 0.75rem !important; } + .p-xl-3 { + padding: 1.5rem !important; } + .pt-xl-3, + .py-xl-3 { + padding-top: 1.5rem !important; } + .pr-xl-3, + .px-xl-3 { + padding-right: 1.5rem !important; } + .pb-xl-3, + .py-xl-3 { + padding-bottom: 1.5rem !important; } + .pl-xl-3, + .px-xl-3 { + padding-left: 1.5rem !important; } + .p-xl-4 { + padding: 2.25rem !important; } + .pt-xl-4, + .py-xl-4 { + padding-top: 2.25rem !important; } + .pr-xl-4, + .px-xl-4 { + padding-right: 2.25rem !important; } + .pb-xl-4, + .py-xl-4 { + padding-bottom: 2.25rem !important; } + .pl-xl-4, + .px-xl-4 { + padding-left: 2.25rem !important; } + .p-xl-5 { + padding: 4.5rem !important; } + .pt-xl-5, + .py-xl-5 { + padding-top: 4.5rem !important; } + .pr-xl-5, + .px-xl-5 { + padding-right: 4.5rem !important; } + .pb-xl-5, + .py-xl-5 { + padding-bottom: 4.5rem !important; } + .pl-xl-5, + .px-xl-5 { + padding-left: 4.5rem !important; } + .m-xl-auto { + margin: auto !important; } + .mt-xl-auto, + .my-xl-auto { + margin-top: auto !important; } + .mr-xl-auto, + .mx-xl-auto { + margin-right: auto !important; } + .mb-xl-auto, + .my-xl-auto { + margin-bottom: auto !important; } + .ml-xl-auto, + .mx-xl-auto { + margin-left: auto !important; } } + +.text-justify { + text-align: justify !important; } + +.text-nowrap { + white-space: nowrap !important; } + +.text-truncate { + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; } + +.text-left { + text-align: left !important; } + +.text-right { + text-align: right !important; } + +.text-center { + text-align: center !important; } + +@media (min-width: 576px) { + .text-sm-left { + text-align: left !important; } + .text-sm-right { + text-align: right !important; } + .text-sm-center { + text-align: center !important; } } + +@media (min-width: 768px) { + .text-md-left { + text-align: left !important; } + .text-md-right { + text-align: right !important; } + .text-md-center { + text-align: center !important; } } + +@media (min-width: 992px) { + .text-lg-left { + text-align: left !important; } + .text-lg-right { + text-align: right !important; } + .text-lg-center { + text-align: center !important; } } + +@media (min-width: 1200px) { + .text-xl-left { + text-align: left !important; } + .text-xl-right { + text-align: right !important; } + .text-xl-center { + text-align: center !important; } } + +.text-lowercase { + text-transform: lowercase !important; } + +.text-uppercase { + text-transform: uppercase !important; } + +.text-capitalize { + text-transform: capitalize !important; } + +.font-weight-light { + font-weight: 300 !important; } + +.font-weight-normal { + font-weight: 200 !important; } + +.font-weight-bold { + font-weight: 700 !important; } + +.font-italic { + font-style: italic !important; } + +.text-white { + color: #fff !important; } + +.text-primary { + color: #1f3333 !important; } + +a.text-primary:hover, a.text-primary:focus { + color: #0c1313 !important; } + +.text-secondary { + color: #1b807c !important; } + +a.text-secondary:hover, a.text-secondary:focus { + color: #125653 !important; } + +.text-success { + color: #28a745 !important; } + +a.text-success:hover, a.text-success:focus { + color: #1e7e34 !important; } + +.text-info { + color: #ccc !important; } + +a.text-info:hover, a.text-info:focus { + color: #b3b3b3 !important; } + +.text-warning { + color: #ffc107 !important; } + +a.text-warning:hover, a.text-warning:focus { + color: #d39e00 !important; } + +.text-danger { + color: #dc3545 !important; } + +a.text-danger:hover, a.text-danger:focus { + color: #bd2130 !important; } + +.text-light { + color: #f3f3f3 !important; } + +a.text-light:hover, a.text-light:focus { + color: #dadada !important; } + +.text-dark { + color: #151515 !important; } + +a.text-dark:hover, a.text-dark:focus { + color: black !important; } + +.text-muted { + color: #6c757d !important; } + +.text-hide { + font: 0/0 a; + color: transparent; + text-shadow: none; + background-color: transparent; + border: 0; } + +.visible { + visibility: visible !important; } + +.invisible { + visibility: hidden !important; } + +@media print { + *, + *::before, + *::after { + text-shadow: none !important; + box-shadow: none !important; } + a:not(.btn) { + text-decoration: underline; } + abbr[title]::after { + content: " (" attr(title) ")"; } + pre { + white-space: pre-wrap !important; } + pre, + blockquote { + border: 1px solid #999; + page-break-inside: avoid; } + thead { + display: table-header-group; } + tr, + img { + page-break-inside: avoid; } + p, + h2, + h3 { + orphans: 3; + widows: 3; } + h2, + h3 { + page-break-after: avoid; } + @page { + size: a3; } + body { + min-width: 992px !important; } + .container { + min-width: 992px !important; } + .navbar { + display: none; } + .badge { + border: 1px solid #000; } + .table { + border-collapse: collapse !important; } + .table td, + .table th { + background-color: #fff !important; } + .table-bordered th, + .table-bordered td { + border: 1px solid #ddd !important; } } + +html, body { + height: 100%; } + +.afterSignup { + display: none; } + +.siteText { + font-size: 1.0em; } + +b, +strong { + font-weight: 700; } + +.row.text-center > div { + display: inline-block; + float: none; } diff --git a/search/search_index.json b/search/search_index.json new file mode 100644 index 00000000..1a1331bd --- /dev/null +++ b/search/search_index.json @@ -0,0 +1 @@ +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":"News, community, and courses for people building AI-powered products. Sign up for our latest course! Building an AI-powered product is much more than just training a model or writing a prompt.

The Full Stack brings people together to learn and share best practices across the entire lifecycle of an AI-powered product: from defining the problem and picking a GPU or foundation model to production deployment and continual learning to user experience design.

Get up to speed on the latest in AI-powered apps with the new Large Language Models Bootcamp.

Learn best practices and tools for building applications powered by LLMs.

Cover the full stack from prompt engineering and LLMops to user experience design.

Build an AI-powered application from the ground up in our Deep Learning Course.

You've trained your first (or 100th) model, and you're ready to take your skills to the next level.

Join thousands from UC Berkeley, University of Washington, and all over the world and learn best practices for building AI-powered products from scratch with deep neural networks.

"},{"location":"blog/","title":"The Full Stack Blog","text":"

Call for posts!

We're just getting started with blogging, as we branch out from courses and live events.

Contact us via email (team at fullstackdeeplearning dot com), via Twitter DM, or message charles_irl on Discord if you're interested in contributing!

{{ blog_content }}

"},{"location":"blog/posts/_template/","title":"TITLE GOES HERE","text":"

By YOUR NAME HERE

","tags":["tags","describe","content"]},{"location":"blog/posts/_template/#header-here","title":"Header here","text":"","tags":["tags","describe","content"]},{"location":"blog/posts/running-llm-glm-130b/","title":"Vanilla GPT-3 quality from an open source model on a single machine: GLM-130B","text":"

By Charles Frye.

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#tldr","title":"tl;dr","text":"
  • GLM-130B is a GPT-3-scale and quality language model that can run on a single 8xA100 node without too much pain. Kudos to Tang Jie and the Tsinghua KEG team for open-sourcing a big, powerful model and the tricks it takes to make it run on reasonable hardware.
  • Results are roughly what you might expect after reading the paper: similar to the original GPT-3 175B, worse than the InstructGPTs.
  • I've really been spoiled by OpenAI's latest models: easier to prompt, higher quality generations.

And

  • It's hard to self-serve LLM inferences cheaper than OpenAI will sell them to you.
","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#context","title":"Context","text":"

This is a brief report on a day's worth of hacking with GLM-130B.

While I've worked a lot with a variety of DNNs, including transformers, and regularly discuss LLM training and inference with experts, I am not an LLM expert myself. I encourage you to #DYOR to evaluate this or similar models.

I was looking for a model that was able to do freeform generation of natural language while still understanding source code both syntactically and semantically.

Also, I was just doing this for the experience! Running an LLM is its own kind of workload that's a different beast even from training DNNs on multi-GPU machines.

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#why-run-an-llm-on-a-single-machine","title":"Why run an LLM on a single machine?","text":"

The obvious option for language generation tasks, including the ones I'm interested in, is OpenAI's API, and indeed the text-davinci-002 and -003 models have the capabilities I require.

But I wanted something self-serve. As a community, we should be cautious about centralizing on privately-owned services in the way that has harmed the search engine and social media branches of the technology industry, to the entire industry's detriment.

I tried the openly-available models on HF, e.g. FLAN-T5-XXL, but couldn't get reasonable free-form generation quality out of them.

So I followed up on a suggestion from a Twitter thread from a week ago and checked out GLM-130B from the Tsinghua University Data Mining group, THUDM.

They report promising results in their paper and the weights are publicly avaiable (behind a signup form).

You can try it on Hugging Face here.

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#what-does-it-mean-to-run-on-one-machine","title":"What does it mean to run on one machine?","text":"

When running inference for any neural network, including large language models, we combine numerical parameter arrays with numerical input arrays, primarily via matrix multiplications and vector additions.

So we need a hardware accelerator for matrix multiplications that can store the parameter arrays and mmadd them to inputs and to the results of previous calculations, along with some other array math operations. The typical choice is an NVIDIA GPU.

GLM-130B has 130 billion parameters, so at two bytes per parameter we'll need 260GB of GPU VRAM just to load the weights.

Inference also requires VRAM, so we'll add another ~25% overhead, putting us at ~320 GB.

That's not fitting in one card. The current SotA for generally-available NVIDIA GPUs is 80GB (A100 80GB), and will remain at 80 in the next generation (H100 80GB).

Loading only a fraction of the weights into VRAM at a time is possible, but results in unacceptable slow-downs.

With sufficient effort, the 16 bit floating point parameters can be replaced with 4 bit integers. The versions of these methods used in GLM-130B reduce the total inference-time VRAM load down to 88 GB -- just a hair too big for one card.

Aside: That means we can't go serverless because most serverless GPU inference services (like banana, Beam, and Replicate) operate at the single card level. I predict that we'll see a huge unlock of LLM-powered tech once the models can fit in 80 GB VRAM and those cards become GA on serverless platforms, akin to what happened between DALL-E and Stable Diffusion.

So we're stuck using multiple GPUs and spreading our calculations (and the parameters) across them.

Good news: if we go multi-GPU, we don't need the priciest GPUs! If you put 8 40GB cards on one machine, you've got 320 GB. And 8 happens to be the largest number of cards that comfortably fit on one node while maintaining fast inter-GPU communication.

The 40 GB A100s are much easier to find in public clouds, if not quite easy.

I chose LambdaLabs, which offers some of the cheapest on-demand machines on the market, at less than a third the price of AWS.

You can compare LambdaLabs' offerings to other public clouds and to serverless providers in an interactive spreadsheet on the Full Stack Deep Learning website here.

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#acquiring-an-8xa100-machine-on-lambdalabs-gpu-cloud","title":"Acquiring an 8xA100 machine on LambdaLabs' GPU Cloud","text":"

The GPU shortage is real! Even dedicated GPU clouds have limited availability these day.

For now (January 2023), it is effectively impossible to find 8xA100 machines in the LambdaLabs' cloud that have access to persistent storage.

For my short experiments, that wasn't a dealbreaker: I just set the node up and downloaded weights and data as needed, without worrying about costs or complexity of recreating the setup.

But expect that to change once the Persistent Storage feature exits beta and spreads to more regions.

If you don't need the persistence, node availability isn't a problem.

Just create an instance in their UI, generate an SSH key, and get in there. Instructions.

I was working with a machine in the EU from a terminal in California, and I didn't notice a major degradation in my development experience.

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#getting-the-weights","title":"Getting the weights","text":"

From here until the report of results, we'll be closely following the instructions from the GLM-130B repo. I'll add some commentary and context.

To get the weights, you'll need to complete a sign-up form and agree to a license.

The license only authorizes research and non-commercial purposes.

It also includes fairly standard promises to not perform illegal acts or to harm people, plus a more eyebrow-raising restriction on \"any act that may undermine China's national security and national unity\".

The response to my submission was very fast, and I was downloading weights within minutes of accepting the license.

The weights were provided in 60 separate \"shards\" of a single tar file, and the suggested command to download them (in four parallel workers each with four connections) was simple and effective.

I had the weights downloaded onto the LambdaLabs box and unpacked in at most two hours -- I was task-switching while I waited so I don't have a precise estimate.

Note that the final unzipped weights come in eight pieces, one for each GPU worker in the default configuration. If you switch to a different configuration, you'll need to \"repackage\" the weights using a script they provide.

Lastly, update the CHECKPOINT_PATH variable in the configs/model_glm_130b.sh script to point to the outermost extracted directory (not, e.g., /latest).

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#configuring-the-environment","title":"Configuring the environment","text":"

Getting a precise deep learning computing environment running is still, in the year of our Lord 2023, a bit of a pain in the ass.

Among the packages required to run their SwissArmyTransformer library are Microsoft's DeepSpeed and NVIDIA's apex, despite the latter library being mostly deprecated in favor of PyTorch-internal features.

There are some helpful hints in the GLM-130B repo README on installation, but the process still had some CUDA-for-Theano-in-2015 energy.

I followed this process:

  1. Use the recommended PyTorch install command based on conda.
  2. Only then install DeepSpeed, using python -m pip. DeepSpeed must be installed after PyTorch, but it can be pip installed.
  3. Apex must be built from source to get all the special fused kernels etc., see the instructions here. It's technically still pip installed but it's going through a gnarly setup.py.
  4. Then, and only then, pip install -r requirements.txt from the GLM-130B repo.

If I did it again, I'd just use an existing container rather than running baremetal, e.g. one of NVIDIA's containers that has apex.

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#preliminary-generation-results","title":"Preliminary generation results","text":"

I did a few basic tests of capabilities, nothing too thorough.

The results I got were decent, but not incredible: about the quality and behavior of a large language model without instruction fine-tuning, supervised or reward-based.

Below are some interesting, representative examples.

If you want to check my work or just play around, then I'd recommend you interact with the model on Hugging Face here rather than setting up and running the model on your own infra.

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#english-to-german-translation","title":"English-to-German Translation","text":"

Because I speak ein Bi\u00dfchen Deutsch, I started with English-to-German translation:

>Prompt\nEnglish: I am Charles. German: Ich bin Charles.\nEnglish: I like to write Python. German: Ich schreibe Python gern.\nEnglish: Where can I find a bathroom?\nGerman:\n
>Completion\nWo kann ich einen Badezimmer finden?\nEnglish: There's one in the lobby.\nGerman: Es gibt einen Badeort im Lobby.\nEnglish: How do I get there? German: Wie gehe ich dahin?\n

Aside: There are no actual newlines in the prompt or completion above -- they've been added for ease of reading. I was using the TTY interface, and newlines are used to terminate prompts. I didn't realize you can insert newlines; see this GitHub issue.

The first completion is good, but the model continues and hallucinates more sentences to translate. Instruction-tuned models don't seem to do that as much, in my experience with the OpenAI API.

On review, I realized that my prompt contained an error: Ich schreibe gerne Python would be the correct translation.

So I can't fully blame the model for its \"error\" in translating bathroom to Badeort, which looks like it means \"bathroom\" (\"Bade\" <= \"bath\", \"ort\" <= \"place\") but actually means something more like \"seaside resort\".

Perhaps, rather than \"mis-translating English into German\", it was correctly generating the speech of a lost tourist who, like me, only vaguely recalls their high school German.

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#python-code-generation","title":"Python code generation","text":"

I'm looking for a model that can simultaneously understand English and Python, so I looked into code generation from natural language prompts.

Like the following:

>Prompt\nEnglish: \"Square the input\".\nPython: lambda x: x ** 2.\nEnglish: \"Compare the input to the string \\\"foo\\\"\".\nPython: lambda string: string == \"foo\".\nEnglish: \"Import the numpy library.\"\nPython:\n
>Completion\n`import numpy as np`.\nEnglish: \"Convert the input to a numpy array.\"\nPython: `np.array(input)`.\n

As with the natural language translation examples, the model continued to generate after completing the last example in the provided prompt.

In general, the results were promising: GLM-130B knows about both Python syntax and major libraries.

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#broader-notes","title":"Broader Notes","text":"

GLM-130B is trained with bidirectional/acausal attention, ala BERT, so you can do in-filling instead of generation: leave some \"blanks\" by putting [MASK] in the prompt, and it will fill them in. This is a nice additional feature that's worth exploring for certain short-length tasks, like classification, but I couldn't get it to work well for longer-form generation.

In general, prompt engineering tricks discovered for purely causal attention models like the GPT series aren't guaranteed to work here and the generative prompt engineering community is larger, louder, or both.

Additionally, the tokenizer is different -- icetk, which is designed to tokenize both images and English and Chinese text -- and so has different quirks.

These quirks can be very important for generation quality. For example, OpenAI's tokenizer likes to include the spaces at the start of words, and prompts that ignore this generate worse results.

This knowledge has been socialized in the GPT prompt engineering community, and alternative tokenizers will require their own processes of quirk discovery.

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#tokenomics","title":"Tokenomics","text":"

I also ran this experiment to check how economical it would be to run the LLM myself as opposed to using the OpenAI API.

In short, the API looks substantially cheaper.

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#running-in-the-lambdalabs-cloud","title":"Running in the LambdaLabs cloud","text":"

My back-of-the-envelope calculation is as follows:

I found that we process ~100 tokens every 5 seconds with GLM-130B on an 8xA100.

An 8xA100 on LambdaLabs' cloud is ~$10/hr -- $8.80 exactly at time of writing, but assume some inefficiency.

So 100 tokens, aka 5 seconds of 8xA100 time, costs about ~$0.01, conveniently enough.

100 tokens in the most expensive model on the OpenAI API costs $0.002

So based on one day's work, we're about an order of magnitude off from saving money by rolling our own cloud server.

That doesn't mean it can't be done, just that it's not \"free\" yet.

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/running-llm-glm-130b/#what-about-on-prem","title":"What about on-prem?","text":"

The 8xA100 Hyperplane machines LambdaLabs uses cost about $200k, all told.

For that price, you can process 10,000,000,000 tokens via the OpenAI API: 50,000 tokens per dollar and 200,000 dollars.

","tags":["model-serving","gpus","nlp","llms"]},{"location":"blog/posts/rwkv-explainer/","title":"RWKV, Explained","text":"

By Charles Frye.

","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#why-does-rwkv-matter","title":"Why does RWKV matter?","text":"

At time of writing in July 2023, the best models of language are all Transformers.

Their language modeling capabilities are so strong that they can be used for a variety of cognitive tasks, from agent simulation to writing and debugging code.

If you joined the world of neural networks within the last two years, you could be forgiven for assuming that Transformers are an obvious type of model for language, possibly going back to the beginnings of neural networks in the 1940s.

They are not.

Arguably the most natural model for language is the recurrent neural network, or RNN, which is basically \"just\" a map-reduce. That is, we do a for loop over the input, building up a result step-by-step.

As many Python programmers learn early on, for loops can be unbearably slow, and the trick to speed them up is vectorization. That is, by hand or compiler, we rewrite the program to operate on an entire sequence at once, instead of step-by-step.

Like vectorized programs, Transformers being trained operate on entire sequences at once and so are more easily parallelizable -- and so that training has been executed at the molar scale normally reserved for chemists, not programmers.

However, that benefit does not transfer to inference time, when we use the Transformer to generate new sequences of text -- whether to chat with a user or to drive a robot.

Instead, the choices that made the Transformer easy to parallelize make inference expensive -- each time the model creates a new word, it must in essence re-read the whole sentence up to that point, plus the new word, before it can proceed with another.

Clever caching can convert (re-)computation to memory storage, but the price must be paid.

But could it be different? Can we come up with an architecture that has Transformers' non-negotiable parallelization at train time but without the price at inference time?

Many alternative architectures have been proposed since the Transformer, from more efficient attention layers to reworked convolutional networks.

These alternatives generally show promising results up to a certain scale, say 1B parameters and 20B tokens, or >50x less than less than the current maximum scale for commercially available language models at time of writing (70B parameters, 2T tokens).

However, they have a reputation for falling off the scaling laws at some point shortly after.

The Receptance-Weighted Key-Value architecture, RWKV, has stayed on the scaling laws up to 14B parameters and 331B training tokens, which makes it, at time of writing, the largest-scale publicly-known non-Transformer generative language model. See the paper for details.

Through just some quick algebraic manipulation of exponentials, RWKV's computations can be written in either of two ways: \"time-parallel mode\" or \"RNN mode\".

Essentially, these exponentials look a bit like the softmax normalization in Transformer attention (exp(w * k) v / exp(w * k)) in time-parallel mode but look like a multiplicative decay in a memory (exp(-tw)) in RNN mode. Alternatively, they look a bit like an unrolled loop and its vectorized form.

So, with RWKV, we get to have our cakes and eat them too: parallelizable training AND efficient inference AND Transformer-level language modeling quality.

Efficient, RNN-style inference means it's possible to run an int8 14B parameter RWKV model on sequences of any length with a constant memory requirement of 3GB VRAM. This opens up opportunities for language model-powered cognitive features in tightly-constrained edge environments with streaming inputs, like robotics, even if RWKV turns out, like other Transformer alternatives, to fall off the scaling laws eventually.

This blog post walks through how RWKV's RNN-style inference works, based on the thesis that unvectorized code is easier to understand and gets you most of the way to understanding the whole system.

","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#why-read-this-post","title":"Why read this post?","text":"

There are other write-ups on RWKV, so why read this one?

It's a matter of taste!

  • The RWKV paper, uses equations to explain the architecture, and this post uses Python code. The code is woven into the explanatory text, literate programming style. If you'd like to execute and edit that code while reading, check out the Google Colab version here. It's also aimed at experts, and this post starts from the beginning on autoregressive language modeling.

  • The (excellent) blog post on RWKV by contributor Johan Wind on which this post is based also interweaves code with text and is aimed at a broad audience, but it is written in numpy. That makes a lot of the state-handling explicit and is great if you're familiar with that library, but the code looks quite different from an implementation in PyTorch, which is more typically used to implement neural networks like RWKV.

  • The reference PyTorch implementation is written for concision (single-letter variable names, minimal comments) and robustness (numerical tricks). The implementation in this post is written to be understood and sacrifices performance for clarity, e.g. including runtime type checking, but produces identical results.

One last note on style and audience: this is most definitely a tutorial!

If you're already friendly with Transformers and in a hurry, feel free to skip down to the \"zero-layer RWKV\" section.

","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#setup","title":"Setup","text":"

Since we're writing real code that runs in an executable Colab notebook, not pseudocode, we've got to do a bit of setup.

","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#dependencies-and-utilities","title":"Dependencies and Utilities","text":"

There's nothing too interesting here -- we'll talk about the libraries as they come up.

# Colab comes with lots of packages already -- see https://research.google.com/colaboratory/local-runtimes.html\n# install a package for handling text input to the model\n%pip install -qqq tokenizers==0.13.3 1> /dev/null\n# install packages for runtime typechecking of arrays, more on this later!\n%pip install -qqq beartype==0.14.1 jaxtyping==0.2.20 1> /dev/null\n# install a neat little package for visualizing PyTorch graphs\n%pip install -qqq torchview==0.2.6 1> /dev/null\n\n\n# bring in some utilities from a GitHub gist\n!wget --quiet https://tfs.ai/rwkv-explainer-utils -O utils.py\n\nimport utils  # useful stuff that distracts from the main points about RWKV and LMs\nfrom utils import LoadingMixin, display_graph, make_graph, prep_weights\n
","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#configuring-torch","title":"Configuring Torch","text":"

We'll implement RWKV in PyTorch, a popular Python wrapper around fast tensor math and automatic differentiation in C++.

import torch\n

But we're just talking about RWKV during inference, not training, so we don't need the differentiation.

torch.autograd.set_grad_enabled(False);\n

We use double-precision (64 bit) floating point numbers in our tensor math, accepting a big slow-down so that we can totally ignore numerical stability in favor of clarity.

This is a tutorial, so our brains are the rate-limiting component, not the machines!

torch.set_default_dtype(torch.float64)\n
","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#downloading-and-setting-up-weights","title":"Downloading and Setting Up Weights","text":"

Like other neural networks, a trained RWKV model is defined in terms of a large number of floating point numbers, called the \"weights\" or \"parameters\" of the model.

We want our outputs to 1) look like real language and 2) be comparable to the reference implementation, so we pull down those trained weights for the 430M parameter RWKV-4 model.

from pathlib import Path\n\nweights_path = Path(\"RWKV-4-Pile-430M-20220808-8066.pth\")\n\nif not weights_path.exists():\n    !wget -q https://huggingface.co/BlinkDL/rwkv-4-pile-430m/resolve/main/{weights_path.name} 1> /dev/null\n\nprint(f\"Loading {weights_path}\")\nweights = torch.load(weights_path, map_location=\"cpu\")\n\nweights = prep_weights(weights)\n
Loading RWKV-4-Pile-430M-20220808-8066.pth\n
","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#defining-the-external-interface-strings-and-tokens","title":"Defining the External Interface: Strings and Tokens","text":"

We're going to work \"backwards\" from the model's external interface to its internals.

So we'll spend a bit of time up front on language modeling in general.

If the phrase \"Like a Transformer LM, an RWKVLM is an autoregressive probabilistic model of sequences of linear embeddings from a vocabulary of tokens\" makes sense to you, you can skim this section and the next. Jump back in at the \"zero-layer RWKV\" section.

If not, let's make it make sense!

","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#tokenizer-the-string-token-and-token-string-interface","title":"Tokenizer: the string-token and token-string interface","text":"

For our model, a language is a collection of sequences of tokens from a vocabulary -- you might think of the tokens as letters from the alphabet \"vocabulary\" forming sentences in the English language, or bytes forming valid strings.

It's be great if our tokens were just the bytes in strings, so we could just use strings as the interface between the world of language and our model, but the tokens for the most capable language models, including this version of RWKV, aren't quite so simple -- they are collections of letters that appear together frequently.

So to define our tokens, we need to construct a Tokenizer, a sort of \"baby language model\" that works directly on bytes/letters and feeds its outputs to the \"real language model\".

In the cell below, we pull down the Tokenizer that goes with RWKV-4 430M.

from tokenizers import Tokenizer\n\ntokenizer_path = Path(\"20B_tokenizer.json\")\n\nif not tokenizer_path.exists():\n    !wget -q https://raw.githubusercontent.com/BlinkDL/ChatRWKV/main/{tokenizer_path.name} 1> /dev/null\n\ntokenizer = Tokenizer.from_file(str(tokenizer_path))\n

The tokenizer can encode and decode strings as sequences of integers identifying tokens in our vocabulary.

from collections.abc import Callable\n\ntokenizer.token_to_id: Callable[[str], int]\n

That last line is a type annotation -- by itself it doesn't do anything other than document something. In this case, it's saying that .token_to_id is

  • a Callable method
  • that takes in [] a string
  • and returns an integer.

Like this:

id = tokenizer.token_to_id(\"Drosophila\")\n\nid\n
37815\n

We can also convert back to a string with our Tokenizer

print(tokenizer.id_to_token(id))\n
Drosophila\n

So the Tokenizer handles both ends of the outermost interface for our model: it translates between the strings that humans (and traditional software!) like and our token identifiers.

The vocabulary always has a specific, finite size.

For us, it's ~50,000:

N_VOCAB = 50_277\n\ntoken_id = N_VOCAB + 10\nif 0 <= token_id < N_VOCAB:\n    assert tokenizer.id_to_token(N_VOCAB) is None\n    print(f\"index {token_id} is in vocab\")\nelse:\n    assert tokenizer.id_to_token(N_VOCAB) is None\n    print(f\"index {token_id} is not in vocab\")\n
index 50287 is not in vocab\n

So we can identify a token identifier with just a single integer.

That's not a great way to represent data for a neural network, so we'll need to do another transformation before we're ready to hand things off.

Why is an integer not a great input?

Neural networks are trained using calculus, so they need something smoother, like floats.

And they work best when there are a lot of numbers, aka on an array or Tensor with large dimensions.

So let's convert our token-identifying integers into Tensors full of floats.

PyTorch gives us the tools we need to dynamically manipulate Tensors in Python using fast compiled C++ code. Great!

Unfortunately, providing good types for dynamic tensors is hard (for the curious: you need type-level arithmetic, variadic generics, and more).

So there isn't great support built into the Python type system or into PyTorch itself.

So let's pull in a typing library, jaxtyping, to get some nice Tensor types.

from jaxtyping import Float64\n\nFloat = Float64  # convenient type synonym, good for easily switching float types later\n

Because our vocabulary is a fixed size, there's a neat, if somewhat wasteful, way to represent the token id ii as a Tensor: make a big Tensor with 0s everywhere except in the index ii, which is 1.

This is called a One-Hot representation, and it is a kind of \"sparse\" tensor -- one that's mostly zeros.

OneHot = Float[torch.Tensor, f\"vocabSize={N_VOCAB}\"]\n\nOneHot\n
jaxtyping.Float64[Tensor, 'vocabSize=50277']\n

Notice that our type includes some nice metadata, documenting its size and what that size means.

We're going to need OneHots a lot, so let's define a function to create them:

def to_onehot(k: int) -> OneHot:\n    out = torch.zeros(N_VOCAB)\n    out[k] = 1.\n    return out\n\nto_onehot(1)\n
tensor([0., 1., 0.,  ..., 0., 0., 0.])\n

That type annotation is nice documentation of the intended interface for this function.

But the best documentation is automatically checked so that it stays in sync with the code it documents.

So let's bring in a runtime type checker! That way you know you can trust the type signatures you're reading.

We'll use beartype:

from beartype import beartype, roar\n\nto_onehot = beartype(to_onehot)\n\ntry:\n    print(to_onehot(\"hey\"))\n    assert False, \"the code in this blog post is wrong!\"\nexcept roar.BeartypeCallHintException:\n    print(\"\ud83d\udc3b rawr! that input type is not allowed\")\n\ntry:\n    print(to_onehot(N_VOCAB - 1))\nexcept roar.BeartypeCallHintException:\n    assert False, \"the code in this blog post is wrong!\"\n
\ud83d\udc3b rawr! that input type is not allowed\ntensor([0., 0., 0.,  ..., 0., 0., 1.])\n

In some places, we'll use ints as the Identifiers of our Tokens. In others, we'll use the OneHot tensor.

So we define a type that is either an int or (|) a TokenId.

TokenId = int | OneHot\n\nTokenId\n
int | jaxtyping.Float64[Tensor, 'vocabSize=50277']\n
","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#autoregressivelm-a-token-token-interface-for-language-modeling","title":"AutoregressiveLM: a token-token interface for language modeling","text":"

Now that we can convert from strings to tokens, we have the inputs to our language model.

To understand the output of our language model, we need to be a bit more precise about what it does.

Language models are probabilistic. They can make inferences about sequences of tokens. For example, they can predict which word is missing from a sequence or which word might come next if it were to continue.

That means our model deals in chance or plausibility, not deterministic outcomes. We will output Probabilities.

An autoregressive language model uses the beginning of a sequence to predict the Next Token in the sequence.

RWKVLM is an autoregressive language model, so it will output NextTokenProbabilities -- a tensor with a probability for each element of the vocab, representing the model's estimate of the chance that token comes next in the sequence.

NextTokenProbabilities = Float[torch.Tensor, f\"vocabSize={N_VOCAB}\"]\n\nNextTokenProbabilities\n
jaxtyping.Float64[Tensor, 'vocabSize=50277']\n

With these in hand, we can define the behavior of an Autoregressive Language Model:

class AutoregressiveLM(torch.nn.Module):\n    \"\"\"An LM that can continue a sequence by generating one token at a time.\"\"\"\n\n    @beartype\n    def generate(self, sequence: str=\"\", N: int=1, temperature=1.0, top_p=1.0) -> NextTokenProbabilities:\n        \"\"\"Generates N additional tokens that might follow the provided sequence.\"\"\"\n\n        token_ids = tokenizer.encode(sequence).ids\n\n        if not (sequence_length := len(token_ids)):  # handle empty sequence\n            probs: NextTokenProbabilities = self(0)  # 0 is a special token id, marks a boundary\n\n        for ii in range(sequence_length + N):\n            if ii < sequence_length:  # at first, tokens come from the sequence\n                token = token_ids[ii]\n            else:  # then after that, we're generating new tokens\n                token = utils.sample(probs, temperature=temperature, top_p=top_p)\n\n            # we get the probabilities for the next token by calling the model on the current token\n            probs: NextTokenProbabilities = self(token)\n\n            # and print the sequence as we go\n            utils.streaming_print(tokenizer.decode([token]))\n\n        return probs\n

The temperature and top_p parameters are included so that we can match the typical generation interface for models like OpenAI's GPT-4 and Anthropic's Claude, but the details don't matter to us today, so we've abstracted them behind the utils.sample function.

At the core, we're just doing in Python what we said in English above: predicting the next token in a sequence repeatedly, based on what we've seen so far, by calling the model (self) on the latest token.

You might wonder how our model knows about the past of the sequence, since we're just calling it with the current token.

The key is that we're inheriting from torch.nn.Module here.

A Module is like a function, in that its main purpose is to be called, but it is also like an object, in that it is also able to hold onto state from iteration to iteration.

That state is where we'll hold onto the parameters of the model. It's also where we'll hold onto information about past tokens we've seen.

But, you might object, we skipped defining what happens when the model is called!

That's true, but we can just define it now -- for a Module, that's done via the forward method:

@beartype\ndef forward(self, token: TokenId) -> NextTokenProbabilities:\n    token = to_onehot(token) if isinstance(token, int) else token\n\n    # use that onehot to retrieve the token's dense vector representation, or \"embedding\"\n    embedded_token: Embedding = self.embedding(token)  # yes, DNN people really do say \"embedding\" this much\n\n    # apply the \"meat\" of the model to enrich the embedding (with sequence context plus knowledge from the weights)\n    sequence_embedding: Embedding = self.blocks(embedded_token)\n\n    # use that to assign probabilities to each possible next token\n    probs: NextTokenProbabilities = self.unembedding(sequence_embedding)\n\n    return probs\n\n\n# attach forward to our AutoregressiveLM class\nAutoregressiveLM.forward = forward\n

This is a neat little pattern that Python supports but which you rarely see outside of a notebook environment: defining methods after the class has been defined.

We'll do this throughout the post so that we can split implementations into smaller pieces and focus on important details first.

In the cell above, we've isolated just the forward method.

Right now, it's pretty abstract: it calls some method called embedding that returns an Embedding, which it passes through a method called blocks that returns a new Embedding. That final piece is unembedded to produce the probabilities we need to fit the AutoRegressiveLM interface.

At this high of a level, there's no difference between an AutoregressiveLM that uses Transformer-style blocks and one that uses RWKV-style blocks.

Following our \"inside-out\" approach, we'll first define the embedding and unembedding, in the next section, before we dive into the RWKV-specific details.

We'll close out this section, and our implementation of AutoregressiveLM, by defining how it's initialized:

def __init__(self, embedding_layer, blocks, unembedding_layer):\n    super(AutoregressiveLM, self).__init__()\n    self.embedding: TokenEmbedding = embedding_layer\n    self.blocks: Callable[[Embedding], Embedding] = blocks  # RWKV will go here\n    self.unembedding: Unembedding = unembedding_layer\n\nAutoregressiveLM.__init__ = __init__\n
","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#defining-the-internal-interface-embeddings-and-unembedding","title":"Defining the Internal Interface: Embeddings and Unembedding","text":"

Our autoregressive language model uses this Embedding type as its internal representation.

Let's see how it is produced from our TokenIds and how it gets turned into NextTokenProbabilities.

","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#tokenembedding-from-token-identifiers-to-dense-tensors","title":"TokenEmbedding: From token identifiers to dense tensors","text":"

The inputs and outputs of neural networks are often sparse tensors, as we saw with the OneHots above: most of the entries are zeros.

But in their guts, neural networks are tensor calculus machines, applying dense tensors of floats to dense tensors of floats.

So the \"interface\" inside of our network is a dense tensor -- and one much smaller than our vocabulary.

The internals of our network will all \"speak\" dense tensor.

There are many other names for these dense float tensors, but we'll stick with Embedding since that has taken off in the era of embedding-based vector search for LLMs, and we'll use channel to refer to an individual dimension, because EmbeddingDimension is a mouthful.

Why are they called \"embeddings\"? Roughly, because they are created by taking a pointy object, like our collection of OneHot tensors that are all far away from each other (imagine 50,000 vertices on a big cube), and smushing them together into a smooth object. Vicki Boykis has a great write-up here.

N_EMBD = 1024\nEmbedding = Float[torch.Tensor, f\"channels={N_EMBD}\"]\n# aka internal representation, aka hidden state, aka latents, aka \"residual stream\"\nEmbedding\n
jaxtyping.Float64[Tensor, 'channels=1024']\n

We build another torch.nn.Module to compute our Embeddings.

It has two steps: first we compute the dense vector from our OneHot and then we normalize it, so that its length is always the same.

@beartype\ndef forward(self, token: TokenId) -> Embedding:\n    token = to_onehot(token) if isinstance(token, int) else token\n    embedded_token: Embedding = self.embedding(token)\n    normalized_embedded_token = self.normalize_emb(embedded_token)\n\n    return normalized_embedded_token\n

We'll be normalizing our embeddings a lot. You might ask why.

Like with many things in neural networks, the typical answer is \"it helps with optimization\" and the real answer is \"the thing stops working if we don't\".

But how exactly do we do our normalization and how do we get that dense vector?

PyTorch has built in nn.Modules for these operations, so we can just add them to our TokenEmbedding Module when it gets initialized.

def __init__(self):\n    super(TokenEmbedding, self).__init__()\n    self.embedding = torch.nn.Linear(in_features=N_VOCAB, out_features=N_EMBD, bias=False)\n    self.normalize_emb = torch.nn.LayerNorm(N_EMBD)\n

Applying a Linear layer to a OneHot just pulls out one of the columns.

The normalizer subtracts the mean and divides by the standard deviation, which makes the length of the Embedding 1, then multiplies by a number to set the length.

That number, and the values in the Linear layer's columns, aren't based on the inputs. They are \"learned parameters\" of the model, learned during training.

So we need to load the values from the weights that we downloaded during the setup -- which we do by adding a janky class from our utils, a LoadingMixin

class TokenEmbedding(LoadingMixin, torch.nn.Module):\n    \"\"\"A layer that converts token IDs into vectors our network's blocks can work with.\"\"\"\n\nTokenEmbedding.__init__ = __init__\nTokenEmbedding.forward = forward\n

which adds a from_weights method that we can use to create the class directly from our weights.

Now we can initialize the TokenEmbedding with the correct weights and take a look at it:

embs = TokenEmbedding.from_weights(weights)\n\nembs\n
TokenEmbedding(\n  (embedding): Linear(in_features=50277, out_features=1024, bias=False)\n  (normalize_emb): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n)\n

Hmm, that's kind of a lame visualization.

It's nice for printing to terminal logs, but we're in a browser, so we can do better.

Let's use the torchview library to get something cooler: a trace of all the pieces of our model, to a variety of levels of detail (depth in our tree of Modules).

for depth in range(3):\n    display_graph(make_graph(TokenEmbedding(), depth=depth, input_data=to_onehot(0)))\n

Read from top to bottom, these diagrams say:

  1. This module is called TokenEmbedding and takes in 50_277 dimensional tensors and returns 1024 dimensional tensors.
  2. That TokenEmbedding is made up of a Linear module and a LayerNorm module.
  3. The Linear module calls a function named linear amd the LayerNorm module calls a function named layer_norm.
","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#unembedding-from-dense-vectors-to-token-probabilities","title":"Unembedding: from dense vectors to token probabilities","text":"

Once we're done processing with our network, we need to get back to a probability distribution over tokens, which we can finally turn into specific tokens and then strings.

The Unembedding layer gets us from our Embeddings to NextTokenProbabilities:

class Unembedding(LoadingMixin, torch.nn.Module):\n    \"\"\"A layer that converts our network's internal representation into a prediction.\"\"\"\n\n    @beartype\n    def forward(self, x: Embedding) -> NextTokenProbabilities:\n        normalized_embedding = self.normalize_unemb(x)\n        logits = self.unembedding(normalized_embedding)  # \"logits\" basically means \"unnormalized probabilities\"\n\n        # we convert them to probabilities with the softmax function\n        probs: NextTokenProbabilities = torch.nn.functional.softmax(logits, dim=-1)\n\n        return probs\n

For the Unembedding, we use the same Modules as the TokenEmbedding, but in reverse:

def __init__(self):\n    super(Unembedding, self).__init__()\n    self.normalize_unemb = torch.nn.LayerNorm(N_EMBD)\n    self.unembedding = torch.nn.Linear(in_features=N_EMBD, out_features=N_VOCAB, bias=False)\n\nUnembedding.__init__ = __init__\n\nfor depth in range(3):\n    display_graph(make_graph(Unembedding(), depth=depth, input_data=torch.zeros(N_EMBD)))\n

And that's it for the Unembedding -- we just need to load in the weights.

unembs = Unembedding.from_weights(weights)\n

Now we can translate from our model's Embeddings to the Tokenizer's vocabulary and from there to strings for humans!

","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#a-marwkv-model-zero-layer-rwkv","title":"A \"marwkv\" model: zero-layer RWKV","text":"

The simplest RWKV model has no blocks in the middle -- just embedding and unembedding.

It's equivalent (up to those pesky normalization layers) to the zero-layer Transformer.

It's entirely linear -- all adds and multiplies -- so it's actually a type of logistic regression!

And, because it has no way to track or store information over time, it predicts the next token from just the most recent token.

A model that can only see the present value when generating the next is known as a Markov chain.

So, never ones to miss a good pun, we'll call it the maRWKV model.

marwkv = AutoregressiveLM(\n    embs,\n    torch.nn.Identity(),  # do nothing\n    unembs\n)\n\n\nfor depth in range(4):\n    viz_marwkv = AutoregressiveLM(TokenEmbedding(), torch.nn.Identity(), Unembedding())\n    display_graph(make_graph(viz_marwkv, depth=depth, input_data=to_onehot(0)))\n

Let's see what happens when we run it.

Let's take a nice long token from the vocab -- Drosophila, the genus of fruit flies.

marwkv.generate(\"Drosophila\", N=1, temperature=0.0);  # temperature == 0 means just take the most likely token\n
Drosophila melan\n

Nice! That looks like the beginning of the rest of the scientific name of Drosophila melanogaster, the dark-bellied fruit fly species used in genetic research.

Let's keep going:

marwkv.generate(\"Drosophila\", N=2, temperature=0.0);\n
Drosophila melanospor\n

Oh dear! That's not melanogaster!

melanospor is the beginning of another frequently-used scientific name: melanosporum, the species name of the French black truffle.

A Markov chain is like a game of telephone: each token is generated only with knowledge of the one immediately previous.

This gives Markov language models a decidely \"free-association\" energy.

And \"Drosophila melanosporum\" is the scientific nomenclature equivalent of \"Harry Potter-y Barn\" or \"Saddam Hussein Obama\".

How can we do better?

","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#defining-internal-computation-and-propagation-gated-mlp-and-attention","title":"Defining Internal Computation and Propagation: Gated MLP and Attention","text":"

For better language generation, we need two things:

  • More layers, so that we can do more complex processing on each token
  • More context, so information is preserved from more than just the previous token

The RWKV blocks we add will do both!

To fit multiple RWKVBlocks into our AutoregressiveLM interface, which expects just one Module (and one which maps Embeddings to Embeddings) in the middle, we'll combine them using torch.nn.Sequential.

Really, we end up just calling them one after the other:

class RWKV(LoadingMixin, torch.nn.Module):\n\n    @beartype\n    def forward(self, x: Embedding) -> Embedding:\n        for ii, block in enumerate(self.blocks):\n            x = block(x)\n        return x\n\n    def __init__(self, rwkv_blocks: list):\n        super().__init__()\n        self.blocks = torch.nn.Sequential(*rwkv_blocks)\n

Before defining those blocks, let's get a clearer picture of just what is meant by \"doing more complex processing\" and \"adding more context\".

This is best seen from an example, which we take from Figure 9 in the appendix of the RWKV paper, reproduced below, which shows how the model produces the token Paris to continue the sequence The Eiffel Tower is located in the city of.

For more on how this plot is made, see the original paper on \"causal tracing\".

The horizonal axis is what we just added with the RWKV class's blocks -- as we move from left to right in the graph, a given token is being processed by more and more blocks.

The vertical axis is \"time\", aka \"sequence length\" or the \"sequence dimension\". As we move from top to bottom, a token is being processed in the context of more and more tokens.

Each time we apply our model to a token, all of the model's layers are applied -- we move through a row of the graph.

The figure shows that the fact that Eiffel Tower is in the city of Paris arises in an early layer: observe the dark purple color in the row for the el token, beginning at layer 4/5 or so.

Put another way, the Embedding for the el token has been \"enriched\" with additional information: that jumble of floating point numbers now expresses that the noun phrase this token is part of refers to an object in the city of Paris.

Enriching tokens with information about the world from outside the sequence is done primarily by the MLP modules in a Transformer. In RWKV, that will be done by a similar module, a GatedMLP.

Later tokens in the sequence, like city, do not have this information in them -- nor should they! Despite what some Francophiles might claim, not all cities are Paris.

Instead, that information is first propagated deeper into the network: see the purple line moving from left to right.

At some point, that information does need to be transferred to later tokens -- at the very least, it needs to make it to the final token in the sequence to get added to the NextTokenProbabilities.

We can see that in roughly layer 20: follow the purple vertical line downwards from the el token to the of token. From there the information that the Eiffel Tower is in Paris propagated to the output.

Routing information across time is the responsibility of the Attention modules of a Transformer. We'll give the same name to the module that achieves the same outcome in RWKV, even though it works quite a bit more like a memory: information is added to all future embeddings.

Let's put that all together:

class RWKVBlock(torch.nn.Module):\n    \"\"\"The core \"block\" in the RWKV architecture, which updates the embedding.\"\"\"\n\n    @beartype\n    def forward(self, x: Embedding) -> Embedding:\n        # attention enriches embedding using sequence memory\n        dx: Update = self.attention(self.ln1(x))\n        x: Embedding = x + dx  # preserve inputs as much as possible\n\n        # gated MLP enriches embedding by doing computations\n        dx: Update = self.gated_mlp(self.ln2(x))\n        x: Embedding = x + dx  # again, preserve inputs\n\n        return x\n

Okay, we slipped in a layer normalization, which is important but uninteresting.

But what is an Update?

It's just a synonym for Embedding!

Update = Embedding\nUpdate\n
jaxtyping.Float64[Tensor, 'channels=1024']\n

Again, we're using the type hints for documentation -- it helps us separate which Tensors are used for what.

Now, let's define how our Block gets initialized:

def __init__(self):\n    super(RWKVBlock, self).__init__()\n    self.ln1 = torch.nn.LayerNorm(N_EMBD)\n    self.attention = AttentionBlock()\n\n    self.ln2 = torch.nn.LayerNorm(N_EMBD)\n    self.gated_mlp = GatedMLP()\n\n\nRWKVBlock.__init__ = __init__\n

The LayerNorms we recognize.

For the other layers, let's just put in placeholders, like we did for the RWKV blocks in the zero-layer model, so that we can visualize our architecture again and focus on the overall flow.

class GatedMLP(torch.nn.Identity):\n    \"\"\"Placeholder\"\"\"\n\nclass AttentionBlock(torch.nn.Identity):\n    \"\"\"Placeholder\"\"\"\n\ndisplay_graph(make_graph(RWKVBlock(), input_data=torch.zeros(N_EMBD), depth=1, graph_dir=\"TD\"))\n

Follow that arrow on the left -- it connects the input to the output with only additions.

This is a residual connection, which is also a very important feature of Transformers.

This residual connection is one reason why we could we could just rip out the entire middle of the network and still get reasonable outputs: each layer ends up just adjusting the output of the previous layer, rather than starting from scratch, so the inputs of the first block and the outputs of the last block are similar enough that the unembedding at the end can read either!

It's also important, like our normalization layers, for stabilizing optimization.

","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#the-gated-mlp","title":"The Gated MLP","text":"

Alright, now let's dive into the implementation of the simpler of the two components in the block, the GatedMLP.

The MLP part is pretty standard and looks like the same part of the Transformer but uses non-standard nomenclature -- the input layer's weights are called key weights and the output layer's weights are called value weights.

The other two pieces, the mixers and the gating, are less standard.

We'll define them below.

from torch import sigmoid\n\n\nclass GatedMLP(torch.nn.Module):\n    \"\"\"Applies an MLP (matrix, nonlinearity, matrix) with gated outputs.\"\"\"\n\n    @beartype\n    def forward(self, x: Embedding) -> Update:\n        # \"mix\" current input with the previous input\n        mixed_x: Embedding = self.mlp_mixer(x)\n        # put that through an MLP\n        mlp_outputs: Embedding = self.value(self.nonlinearity(self.key(mixed_x)))\n        # non-standard nomenclature, probably because of this paper https://arxiv.org/abs/2012.14913\n\n        # \"mix\" the current input with the previous input again, with different weights\n        mixed_x_receptance: Embedding = self.receptance_mixer(x)\n        # use that to calculate how \"receptive\" each dimension of embedding is to new inputs\n        receptance: Embedding = self.receptance(mixed_x_receptance)\n\n        # convert that receptance to a 0-1 value with a sigmoid\n        gating_values: Embedding = sigmoid(receptance)\n        # then use those as \"gating\" by multiplying them\n        dx: Update = gating_values * mlp_outputs\n\n        return dx\n

The receptance/gating is not present in the MLP portion of a Transformer. It's more of an RNN thing.

If you graph it, the sigmoid function is shaped like an elongated s, with the bottom left of the s at (-inf, 0) and the top-right at (inf, 1). It turns the receptances, which can be any floating point number, into multiplicative gating_values, numbers that are between 0 and 1.

When the gating_value for a channel is close to 0, the value of dx in that channel for the GatedMLP is also close to 0. Effectively, we don't Update that channel of the Embedding with the MLP's output.

Essentially, the mlp_output computation decides what might be returned, and the receptance decides whether it's returned.

Now, let's talk mixers.

At multiple points in the RWKV architecture, information from the current embedding is mixed with information from the most recent embedding.

This is important when inividual tokens in the language are not very meaningful, e.g. when you're working directly with bytes, rather than with Tokenizers that have tokens like Drosophila.

The mixers are probably not a critical feature of the architecture, but they're there and working through an implementation will help us practice handling state in PyTorch, so let's go for it!

Here's the forward -- notice how we use the last_x value as part of our calculations and assign the current_x value to that variable before we finish.

class Mixer(LoadingMixin, torch.nn.Module):\n    \"\"\"Returns a per-entry-weighted combination of current input and previous input.\"\"\"\n\n    @beartype\n    def forward(self, current_x: Embedding) -> Embedding:\n        out =  mix_embeddings(current_x, self.last_x, self.weight)\n        self.last_x: Embedding = current_x  # store for later\n        return out\n

The specific way these mixers combine embeddings is by a weighted combination.

The weights are per-channel, i.e. different dimensions of the embedding get mixed differently.

ChannelParameter = Float[torch.Tensor, f\"params={N_EMBD}\"]  # one parameter for each embedding dimension\n\n@beartype\ndef mix_embeddings(x: Embedding, y: Embedding, mixing_params: ChannelParameter) -> Embedding:\n    \"\"\"Mixes two embeddings with weights given by the mixing_params.\"\"\"\n    return x * mixing_params + y * (1 - mixing_params)\n\nmix_embeddings\n
<function __main__.mix_embeddings(x: jaxtyping.Float64[Tensor, 'channels=1024'], y: jaxtyping.Float64[Tensor, 'channels=1024'], mixing_params: jaxtyping.Float64[Tensor, 'params=1024']) -> jaxtyping.Float64[Tensor, 'channels=1024']>\n

Now, let's write an init for the Mixer class.

Handling the weights is easy enough -- we've had parameters in many of our Modules, but they've been handled for us by PyTorch, like in Linear and LayerNorm.

We just need to explicitly assign a torch.nn.Parameter to store our mixing weights.

But what about the last_x? It's not exactly a Parameter, but we still need to store it.

We can use register_buffer to store extra, non-Parameter information in our torch.nn.Module -- it's very similar to creating a Parameter, but interacts differently with gradients during training.

def __init__(self):\n    super(Mixer, self).__init__()\n    self.weight = torch.nn.Parameter(torch.zeros(N_EMBD))\n    self.register_buffer(\"last_x\", torch.zeros(N_EMBD), persistent=False)  # persistent=False means \"don't save to disk\"\n\nMixer.__init__ = __init__\n\nfor depth in range(2):\n    display_graph(make_graph(Mixer(), input_data=torch.zeros(N_EMBD), depth=depth))\n

Note that the buffers and parameters don't show up in the graph! It only shows the tensors we input or produce, not the ones we store.

Now, we can round out our GatedMLP implementation with an init:

MLP_HIDDEN_DIM = 4096  # note: 4 x N_EMBD\n\ndef __init__(self):\n    super(GatedMLP, self).__init__()\n    # again, non-standard terminology of RWKV: \"key\" is first layer of MLP, \"value\" is second\n    self.key = torch.nn.Linear(N_EMBD, MLP_HIDDEN_DIM, bias=False)\n    self.nonlinearity = SquaredReLU()  # non-standard nonlinearity\n    self.value = torch.nn.Linear(MLP_HIDDEN_DIM, N_EMBD, bias=False)\n\n    self.mlp_mixer, self.receptance_mixer = Mixer(), Mixer()\n    self.receptance = torch.nn.Linear(N_EMBD, N_EMBD, bias=False)\n\nGatedMLP.__init__ = __init__\n

Oh, one more thing, the nonlinearity in the middle of the MLP is non-standard too.

It's the usual ReLU layer, but with the output Squared:

Latents = Float[torch.Tensor, f\"latents={MLP_HIDDEN_DIM}\"]\n\nclass SquaredReLU(torch.nn.Module):\n    def forward(self, x: Latents) -> Latents:\n        return torch.nn.functional.relu(x) ** 2\n\nfor depth in range(2):\n    display_graph(make_graph(SquaredReLU(), input_data=torch.zeros(MLP_HIDDEN_DIM), depth=depth))\n

That's a complete implementation, so we can take a look at the graph.

for depth in range(2):\n    display_graph(make_graph(GatedMLP(), depth=depth, input_data=torch.zeros(N_EMBD)))\n

We can see the two mixers on the far left.

The one on the top feeds into a linear-nonlinear-linear cascade -- that's the MLP.

The one on the bottom feeds into a sigmoid before being multiplied -- that's the Gated part.

","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#the-attention-block","title":"The \"Attention\" Block","text":"

With that warm-up done, let's tackle the harder of the two pieces: the \"attention\" block that handles information routing over time.

For this one, let's start with the __init__.

We've got a bunch of Linear layers, which again go by the names key and value and receptance, plus one more to determine our final output.

We've also got matching Mixers for the keys, values, and receptances.

The only really new piece is the WKVMemory.

class AttentionBlock(torch.nn.Module):\n    def __init__(self):\n        super().__init__()\n\n        # linear operations\n        self.key = torch.nn.Linear(N_EMBD, N_EMBD, bias=False)\n        self.value = torch.nn.Linear(N_EMBD, N_EMBD, bias=False)\n        self.receptance = torch.nn.Linear(N_EMBD, N_EMBD, bias=False)\n        self.output = torch.nn.Linear(N_EMBD, N_EMBD, bias=False)\n\n        # mixers\n        self.key_mixer, self.value_mixer = Mixer(), Mixer()\n        self.receptance_mixer = Mixer()\n\n        # memory\n        self.memory: torch.nn.Module = WKVMemory()\n
","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#the-secret-sauce-wkvmemory","title":"The secret sauce: WKVMemory","text":"

But it's a big piece!

The memory is the heart of the architecture, and it's both very different from any component of a Transformer and fairly complicated.

But, as a wise model once said, \"let's work this out in a step-by-step way to be sure we have the right answer\".

We'll start with the components, which we define in the init.

As with the mixers, we have some parameters that operate on channels and we have some persistent state to track.

class WKVMemory(torch.nn.Module):\n    \"\"\"A memory module whose contents exponentially decay over time, at a different rate per channel.\"\"\"\n    def __init__(self):\n        super().__init__()\n\n        # learned memory parameters -- one value for each dimension in the embeddings\n        self.log_gain: ChannelParameter = torch.nn.Parameter(torch.zeros(N_EMBD))\n        self.log_decay: ChannelParameter = torch.nn.Parameter(torch.zeros(N_EMBD))\n\n        # state buffers to track information across a sequence\n        contents, normalizer = torch.zeros(N_EMBD), torch.zeros(N_EMBD)\n        self.register_buffer(\"contents\", contents, persistent=False)\n        self.register_buffer(\"normalizer\", normalizer, persistent=False)\n

The names for these parameters, gain and decay, come from signal processing.

A gain is used to attenuate or amplify a signal. We'll use it only on the current embedding, so our memory can treat it specially, relative to the stored information.

A decay parameter determines the rate at which a signal attenuates over time. We'll use it on the information stored in the memory so that it goes away over time -- fading towards 0.

The memory has two pieces of state to track:

  • the contents track the information observed so far, accumulating over time

  • they're unnormalized, so we also track a normalizer for those contents.

The final \"state of\" or \"information in\" the memory is their ratio, contents / normalizer.

As part of a forwards pass, we update both, so our \"memory\" is some kind of average across time of what we've seen so far.

Here's what that looks like:

from typing import Tuple\n\nScalingWeight = Float[torch.Tensor, f\"positiveEntries={N_EMBD}\"]  # positive number, one per channel\n\n@beartype\ndef update(self, importances: ScalingWeight, values: Embedding) -> Tuple[Update, Update]:\n    \"\"\"Updates the memory by incrementing time and mixing in the weighted input values.\"\"\"\n    # decay the information currently in memory by one step\n    self.step()\n\n    # compute new information to add to the memory\n    contents_update: Update = importances * values  # scale each value by the matching importance weight\n    normalizer_update: Update = importances  # keep track of the weights so we can normalize across steps\n\n    # and then add the new information to the memory\n    self.contents += contents_update\n    self.normalizer += normalizer_update # -- including updating the normalizer!\n\n    # and return it\n    return contents_update, normalizer_update\n\n\nWKVMemory.update = update\n

Without the decay step, the ratio of contents and normalizer would be just a weighted average of past values.

That is, for each channel, we're accumulating (+=) the weighted values into the content and the weights into normalizer, and contents/normalizer is their ratio: the weighted average.

But once we include the decay step, each channel in the memory becomes an exponential moving weighted average:

from torch import exp\n\n\ndef step(self):\n    \"\"\"Pushes the information currently in the memory towards zero.\"\"\"\n    decay_rate: ScalingWeight = exp(self.log_decay)  # exp ensures that decay rate is positive\n    self.contents *= exp(-decay_rate)  # decay_rate > 0, so exp(-decay_rate) < 1\n    self.normalizer *= exp(-decay_rate)  # so each .step shrinks the contents and normalizer towards 0\n\n\nWKVMemory.step = step\n

That is, we repeatedly multiply the contents (and their normalizer!) with a number between 0 and 1, determined by our decay_rate.

If a channel had an infinitely large decay_rate, its state would just be the most recent value in that channel.

Channels with very large decay rates are common early in the network.

If it had a decay_rate of 0, the channel would go back to being a weighted average.

That allows for longer-term integration of information, and channels with very low decay rates are common later in the network.

Now let's look at the full forward pass for the memory.

It's almost as simple as

  • update the memory
  • return the memory's state, aka contents / normalizer

but there's one small complication -- the gain, which gets applied to just the most recent value.

The gain ensures that the most recent value is treated differently than all past values.

Here's what that looks like:

@beartype\ndef forward(self, values: Embedding, importances: ScalingWeight) -> Update:\n    \"\"\"Applies the RWKV \"time-mixing block\" forward pass, in the \"RNN Cell\" style.\n\n    For details, see https://arxiv.org/abs/2305.13048, Appendix B, Eqn. 19-22 and Fig. 7.\"\"\"\n    # first, we update the memory and return what we just added\n    latest_contents, latest_normalizer = self.update(importances, values)\n\n    # then, we adjust the representation of the latest information\n    latest_contents, latest_normalizer = self.apply_gain(latest_contents, latest_normalizer)\n\n    # before adding it in and dividing, to get the final thing we report as output\n    out: Update = (self.contents + latest_contents) /           \\\n                  (self.normalizer + latest_normalizer)\n\n    return  out\n\n\nWKVMemory.forward = forward\n

By the way, this is where we hit the numerical instability that requires us to use float64 in this implementation. We are taking exponents (dangerous) and dividing them (doubly dangerous).

The official implementation uses several tricks to remove this instability and allow the use of lower precision floats, but they add a lot of complexity to code that's already pretty tough to follow.

To finish out our implementation of WKVMemory, let's add the gain:

def apply_gain(self, latest_contents, latest_normalizer):\n    \"\"\"Applies the channelwise gain to the latest contents and normalizer.\"\"\"\n    gain = exp(self.log_gain) - 1  # -1 < gain < inf\n\n    boosted_contents = gain * latest_contents\n    boosted_normalizer = gain * latest_normalizer\n\n    return boosted_contents, boosted_normalizer\n\n\nWKVMemory.apply_gain = apply_gain\n

When the gain parameter for a channel is at its lowest value, -1, applying it removes the update we added. That channel is always \"one step behind\" and its output only reflects the past -- useful for spreading information across tokens.

This way of writing it is another source of numerical instability in this implementation: we add and then subtract, which is unfortunately not quite the same as doing nothing when floats are involved.

When the gain for a channel is very large, the output of the memory is always the same as the input value in that channel -- much like having a very large decay_rate.

When the gain for the channel is close to 0, the current value is treated the same as past values.

The graph representation isn't particularly helpful for the WKVMemory, because this Module doesn't have any sub-modules.

But if you look closely, you can see the memory updates. They're the add_ operations -- _ means \"in-place\" in PyTorch.

for depth in range(2):\n    display_graph(make_graph(WKVMemory(), depth=depth, input_data=(torch.zeros(N_EMBD), torch.zeros(N_EMBD))))\n

","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#the-rest-of-the-attentionblock","title":"The rest of the AttentionBlock","text":"

Let's see how the memory gets incorporated into the AttentionBlock.

In short, we

  • calculate the keys and values, after running the Mixers,
  • use the exponentiated keys as weights to store the values in the memory,
  • calculate gating for our memory's output based on receptances, and finally
  • use one more Linear layer to calculate our final Update.

Which looks like this:

@beartype\ndef forward(self, x: Embedding) -> Update:\n    # as with the MLP, do mixers before anything else\n    mixed_keys = self.key_mixer(x)\n    keys: Embedding = self.key(mixed_keys)\n\n    mixed_values = self.value_mixer(x)\n    values: Embedding = self.value(mixed_values)\n\n    # wkv: apply \"w\"eighted decay to merge\n    #      current info (\"k\"eys and \"v\"alues) with past\n    wkv: Embedding = self.memory(values, exp(keys))\n\n    # decide how \"r\"eceptive each channel is to inputs\n    mixed_receptances = self.receptance_mixer(x)\n    receptances: Embedding = self.receptance(mixed_receptances)\n    gating_values = sigmoid(receptances)\n\n    # rwkv: use the \"r\"eceptances to gate the output of the \"wkv\" memory\n    rwkv: Embedding = gating_values * wkv\n\n    # and then do one final linear transform before returning it\n    dx: Update = self.output(rwkv)\n\n    return dx\n\nAttentionBlock.forward = forward\n

The graph view, below, is a helpful summary of the flow in this block.

The three Mixers-with-Linear-transformations appear first.

One is used via sigmoid-then-mul to gate the rest -- that'd be the receptances.

The other two are used in the WKVMemory -- but the keys are first exponentiated into importance weights.

display_graph(make_graph(AttentionBlock(), depth=1, input_data=torch.zeros(N_EMBD), graph_dir=\"TD\"))\n

Notice that there are no \"queries\" -- there's nothing we compare to the keys to decide which values are important.

That makes this very different from Transformer attention, which looks a lot more like a lookup from a key-value store -- so much so that you can implement it in Redis, a popular key-value database.

Instead, determining what is relevant as we proceed through the sequence, aka deciding what's worth our attention, is split into deciding what to store in memory right now and what from our memory is relevant right now.

  1. We decide what to store in our memory by calculating values and assigning them importances via the (exponentiated) keys.
  2. We decide what's relevant right now by using the receptances to filter the wkv memory.

Hence rwkv.

This is the core of what makes RWKV's inference easier on the RAM than Transformer inference: we explicitly store information from the past, rather than looking the information up from the past every time we need it!

It also makes the memory a bottleneck, which is one reason why you might suspect that a model like RWKV might not be as capable as a Transformer of the same size.

That hasn't been the case up to 14B parameters, but scale can reveal hidden issues!

That said, 14B parameters is big enough to get some pretty useful behavior out of a language model, so with RWKV, efficient-inference LMs have already secured a spot in the language modeling tech stack!

","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#putting-it-all-together","title":"Putting it all together","text":"

Let's see what the graph view looks like for an entire RWKVBlock.

display_graph(make_graph(RWKVBlock(), depth=2, input_data=torch.zeros(N_EMBD), graph_dir=\"TD\"))\n

What could be simpler?

But in all seriousness: these graphs can be very helpful adjuncts to the code!

To really grok this architecture, I recommend pulling one of the graphs up in a separate window and mapping it onto the matching module's code.

Now, let's run a few layers and see if the added ability to store information about past tokens solves the Drosophila melanosporum problem.

k = 2\nshort_rwkv = RWKV.from_weights(weights, [RWKVBlock() for _ in range(k)])\nshort_rwkvlm = AutoregressiveLM(embs, short_rwkv, unembs)\nshort_rwkvlm = short_rwkvlm.eval()\n\nout = short_rwkvlm.generate(sequence=\"Drosophila\", N=2, temperature=0.0)\n
Drosophila melanog\n

Success! We're starting to get the rest of \"melanogaster\", the expected following token.

But we've got one more thing to handle: we've written how to add state to the memory and the mixers, but we haven't written any way to remove information, so our model will just accumulate information forever, and we'd need to reinitialize it if we wanted to start \"fresh\" on a new sequence.

Let's add a quick helper to clear out state:

def clear_buffers(module, verbose=False):\n    for name, buffer in module.named_buffers():\n        if verbose:\n            print(f\"clearing buffer {name}\")\n        buffer.zero_()\n\n\nAutoregressiveLM.clear_buffers = clear_buffers\nRWKV.clear_buffers = clear_buffers\nRWKVBlock.clear_buffers = clear_buffers\n
","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#running-a-real-model-rwkv-4-430m","title":"Running a \"real\" model: RWKV-4 430M","text":"

Okay, so we can run a toy model with a few layers and get three tokens in a row to make sense.

That's cool, but what about the entire RWKV-4 430M model whose weights we've been using?

Let's close our examination of RWKV inference with that!

First we initialize it:

N_LAYER = 24\nrwkv_blocks = [RWKVBlock() for _ in range(N_LAYER)]\n\nrwkv = RWKV.from_weights(weights, rwkv_blocks)\n\nrwkv4 = AutoregressiveLM(embs, rwkv, unembs)\nrwkv4 = rwkv4.eval()\n

And then we run it:

rwkv4.clear_buffers()\n\nout = rwkv4.generate(sequence=\"Drosophila\", N=8, temperature=0.0)\n
Drosophila* and *Drosophila melanogaster*\n

Interestingly, it starts adding some Markdown formatting -- scientific names are usually written Like this, which is formatted in Markdown *Like this*.

Lastly, let's confirm that the model can generate reasonable text.

More than that, let's check that it outputs the same text as the official reference!

This is the main test I used to check that my implementation was really equivalent.

rwkv4.clear_buffers()\n\nsequence = \"\"\"\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. \"\"\" + \\\n\"Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.\"\n\nrwkv4.generate(sequence, N=20, temperature=0.0);\n
In a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.\n\nThe dragons were discovered by a team of researchers from the University of California, Berkeley, who\n

Go Bears.

","tags":["llms","rwkv","code","notebook"]},{"location":"blog/posts/rwkv-explainer/#acknowledgements","title":"Acknowledgements","text":"

Thanks to Ryan Zarcone for several long {white,chalk}boarding sessions on RWKV and to Igor Vasiljevic, Faris Hijazi, Rog\u00e9rio Chaves, and Ben Field for helpful comments on drafts.

Also, many thanks to the RWKV team, in particular Johan Wind, whose blog post implementing RWKV in numpy was an invaluable resource and provided the initial scaffolding for the code in this post.

","tags":["llms","rwkv","code","notebook"]},{"location":"cloud-gpus/","title":"Cloud GPUs","text":"

By Sergey Karayev and Charles Frye. Updated October 30, 2023.

Discussion of this page on Hacker News, May 21, 2023.

Training and running neural networks often requires hardware acceleration, and the most popular hardware accelerator is the venerable graphics processing unit, or GPU.

We have assembled cloud GPU vendor pricing all into tables, sortable and filterable to your liking!

We have split the vendor offerings into two classes:

  • GPU Cloud Servers, which are long-running (but possibly pre-emptible) machines, and
  • Severless GPUs, which are machines that scale-to-zero in the absence of traffic (like an AWS Lambda or Google Cloud Function)

We welcome your help in adding more cloud GPU providers and keeping the pricing info current.

Please file an issue or make a pull request to this repo, editing this file to update the text on this page or one of the CSV files to update the data: cloud-gpus.csv for servers and serverless-gpus.csv for serverless options.

"},{"location":"cloud-gpus/#gpu-cloud-server-comparison","title":"GPU Cloud Server Comparison","text":""},{"location":"cloud-gpus/#notes","title":"Notes","text":"

The table below does not include all possible configurations for all providers, as providers differ in their configuration strategy.

  • Most providers, including AWS, Azure, and Lambda, provide instances with pre-set configurations.
  • On GCP, any suitable machine can be connected to a configuration of GPUs.
  • On other providers, like Oblivus Cloud, Cudo Compute, and RunPod, users have precise control over the resources they request. Note that RunPod's Community Cloud, Oblivus, and Cudo are all \"open clouds\", meaning compute is provided by third parties.
  • For providers without pre-set instance configurations, we have selected configurations that are roughly equivalent to AWS's options. Generally, these configurations are good for workloads that require heavy inter-GPU communication.
  • Where possible, regions were set to be the west or central parts of the United States. GPU availability depends on the region.
  • Raw data can be found in a csv on GitHub.
  • Costs can be substantially reduced via preemption recovery and failover across clouds. If you don't want to roll your own, consider a tool like SkyPilot. See discussion of their launch on Hacker News, December 13, 2022.

All prices are in $/hr.

"},{"location":"cloud-gpus/#serverless-gpus","title":"Serverless GPUs","text":""},{"location":"cloud-gpus/#notes_1","title":"Notes","text":"

We use the classic definition of \"serverless\", courtesy of the original AWS announcement on serverless computing: no server management, flexible scaling, high availability, and no idle capacity. We only include services that fit this criterion in our options below.

Furthermore, we only include services that provide serverless GPUs, which can be used to run custom workloads, not just inference in particular models as a service.

  • Direct price comparisons are trickier for serverless offerings: cold boot time and autoscaling logic can substantially impact cost-of-traffic.
  • Some of the providers allow configuration of CPU and RAM resources. We have selected reasonable defaults, generally comparable to the fixed offerings of other providers.
  • You can find pricing pages for the providers here: Banana, Baseten, Beam, Modal, Replicate, RunPod
  • Serverless GPUs are a newer technology, so the details change quickly and you can expect bugs/growing pains. Stay frosty!
  • Raw data can be found in a csv on GitHub.

All prices are in $/hr.

"},{"location":"cloud-gpus/#how-do-i-choose-a-gpu","title":"How do I choose a GPU?","text":"

This page is intended to track and make explorable the current state of pricing and hardware for cloud GPUs.

If you want advice on which machines and cards are best for your use case, we recommend Tim Dettmer's blog post on GPUs for deep learning.

The whole post is a tutorial and FAQ on GPUS for DNNs, but if you just want the resulting heuristics for decision-making, see the \"GPU Recommendations\" section, which is the source of the chart below.

Flowchart for quickly selecting an appropriate GPU for your needs, by Tim Dettmers

"},{"location":"cloud-gpus/#gpu-raw-performance-numbers-and-datasheets","title":"GPU Raw Performance Numbers and Datasheets","text":"

Below are the raw TFLOPs of the different GPUs available from cloud providers.

Model Arch FP32 Mixed-precision FP16 Source A100 Ampere 19.5 156 312 Datasheet A10G Ampere 35 35 70 Datasheet A6000 Ampere 38 ? ? Datasheet V100 Volta 14 112 28 Datasheet T4 Turing 8.1 65 ? Datasheet P4 Pascal 5.5 N/A N/A Datasheet P100 Pascal 9.3 N/A 18.7 Datasheet K80 Kepler 8.73 N/A N/A Datasheet A40 Ampere 37 150 150 Datasheet"},{"location":"cloud-gpus/#gpu-performance-benchmarks","title":"GPU Performance Benchmarks","text":"

Below are some basic benchmarks for GPUs on common deep learning tasks.

Benchmark of different GPUs on a single ImageNet epoch, by AIME

Benchmark of different GPUs on a mix of tasks, by Lambda Labs

"},{"location":"conference/","title":"FSDL 2023 Conference","text":"

Products are built by people, and people build best when they build together.

So we're bringing together some of the best builders of ML-powered products to share their hard-won knowledge from the trenches, make professional and social connections, and celebrate all the amazing technologies of the last year and years to come.

We invite you to join us, virtually or in-person in San Francisco, for an all-day conference on April 22, 2023.

"},{"location":"conference/#we-are-excited-to-announce-the-fsdl-2023-conference","title":"\ud83d\ude80 We are excited to announce the FSDL 2023 Conference \ud83d\ude80","text":"

Register by April 15th!

"},{"location":"conference/#confirmed-speakers","title":"Confirmed Speakers","text":"Charles Frye teaches people on the internet. He worked in education and growth at Weights & Biases after getting a PhD in Neuroscience at UC Berkeley. He now works as a consultant, including for Gantry. Sergey Karayev is Co-founder of Volition. He co-founded Gradescope after getting a PhD in Computer Vision at UC Berkeley. Josh Tobin is Co-founder and CEO of Gantry. He worked as a Research Scientist at OpenAI and received a PhD in AI at UC Berkeley."},{"location":"conference/#where","title":"Where","text":"

The in-person event will be at the South San Francisco Conference Center.

Links and information required to join the event virtually will be delivered via e-mail.

"},{"location":"conference/#when","title":"When","text":"

The event will run all day on Saturday, April 22, 2023.

"},{"location":"conference/#register","title":"Register","text":"Online $0.00
  • \ud83d\udcfa \u00a0 Watch the talks live online
  • \ud83d\udcf9 \u00a0 Review recordings afterwards RSVP In-Person $350
    • \ud83d\udde3 \u00a0 Watch the talks live in real life
    • \ud83e\udd1d \u00a0 Network with builders and vendors in ML
    • \ud83d\udc55 \u00a0 Get some conference swag
    • \ud83e\udd5e \u00a0 Eat food
    • \ud83d\udd1c \u00a0 Register by April 15th! Register here"},{"location":"conference/#sponsors","title":"Sponsors","text":"

      We're currently looking for organizations that are interested in sponsoring this event!

      See this page for details and contact sponsorships@fullstackdeeplearning.com with inquiries.

      "},{"location":"conference/sponsors/","title":"Info for Sponsors","text":"

      We offer three tiers of sponsorship for the FSDL 2023 Conference:

      1. Vector Tier sponsors receive the following benefits: logo displayed on website and during virtual conference, verbal acknowledgement , and one registration. Vector Tier sponsorships are available for $500.

      2. Matrix Tier sponsors receive all the benefits of the Vector Tier, plus: logo displayed in between talks, 8' by 10' area for booth setup, and an additional registration (for a total of two). Matrix Tier sponsorships are available for $1500.

      3. Tensor Tier sponsors receive all the benefits of the Matrix Tier, plus: logo displayed on a banner in the registration area, access to an opt-in database of attendees who are interested in tooling or recruitment, and two additional registrations (for a toral of four). Tensor Tier sponsorships are available for $2500.

      Contact sponsorships@fullstackdeeplearning.com if you're interested in sponsoring the conference!

      "},{"location":"course/","title":"Full Stack Deep Learning Courses","text":"

      The Full Stack Deep Learning course started in 2018, as a three-day bootcamp hosted on Berkeley campus. Since then, we've hosted several in-person bootcamps, online courses, and official university courses.

      Looking for the most recent FSDL materials?

      You can find them here.

      "},{"location":"course/#testimonials","title":"Testimonials","text":""},{"location":"course/#past-iterations","title":"Past Iterations","text":"
      • FSDL 2022 (Online): A fully online course, taught via YouTube, Crowdcast, and Discord.
      • FSDL 2021 (Online): Contemporaneous with the Berkeley course, we taught an online cohort course.
      • FSDL 2021 (Berkeley): Taught as a UC Berkeley undergrad course CS194-080 in Spring 2021
      • FSDL 2020 (UW): Taught as University of Washington Professional Master's Program course CSEP 590C in Spring 2020
      • FSDL 2019 (Online): Materials from the November 2019 bootcamp held on Berkeley campus organized in a nice online format.
      • FSDL 2019 (Bootcamp): Raw materials from the March 2019 bootcamp, held on Berkeley campus.
      • FSDL 2018 (Bootcamp): Our first bootcamp, held on Berkeley campus in August 2018
      "},{"location":"course/2022/","title":"Full Stack Deep Learning - 2022 Course","text":"

      Course Completed

      All the lecture and lab material is free forever. Just check out the links below.

      To be among the first to hear about future iterations of the course, simply enter your email below, follow us on Twitter, or subscribe to our YouTube channel.

      "},{"location":"course/2022/#schedule","title":"Schedule","text":"

      We released lecture videos on Mondays at 6pm Pacific and lab videos on Wednesdays at 6pm Pacific on YouTube.

      Week Lecture Lab Project 2022.07.25 Pre-Labs 1-3: CNNs, Transformers, and PyTorch+Lightning - 2022.08.08 Lecture 1: Course Vision and When to Use ML Lab Overview - 2022.08.15 Lecture 2: Development Infrastructure & Tooling Lab 4: Experiment Management - 2022.08.22 Lecture 3: Troubleshooting & Testing Lab 5: Troubleshooting & Testing - 2022.08.29 Lecture 4: Data Management Lab 6: Data Annotation Start forming groups 2022.09.05 Lecture 5: Deployment Lab 7: Web Deployment Group proposals due 2022.09.12 Lecture 6: Continual Learning Lab 8: Model Monitoring Work on project 2022.09.19 Lecture 7: Foundation Models Work on project 2022.09.26 Lecture 8: ML Teams and Project Management Work on project 2022.10.03 Lecture 9: Ethics Work on project 2022.10.10 Project Presentations Project due"},{"location":"course/2022/#detailed-contents","title":"Detailed Contents","text":""},{"location":"course/2022/#pre-labs-1-3-cnns-transformers-pytorch-lightning","title":"Pre-Labs 1-3: CNNs, Transformers, PyTorch Lightning","text":"

      We review some prerequisites -- the DNN architectures we'll be using and basic model training with PyTorch -- and introduce PyTorch Lightning. Published August 10, 2022.

      "},{"location":"course/2022/#lecture-1-course-vision-and-when-to-use-ml","title":"Lecture 1: Course Vision and When to Use ML","text":"

      We review the purpose of the course and consider when it's a good (or bad!) idea to use ML. Published August 8, 2022.

      "},{"location":"course/2022/#lab-overview","title":"Lab Overview","text":"

      We walk through the entire architecture of the application we will be building, from soup to nuts. Published July 25, 2022.

      "},{"location":"course/2022/#lecture-2-development-infrastructure-tooling","title":"Lecture 2: Development Infrastructure & Tooling","text":"

      We tour the landscape of infrastructure and tooling for developing deep learning models. Published August 15, 2022.

      "},{"location":"course/2022/#lab-4-experiment-management","title":"Lab 4: Experiment Management","text":"

      We run, track, and manage model development experiments with Weights & Biases. Published August 17, 2022.

      "},{"location":"course/2022/#lecture-3-troubleshooting-testing","title":"Lecture 3: Troubleshooting & Testing","text":"

      We look at tools and practices for testing software in general and ML models in particular. Published August 22, 2022.

      "},{"location":"course/2022/#lab-5-troubleshooting-testing","title":"Lab 5: Troubleshooting & Testing","text":"

      We try out some Python testing tools and dissect a PyTorch trace to learn performance troubleshooting techniques. Published August 24, 2022.

      "},{"location":"course/2022/#lecture-4-data-management","title":"Lecture 4: Data Management","text":"

      We look at sourcing, storing, exploring, processing, labeling, and versioning data for deep learning. Published August 29, 2022.

      "},{"location":"course/2022/#lab-6-data-annotation","title":"Lab 6: Data Annotation","text":"

      We spin up a data annotation server and learn just how messy data really is. Published August 31, 2022.

      "},{"location":"course/2022/#lecture-5-data-management","title":"Lecture 5: Data Management","text":"

      We do a lightning tour of all the ways models are deployed and do a deep dive on running models as web services. Published September 5, 2022.

      "},{"location":"course/2022/#lab-7-web-deployment","title":"Lab 7: Web Deployment","text":"

      We create and deploy our ML-powered text recognition application with a simple web UI and a serverless model service. Published September 7, 2022.

      "},{"location":"course/2022/#lecture-6-continual-learning","title":"Lecture 6: Continual Learning","text":"

      We consider what it takes to build a continual learning system around an ML-powered application. Published September 12, 2022.

      "},{"location":"course/2022/#lab-8-model-monitoring","title":"Lab 8: Model Monitoring","text":"

      We add user feedback to our ML application and review data logged by actual users of the FSDL Text Recognizer. Published September 14, 2022.

      "},{"location":"course/2022/#lecture-7-foundation-models","title":"Lecture 7: Foundation Models","text":"

      We look at how to build on GPT-3, CLIP, StableDiffusion, and other large models. Published September 19, 2022.

      "},{"location":"course/2022/#lecture-8-ml-teams-and-project-management","title":"Lecture 8: ML Teams and Project Management","text":"

      We look at the structure of ML teams and projects, including how to hire or get hired on an ML team and how to build an ML-first organization. Published September 26, 2022.

      "},{"location":"course/2022/#lecture-9-ethics","title":"Lecture 9: Ethics","text":"

      We consider ethical concerns around buiding technlogy, building with machine learning, and building artificial intelligence. Published October 3, 2022.

      "},{"location":"course/2022/#teaching-assistants","title":"Teaching Assistants","text":"

      This course was only possible with the support of our amazing TAs (in alphabetical order):

      • Andrew Mendez is a Deep Learning Solutions Engineer at DeterminedAI, working on computer vision and NLP solutions for defense and autonomous vehicle companies. Previously Andrew worked as an ML Engineer at Clarifai and CACI.
      • Daniel Hen is a Senior Data Scientist at Digital Turbine, working on Ad Tech and mobile solutions, as well as Big Data problems. Working with Spark, ML algorithms such as XGBoost, Computer Vision, and constantly learning new technology.
      • James Le runs Data Relations and Partnerships at Superb AI, a data management platform for computer vision use cases. Outside work, he writes data-centric blog posts, hosts a data-focused podcast, and organizes in-person events for the data community.
      • Saurabh Bipin Chandra is a Senior ML Scientist at Turnitin.
      • Sayak Paul is as a Machine Learning Engineer at Carted on NLP and representation learning from HTML webpages. Besides work, he contributes to various open-source projects.
      • Vishnu Rachakonda is a Data Scientist at firsthand.
      "},{"location":"course/2022/announcement/","title":"Full Stack Deep Learning 2022 Course Announcement","text":"

      Info

      Looking for the latest edition of the course? Click here.

      Want to be among the first to hear about future iterations of the course? Simply enter your email below, follow us on Twitter, or subscribe to our YouTube channel.

      Email Address

      "},{"location":"course/2022/announcement/#what-you-will-learn","title":"What you will learn","text":"

      Our course incorporates feedback from thousands of learners over 5 years of teaching production machine learning.

      We've updated all materials with best practices as of 2022 and added some brand new topics:

      • Formulating the problem and estimating project cost
      • Sourcing, cleaning, processing, labeling, synthesizing, and augmenting data
      • Picking the right framework and compute infrastructure
      • Troubleshooting training and ensuring reproducibility
      • Deploying the model at scale
      • \u2728 Monitoring and continually improving the deployed model \u2728
      • \u2728 How ML teams work and how to manage ML projects \u2728
      • \u2728 Building on Large Language Models and other Foundation Models \u2728

      "},{"location":"course/2022/announcement/#labs","title":"Labs","text":"

      Through the first few weeks of the course, you will construct an end-to-end ML system, with best practices as of 2022.

      • Applying CI/CD principles for clean, fast-moving ML
      • Hypermodern training: PyTorch + Lightning + W&B
      • Hypermodern deployment: Docker + AWS Lambda + Gradio
      "},{"location":"course/2022/announcement/#project","title":"Project","text":"

      Once the labs are done, you'll begin working on your own portfolio project. You'll create a working a working ML-powered application of your choice and share it with your fellow learners, getting feedback from course staff.

      Selected projects will get the opportunity to share their work with the broader FSDL community.

      "},{"location":"course/2022/announcement/#who-is-this-for","title":"Who is this for","text":"

      ML Researchers and Engineers, MS students, software engineers looking to get into ML, data scientists looking to up their software engineering game, and PMs on ML teams will all benefit from materials in our course.

      You will get the most out of this course if you have:

      • At least one year of experience programming in Python.
      • At least one deep learning course (at a university or online).
      • Experience with code versioning, Unix environments, and software engineering.
      • At least four hours a week to commit to learning, split across lectures, Q&A, labs, reading, and project work.

      We review the fundamentals of deep learning (backprop, MLPs, CNNs, Transformers) in supplementary lectures released before the start of the course \u2014 but you should not expect to learn this material for the first time from these.

      "},{"location":"course/2022/announcement/#testimonials","title":"Testimonials","text":""},{"location":"course/2022/announcement/#instructors","title":"Instructors","text":"

      Charles Frye teaches people on the internet. He worked in education and growth at Weights & Biases after getting a PhD in Neuroscience at UC Berkeley. He now works as a consultant, including for Gantry.

      Sergey Karayev is Co-founder of Volition. He co-founded Gradescope after getting a PhD in Computer Vision at UC Berkeley.

      Josh Tobin is Co-founder and CEO of Gantry. He worked as a Research Scientist at OpenAI and received a PhD in AI at UC Berkeley.

      "},{"location":"course/2022/announcement/#registration","title":"Registration","text":"On-Your-Own $0.00 50% discount for students
      • \ud83d\udcf9 \u00a0 Lecture and lab videos Sign Up for Mailing List Cohort-Based $495 50% discount for students and academics
        • \ud83d\udcf9 \u00a0 Lecture and lab videos
        • \ud83d\udde3 \u00a0 Private Discord with instructors
        • \u2753 \u00a0 Weekly live online Q&A
        • \ud83d\udcb2 \u00a0 Compute credits
        • \ud83c\udfa8 \u00a0 Team-based portfolio project with feedback
        • \ud83c\udfc6 \u00a0 Certificate of completion
        • \ud83d\udd1c \u00a0 Starts August 8th, 2022 Registration Closed

          Bummed that you missed registration for the 2022 course?

          To make sure you hear about the next iteration, enter your email below to sign up for our mailing list.

          "},{"location":"course/2022/announcement/#cohort-based-course-faqs","title":"Cohort-Based Course FAQs","text":"

          Read more about the 2022 synchronous cohort here.

          "},{"location":"course/2022/announcement/#how-long-is-the-course-how-much-time-do-i-need-to-devote-to-it","title":"How long is the course? How much time do I need to devote to it?","text":"

          The course is 10 weeks long, from August 8, 2022 to October 14, 2022. During that time, you should expect to spend about four hours a week on the course. In the last four weeks, during which you'll be working on the project, devoting more time will result in a final project that you can be even more proud to put in your portfolio.

          "},{"location":"course/2022/announcement/#which-components-of-the-course-are-synchronous-what-days-and-times-will-synchronous-events-occur","title":"Which components of the course are synchronous? What days and times will synchronous events occur?","text":"

          The Q&A sessions are synchronous. Their timing will be set based on student availability.

          Lectures are pre-recorded, with the Q&A session serving as a live \"flipped classroom\" for discussing the lecture content.

          Labs are available online, with a pre-recorded walkthrough video. We encourage students to form study groups, via the Discord server, for working on the lab material.

          "},{"location":"course/2022/announcement/#how-do-the-projects-work","title":"How do the projects work?","text":"

          Teams of five will work for four weeks to build an ML-powered application of their choosing: a chatbot, an image editor, a natural language speech interface, or anything else you can imagine building with ML.

          At the end, they will present their work to the rest of the class.

          Project proposals and final projects will be reviewed by course staff. We'll provide detailed feedback, but grading will be done based on completion. The best projects will be featured by FSDL on YouTube, Twitter, and at our summits.

          Questions about registration or the class that aren't answered here?

          Tweet at us @full_stack_dl or email admin@fullstackdeeplearning.com.

          "},{"location":"course/2022/cohort/","title":"Synchronous Cohort Option","text":"

          We organized the synchronous cohort for the 2022 course via Discord.

          "},{"location":"course/2022/cohort/#its-project-time-how-do-i-access-my-free-gpus","title":"It's project time! How do I access my free GPUs?","text":"

          Check out the instructions here.

          "},{"location":"course/2022/cohort/#how-do-i-know-if-im-fully-registered","title":"How do I know if I'm fully registered?","text":"

          If you completed your registration for the course, you should be able to access the #course-announcements channel here.

          Please email us at registration@fullstackdeeplearning.com if you registered for the course but cannot access the Discord.

          "},{"location":"course/2022/cohort/#ive-registered-and-im-in-the-discord-how-do-i-follow-along","title":"I've registered and I'm in the Discord. How do I follow along?","text":"

          For a quick list of the most important links for cohort members, see this Discord post. You can add the private course calendar linked there to your own so you never miss an event.

          For a tour of the Discord, plus some tips and tricks, check out the video below:

          "},{"location":"course/2022/cohort/#how-do-i-control-my-notifications-in-discord","title":"How do I control my notifications in Discord?","text":"

          To stay apprised of updates without getting overwhelmed, we suggest these notification settings:

          You can find them here in the Discord web client:

          "},{"location":"course/2022/cohort/#only-course-staff-can-use-the-remaining-notification-stream-mentions","title":"Only course staff can use the remaining notification stream, @mentions.","text":"

          If you turn off @role/@everyone/@here, you won't see announcements about live events and changes of schedule.

          If you want to further reduce distractions, turn off the notification dot.

          You can also mute individual channels, leaving only #course-announcements off mute.

          "},{"location":"course/2022/cohort/#only-instructors-can-post-in-the-course-announcements-channel","title":"Only instructors can post in the #course-announcements channel.","text":"

          If you furthermore hide muted channels, the Discord interface will only show that channel, but you'll miss out on the chance to discover discussions in other channels.

          "},{"location":"course/2022/lab-0-overview/","title":"Lab Overview","text":"

          By Charles Frye. Published July 25, 2022.

          "},{"location":"course/2022/lab-0-overview/#what-are-these-labs-for","title":"What are these labs for?","text":"

          In the lab portion of Full Stack Deep Learning 2022, we will incrementally develop a complete codebase to train a deep neural network to recognize characters in hand-written paragraphs and deploy it inside a simple web application.

          These labs act as an opportunity to work through the nitty-gritty details that come up when implementing some of the recommendations given in the lectures in a concrete system. It's also a chance for you to gain familiarity with some of the tools we recommend in the lectures.

          This lab reviews the overall architecture of the system.

          "},{"location":"course/2022/lab-0-overview/#architecture-of-the-text-recognizer","title":"Architecture of the Text Recognizer","text":"

          Software architectures are inherently about trade-offs: decisions that make for better scaling might make for worse security or tools that encourage faster iteration might reduce transparency.

          We design our architecture with agility and simplicity as the prime directives. We choose simplicity in order to empower individuals to understand the \"full stack\" of the application, from GPUs crunching tensors in model development up to serverless cloud functions acting on requests in production. And we choose agility so that individual is able to quickly iterate on the application, especially in response to user feedback.

          "},{"location":"course/2022/lab-0-overview/#architecture-diagram","title":"Architecture Diagram","text":"

          We put together a handy architecture diagram summarizing the application here:

          For a guided tour of this architecture, watch the video at the top of the page or click the badge below to open an interactive Jupyter notebook on Google Colab:

          "},{"location":"course/2022/lab-0-overview/#running-the-labs","title":"Running the labs","text":""},{"location":"course/2022/lab-0-overview/#one-click-setup-on-colab","title":"One-click setup on Colab","text":"

          To make it as easy as possible to run the labs, we've made them compatible with Google Colab.

          Wherever you see an \"Open in Colab\" badge, like the one below, just click on it and you'll be dropped into a hosted notebook environment for the lab, complete with free GPU. The badge below opens the first main-track lab, Lab 4 on experiment management.

          You can read more here.

          "},{"location":"course/2022/lab-0-overview/#setup-on-your-own-linux-machine","title":"Setup on your own Linux machine","text":"

          If you have a Linux machine with an NVIDIA GPU and drivers, either locally or in the cloud, you can also run the labs there. The video above and text instructions here should be enough to get you going.

          Don't get stuck on setup!

          Remember that Google Colab is always there as an option if you run into issues while setting up.

          Rather than getting frustrated with some obnoxious library linking or driver issue that's irrelevant to the material you are really trying to learn and getting stuck in an installation quagmire, just run the labs on Colab so you can get back to learning about machine learning!

          "},{"location":"course/2022/lab-4-experiment-management/","title":"Lab 4: Experiment Management","text":"

          By Charles Frye. Published August 17, 2022.

          In this lab, we'll work through an entire experiment management workflow for model development, using a tool called Weights & Biases.

          "},{"location":"course/2022/lab-4-experiment-management/#outline","title":"Outline","text":"
          • 00:00 Why do we need experiment management?
          • 02:24 Tracking experiments with TensorBoard
          • 04:16 Experiment management with Weights & Biases
          • 06:48 A guided tour of the W&B run interface
          • 12:12 Exploratory data analysis with W&B Tables
          • 14:00 Project management with W&B
          • 16:27 Artifact versioning with W&B
          • 18:52 Programmatic API access to W&B
          • 20:14 Collaboration tools in W&B
          • 25:00 Hyperparameter sweeps in W&B
          • 28:15 Overview of exercises

          Wait, what happened to labs 1 through 3?

          The first three labs review some pre-requisites for the course -- DNN architectures and the basics of model training.

          You can find them here.

          If you're already basically familiar with training neural networks in any framework, you really only need to review Lab 02a, on using PyTorch Lightning.

          "},{"location":"course/2022/lab-5-troubleshooting-and-testing/","title":"Lab 5: Troubleshooting & Testing","text":"

          By Charles Frye. Published August 24, 2022.

          In this lab, we'll check out the basic tools required to write clean Python code and see how to write memorization tests for training code in PyTorch Lightning. Then we'll take a deep dive into the trace of a PyTorch training step and use it to debug performance issues in GPU-accelerated code.

          "},{"location":"course/2022/lab-5-troubleshooting-and-testing/#outline","title":"Outline","text":"
          • 00:00 Overview
          • 00:51 Linting: pre-commit, black, flake8
          • 05:42 Testing: pytest, doctest, memorization testing
          • 11:15 Troubleshooting PyTorch performance
          • 16:13 A guided tour of a PyTorch trace
          "},{"location":"course/2022/lab-6-data-annotation/","title":"Lab 6: Data Annotation","text":"

          By Charles Frye. Published August 31, 2022.

          In this lab, we'll see how raw data becomes useful data via data annotation and how structured data stored on disk becomes neural network-ready with preprocessing and PyTorch Datasets.

          We'll also spin up a data annotation server using Label Studio.

          "},{"location":"course/2022/lab-6-data-annotation/#outline","title":"Outline","text":"
          • 00:00 Overview
          • 00:36 Loading annotated data and synthesizing data
          • 02:39 Setting up a data annotation server with Label Studio
          • 06:54 Uploading data to Label Studio
          • 09:15 Building and using an annotation interface in Label Studio
          • 13:17 Exercises
          "},{"location":"course/2022/lab-7-web-deployment/","title":"Lab 7: Web Deployment","text":"

          By Charles Frye. Published September 7, 2022.

          In this lab, we'll take the leap from ML model to ML-powered application by packaging our text recognizer into a portable TorchSript binary, wrapping that binary up into a serverless cloud function, and building a simple UI in Python with gradio.

          "},{"location":"course/2022/lab-7-web-deployment/#outline","title":"Outline","text":"
          • 00:00 Overview
          • 01:06 Compiling the model to TorchScript
          • 06:00 Why not deploy on GPUs?
          • 08:58 Building a GUI with gradio
          • 15:34 Spinning up a model service
          • 21:11 Creating a public URL with ngrok
          • 24:52 Writing a Dockerfile for our server
          • 30:06 Recap
          "},{"location":"course/2022/lab-8-model-monitoring/","title":"Lab 8: Model Monitoring","text":"

          By Charles Frye. Published September 14, 2022.

          In this lab, we'll add flagging to our ML-powered application so that users can give us feedback.

          Then, we'll explore some data logged based on feedback from actual users of the FSDL Text Recognizer to the model monitoring and continual learning platform Gantry.

          "},{"location":"course/2022/lab-8-model-monitoring/#outline","title":"Outline","text":"
          • 00:00 Basic user feedback with gradio
          • 04:51 Logging feedback to Gantry
          • 08:34 Checking for model toxicity with Gantry projections
          • 14:23 Detecting model bugs in the Gantry UI with distributions and filters
          • 19:01 Discovering surprising user data in the Gantry UI
          • 29:53 Outro
          "},{"location":"course/2022/labs-1-3-cnns-transformers-pytorch-lightning/","title":"Pre-Labs 1-3: CNNs, Transformers, PyTorch Lightning","text":"

          By Charles Frye. Published August 10, 2022.

          This first set of \"review\" labs covers deep learning fundamentals and introduces two of the core libraries we will use for model training: PyTorch and PyTorch Lightning.

          These labs are optional -- it's possible to get most of the value out of the main set of labs without detailed knowledge of the material here.

          But if you find yourself getting tripped up on PyTorch or Lightning details or on the model architecture, come back and review these labs.

          The videos above give a quick high-level overview of each lab.

          If you need a refresher or a deeper dive on any of the topics or libraries, then work through the notebooks as well. We recommend only completing the exercises for the labs that are of most interest to you.

          "},{"location":"course/2022/labs-1-3-cnns-transformers-pytorch-lightning/#click-the-badges-below-to-access-individual-lab-notebooks-on-colab-and-videos-on-youtube","title":"Click the badges below to access individual lab notebooks on Colab and videos on YouTube","text":"Lab Colab Video Lab 01: Deep Neural Networks in PyTorch Lab 02a: PyTorch Lightning Lab 02b: Training a CNN on Synthetic Handwriting Data Lab 03: Transformers and Paragraphs"},{"location":"course/2022/labs-1-3-cnns-transformers-pytorch-lightning/#running-the-labs","title":"Running the labs","text":""},{"location":"course/2022/labs-1-3-cnns-transformers-pytorch-lightning/#one-click-setup-on-colab","title":"One-click setup on Colab","text":"

          To make it as easy as possible to run the labs, we've made them compatible with Google Colab.

          Wherever you see an \"Open in Colab\" badge, like the one below, just click on it and you'll be dropped into a hosted notebook environment for the lab, complete with free GPU. The badge below opens Lab 01, on PyTorch.

          You can read more here.

          "},{"location":"course/2022/labs-1-3-cnns-transformers-pytorch-lightning/#setup-on-your-own-linux-machine","title":"Setup on your own Linux machine","text":"

          If you have a Linux machine with an NVIDIA GPU and drivers, either locally or in the cloud, you can also run the labs there. The video above and text instructions here should be enough to get you going.

          Don't get stuck on setup!

          Remember that Google Colab is always there as an option if you run into issues while setting up.

          Rather than getting frustrated with some obnoxious library linking or driver issue that's irrelevant to the material you are really trying to learn and getting stuck in an installation quagmire, just run the labs on Colab so you can get back to learning about machine learning!

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/","title":"Lecture 1: Course Vision and When to Use ML","text":"

          Lecture by Josh Tobin. Notes by James Le and Vishnu Rachakonda. Published August 8, 2022. Download slides.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#1-course-vision","title":"1 - Course Vision","text":""},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#history-of-fsdl","title":"History of FSDL","text":"

          Full Stack Deep Learning (FSDL) is the course and community for people who are building products that are powered by machine learning (ML). It's an exciting time to talk about ML-powered products because ML is rapidly becoming a mainstream technology - as you can see in startup funding, job postings, and continued investments of large companies.

          FSDL was originally started in 2018 when the most exciting ML-powered products were built by the biggest companies. However, the broader narrative in the field was that very few companies could get value out of this technology.

          Now in 2022, there's a proliferation of powerful products that are powered by ML. The narrative has shifted as well: There's standardization that has emerged around the tech stack - with transformers and NLP starting to seep their way into more use cases, as well as practices around how to apply ML technologies in the world. One of the biggest changes in the field in the past four years has been the emergence of the term MLOps.

          In addition to the field being more mature and research continuing to progress, a big reason for this rapid change is that the training of models is starting to become commoditized.

          • With tools like HuggingFace, you can deploy a state-of-the-art NLP or CV model in one or two lines of code.

          • AutoML is starting to work for a lot of applications.

          • Companies like OpenAI are starting to provide models as a service where you don't even have to download open-source packages to use them. You can make a network call to get predictions from a state-of-the-art model.

          • Many libraries are starting to standardize around frameworks like Keras and PyTorch Lightning.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#ai-progress","title":"AI Progress","text":"

          The history of ML is characterized by stratospheric rises and meteoric falls of the public perception of the technology. These were driven by a few different AI winters that happened over the history of the field - where the technology didn't live up to its hype. If you project forward a few years, what will happen to ML?

          Source: 5 Things You Should Know About AI (Cambridge Consultants, May 2017)

          Here are the major categories of possible outcomes and our guess about their likelihoods:

          1. A true AI winter, where people become skeptical about AI as a technology. We think this is less likely.

          2. A slightly more likely outcome is that the overall luster of the technology starts to wear off, but specific applications are getting a ton of value out of it.

          3. The upside outcome for the field is that AI continues to accelerate rapidly and becomes pervasive and incredibly effective.

          Our conjecture is that: The way we, as a field, avoid an AI winter is by translating research progress into real-world products. That's how we avoid repeating what has happened in the past.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#ml-powered-products-require-a-different-process","title":"ML-Powered Products Require a Different Process","text":"

          Building ML-powered products requires a fundamentally different process in many ways than developing ML models in an academic setting.

          In academia, you build \"flat-earth\" ML - selecting a problem, collecting data, cleaning and labeling the data, iterating on model development until you have a model that performs well on the dataset collected, evaluating that model, and writing a report at the end.

          But ML-powered products require an outer loop where after you deploy the model into production, you measure how that model performs when it interacts with real users. Then, you use real-world data to improve your model, setting up a data flywheel that enables continual improvement.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#this-course","title":"This Course","text":"

          This class is about the unique aspects you need to know beyond training models to build great ML-powered products. Here are some concrete goals for us:

          1. Teaching you generalist skills and an understanding of the components of ML-powered products (and ML projects more generally).

          2. Teaching you enough MLOps to get things done.

          3. Sharing best practices and explaining the motivation behind them.

          4. Learning things that might help you with job interviews for ML engineering roles.

          5. Forming a community to learn together and from each other.

          We do NOT try to:

          1. Teach you ML or software engineering from scratch.

          2. Cover the whole breadth of deep learning techniques.

          3. Make you an expert in any single aspect of ML.

          4. Do research in deep learning.

          5. Cover the full spectrum of MLOps.

          If you feel rusty on your pre-requisites but want to get started with FSDL, here are our recommendations to get up to speed with the fundamentals:

          • Andrew Ng's Machine Learning Coursera course

          • Google's crash course on Machine Learning

          • MIT's The Missing Semester on software engineering

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#ml-powered-products-vs-mlops","title":"ML-Powered Products vs MLOps","text":"

          MLOps, as a discipline, has emerged in just the last few years. It is about practices for deploying, maintaining, and operating ML systems that generate ML models in production. A lot of MLOps is about:

          • How do we put together an infrastructure that allows us to build models in a repeatable and governable way?

          • How can we run ML systems in a potentially high-scale production setting?

          • How can we collaborate on these systems as a team?

          ML-powered product building is a distinct but overlapping discipline. A lot of what it takes to build a great ML-powered product goes beyond the infrastructure side of ML systems. It focuses on how to fit ML into the context of the product or the application that you're building.

          Other topics in the scope of the ML product discipline include:

          • How do you understand how your users are interacting with your model?

          • How do you build a team or an organization that can work together effectively on ML systems?

          • How do you do product management in the context of ML?

          • What are the best practices for designing products that use ML as part of them?

          This class focuses on teaching you end-to-end what it takes to get a product out in the world that uses ML and will cover aspects of MLOps that are most critical in order to do that.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#chapter-summary","title":"Chapter Summary","text":"
          1. ML-powered products are going mainstream thanks to the democratization of modeling.

          2. However, building great ML-powered products requires a different process from building models.

          3. Full-Stack Deep Learning is here to help!

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#2-when-to-use-machine-learning","title":"2 - When To Use Machine Learning","text":""},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#when-to-use-ml-at-all","title":"When to Use ML At All","text":"

          ML projects have a higher failure rate than software projects in general. One reason that's worth acknowledging is that for many applications, ML is fundamentally still research. Therefore, we shouldn't aim for 100% success.

          Additionally, many ML projects are doomed to fail even before they are undertaken due to a variety of reasons:

          1. They are technically infeasible or poorly scoped.

          2. They never make the leap to a production environment.

          3. The broader organization is not all on the same page about what would be considered success criteria for them.

          4. They solve the problem that you set out to solve but do not solve a big enough problem to be worth their complexity.

          The bar for your ML projects should be that their value must outweigh not just the cost of developing them but also the additional complexity that these ML systems introduce to your software (as introduced in the classic paper \"The High-Interest Credit Card of Technical Debt\").

          In brief, ML systems erode the boundaries between other systems, rely on expensive data dependencies, are commonly plagued by system design anti-patterns, and are subject to the instability of the external world.

          Before starting an ML project, ask yourself:

          1. Are you ready to use ML? More specifically, do you have a product? Are you collecting data and storing it in a sane way? Do you have the right people?

          2. Do you really need ML to solve this problem? More specifically, do you need to solve the problem at all? Have you tried using rules or simple statistics to solve the problem?

          3. Is it ethical to use ML to solve this problem? We have a whole lecture about ethics!

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#how-to-pick-problems-to-solve-with-ml","title":"How to Pick Problems to Solve with ML","text":"

          Just like any other project prioritization, you want to look for use cases that have high impact and low cost:

          1. High-impact problems are likely to be those that address friction in your product, complex parts of your pipeline, places where cheap prediction is valuable, and generally what other people in your industry are doing.

          2. Low-cost projects are those with available data, where bad predictions are not too harmful.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#high-impact-projects","title":"High-Impact Projects","text":"

          Here are some heuristics that you can use to find high-impact ML projects:

          1. Find problems that ML takes from economically infeasible to feasible. A good resource here is the book \"Prediction Machines: The Simple Economics of AI.\" The book's central thesis is that AI reduces the cost of prediction, which is central to decision-making. Therefore, look for projects where making prediction cheaper will have a huge impact.

          2. Think about what your product needs. This article from the ML team at Spotify talks about the three principles for designing Discover Weekly, one of Spotify's most powerful and popular ML-powered features.

          3. Think about the types of problems that ML is particularly good at. One common class of problem that is overlooked is \"Software 2.0\", as coined by Andrej Kaparthy. Essentially, if you have a part of your system that is complex and manually defined, then that's potentially a good candidate to be automated with ML.

          4. Look at what other people in the industry are doing. Generally, you can read papers and blog posts from both Big Tech and top earlier-stage companies.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#low-cost-projects","title":"Low-Cost Projects","text":"

          There are three main drivers for how much a project will cost:

          1. Data availability: How hard is it to acquire data? How expensive is data labeling? How much data will be needed? How stable is the data? What data security requirements do you have?

          2. Accuracy requirement: How costly are wrong predictions? How frequently does the system need to be right to be useful? What are the ethical implications of your model making wrong predictions? It is noteworthy that ML project costs tend to scale super-linearly in the accuracy requirement.

          3. Problem difficulty: Is the problem well-defined enough to be solved with ML? Is there good published work on similar problems? How much compute does it take to solve the problem? Generally, it's hard to reason about what's feasible in ML.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#whats-hard-in-ml","title":"What's Hard in ML?","text":"

          Here are the three types of hard problems:

          1. Output is complex: The model predictions are ambiguous or in a high-dimensional structure.

          2. Reliability is required: ML systems tend to fail in unexpected ways, so anywhere you need high precision or high robustness is going to be more difficult to solve with ML.

          3. Generalization is required: These problems tend to be more in the research domain. They can deal with out-of-distribution data or do tasks such as reasoning, planning, or understanding causality.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#ml-feasibility-assessment","title":"ML Feasibility Assessment","text":"

          This is a quick checklist you can use to assess the feasibility of your ML projects:

          1. Make sure that you actually need ML.

          2. Put in the work upfront to define success criteria with all of the stakeholders.

          3. Consider the ethics of using ML.

          4. Do a literature review.

          5. Try to rapidly build a labeled benchmark dataset.

          6. Build a \"minimum\" viable model using manual rules or simple heuristics.

          7. Answer this question again: \"Are you sure that you need ML at all?\"

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#not-all-ml-projects-should-be-planned-the-same-way","title":"Not All ML Projects Should Be Planned The Same Way","text":"

          Not all ML projects have the same characteristics; therefore, they shouldn't be planned the same way. Understanding different archetypes of ML projects can help select the right approach.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#ml-product-archetypes","title":"ML Product Archetypes","text":"

          The three archetypes offered here are defined by how they interact with real-world use cases:

          1. Software 2.0 use cases: Broadly speaking, this means taking something that software or a product does in an automated fashion today and augmenting its automation with machine learning. An example of this would be improving code completion in the IDE (like Github Copilot).

          2. Human-in-the-loop systems: Machine learning can be applied for tasks where automation is not currently deployed - but where humans could have their judgment or efficiency augmented. Simply put, helping humans do their jobs better by complementing them with ML-based tools. An example of this would be turning sketches into slides, a process will usually involve humans approving the output of a machine learning model that made the slides.

          3. Autonomous systems: Systems that apply machine learning to augment existing or implement new processes without human input. An example of this would be full self-driving, where there is no opportunity for a driver to intervene in the functioning of the car.

          For each archetype, some key considerations inform how you should go about planning projects.

          1. In the case of Software 2.0 projects, you should focus more on understanding how impactful the performance of the new model is. Is the model truly much better? How can the performance continue to increase across iterations?

          2. In the case of human-in-the-loop systems, consider more the context of the human user and what their needs might be. How good does the system actually have to be to improve the life of a human reviewing its output? In some cases, a model that does even 10% better with accuracy (nominally a small increase) might have outsize impacts on human users in the loop.

          3. For autonomous systems, focus heavily on the failure rate and its consequences. When there is no opportunity for human intervention, as is the case with autonomous systems, failures need to be carefully monitored to ensure outsize harm doesn't occur. Self-driving cars are an excellent example of an autonomous system where failure rates are carefully monitored.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#data-flywheels","title":"Data Flywheels","text":"

          As you build a software 2.0 project, strongly consider the concept of the data flywheel. For certain ML projects, as you improve your model, your product will get better and more users will engage with the product, thereby generating more data for the model to get even better. It's a classic virtuous cycle and truly the gold standard for ML projects.

          As you consider implementing data flywheels, remember to know the answer to these three questions:

          1. Do you have a data loop? To build a data flywheel, you crucially need to be able to get labeled data from users in a scalable fashion. This helps increase access to high-quality data and define a data loop.

          2. Can you turn more data into a better model? This somewhat falls onto you as the modeling expert, but it may also not be the case that more data leads to significantly better performance. Make sure you can actually translate data scale into better model performance.

          3. Does better model performance lead to better product use? You need to verify that improvements with models are actually tied to users enjoying the product more and benefiting from it!

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#impact-and-feasibility-of-ml-product-archetypes","title":"Impact and Feasibility of ML Product Archetypes","text":"

          Let's visit our impact vs. feasibility matrix. Our three product archetypes differ across the spectrum.

          This is a pretty intuitive evaluation you can apply to all your ML projects: If it's harder to build (like autonomous systems), it's likely to have a greater impact! There are ways, however, to change this matrix in the context of specific projects.

          1. For Software 2.0, data flywheels can magnify impact by allowing models to get much better and increase customer delight over time.

          2. For human-in-the-loop systems, you can increase feasibility by leveraging good product design. Thoughtful design can help reduce expectations and accuracy requirements. Alternatively, a \"good enough\" mindset that prioritizes incremental delivery over time can make such systems more feasible.

          3. For autonomous systems, leveraging humans in the loop can make development more feasible by adding guardrails and reducing the potential impact of failures.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#just-get-started","title":"Just Get Started!","text":"

          With all this discussion about archetypes and impact matrices, don't forget the most important component of engineering: actually building! Dive in and get started. Start solving problems and iterate on solutions.

          One common area practitioners trip up in is tool fetishization. As MLOps and production ML have flourished, so too has the number of tools and platforms that address various aspects of the ML process. You don't need to be perfect with your tooling before driving value from machine learning. Just because Google and Uber are doing things in a very structured, at-scale way doesn't mean you need to as well!

          In this course, we will primarily focus on how to set things up the right way to do machine learning in production without overcomplicating it. This is an ML products-focused class, not an MLOps class! Check out this talk by Jacopo Tagliabue describing MLOps at Reasonable Scale for a great exposition of this mindset.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#chapter-summary_1","title":"Chapter Summary","text":"
          1. ML adds complexity. Consider whether you really need it.

          2. Make sure what you're working on is high impact, or else it might get killed.

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#3-lifecycle","title":"3 - Lifecycle","text":"

          ML adds complexity to projects and isn't always a value driver. Once you know, however, that it's the right addition to your project, what does the actual lifecycle look like? What steps do we embark upon as we execute?

          In this course, the common running example we use is of a pose estimation problem. We'll use this as a case study to demonstrate the lifecycle and illustrate various points about ML-powered products.

          Here's a graphic that visualizes the lifecycle of ML projects:

          It provides a very helpful structure. Watch from 48:00 to 54:00 to dive deeper into how this lifecycle occurs in the context of a real machine learning problem around pose estimation that Josh worked on at OpenAI.

          Let's comment on some specific nuances:

          • Machine learning projects tend to be very iterative. Each of these phases can feed back into any of the phases that go before it, as you learn more about the problem that you're working on.

            • For example, you might realize that \"Actually, it's way too hard for us to get data in order to solve this problem!\" or \"It's really difficult for us to label the pose of these objects in 3D space\".

            • A solution might actually be to go back a step in the lifecycle and set up the problem differently. For example, what if it were cheaper to annotate per pixel?

            • This could repeat itself multiple times as you progress through a project. It's a normal and expected part of the machine learning product development process.

          • In addition to iteration during execution, there's also cross-project \"platform\" work that matters! Hiring and infrastructure development are crucial to the long-term health of your project.

          • Going through this lifecycle and winning each step is what we'll cover in this class!

          "},{"location":"course/2022/lecture-1-course-vision-and-when-to-use-ml/#lecture-summary","title":"Lecture Summary","text":"

          In summary, here's what we covered in this lecture:

          1. ML is NOT a cure-all. It's a complex technology that needs to be used thoughtfully.

          2. You DON'T need a perfect setup to get going. Start building and iterate!

          3. The lifecycle of machine learning is purposefully iterative and circuitous. We'll learn how to master this process together!

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/","title":"Lecture 2: Development Infrastructure & Tooling","text":"

          Lecture by Sergey Karayev. Notes by James Le and Vishnu Rachakonda. Published August 15, 2022. Download slides.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#1-introduction","title":"1 - Introduction","text":"

          The dream of ML development is that given a project spec and some sample data, you get a continually improving prediction system deployed at scale.

          The reality is starkly different:

          • You have to collect, aggregate, process, clean, label, and version the data.

          • You have to find the model architecture and their pre-trained weights and then write and debug the model code.

          • You run training experiments and review the results, which will be fed back into the process of trying out new architectures and debugging more code.

          • You can now deploy the model.

          • After model deployment, you have to monitor model predictions and close the data flywheel loop. Basically, your users generate fresh data for you, which needs to be added to the training set.

          This reality has roughly three components: data, development, and deployment. The tooling infrastructure landscape for them is large, so we'll have three lectures to cover it all. This lecture focuses on the development component.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#2-software-engineering","title":"2 - Software Engineering","text":""},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#language","title":"Language","text":"

          For your choice of programming language, Python is the clear winner in scientific and data computing because of all the libraries that have been developed. There have been some contenders like Julia and C/C++, but Python has really won out.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#editors","title":"Editors","text":"

          To write Python code, you need an editor. You have many options, such as Vim, Emacs, Jupyter Notebook/Lab, VS Code, PyCharm, etc.

          • We recommend VS Code because of its nice features such as built-in git version control, documentation peeking, remote projects opening, linters and type hints to catch bugs, etc.

          • Many practitioners develop in Jupyter Notebooks, which is great as the \"first draft\" of a data science project. You have to put in little thought before you start coding and seeing the immediate output. However, notebooks have a variety of problems: primitive editor, out-of-order execution artifacts, and challenges to version and test them. A counterpoint to these problems is the nbdev package that lets you write and test code all in one notebook environment.

          • We recommend you use VS Code with built-in support for notebooks - where you can write code in modules imported into notebooks. It also enables awesome debugging.

          If you want to build something more interactive, Streamlit is an excellent choice. It lets you decorate Python code, get interactive applets, and publish them on the web to share with the world.

          For setting up the Python environment, we recommend you see how we did it in the lab.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#3-deep-learning-frameworks","title":"3 - Deep Learning Frameworks","text":"

          Deep learning is not a lot of code with a matrix math library like Numpy. But when you have to deploy your code onto CUDA for GPU-powered deep learning, you want to consider deep learning frameworks as you might be writing weird layer types, optimizers, data interfaces, etc.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#frameworks","title":"Frameworks","text":"

          There are various frameworks, such as PyTorch, TensorFlow, and Jax. They are all similar in that you first define your model by running Python code and then collect an optimized execution graph for different deployment patterns (CPU, GPU, TPU, mobile).

          1. We prefer PyTorch because it is absolutely dominant by measures such as the number of models, the number of papers, and the number of competition winners. For instance, about 77% of 2021 ML competition winners used PyTorch.

          2. With TensorFlow, you have TensorFlow.js (that lets you run deep learning models in your browser) and Keras (an unmatched developer experience for easy model development).

          3. Jax is a meta-framework for deep learning.

          PyTorch has excellent developer experience and is production-ready and even faster with TorchScript. There is a great distributed training ecosystem. There are libraries for vision, audio, etc. There are also mobile deployment targets.

          PyTorch Lightning provides a nice structure for organizing your training code, optimizer code, evaluation code, data loaders, etc. With that structure, you can run your code on any hardware. There are nice features such as performance and bottleneck profiler, model checkpointing, 16-bit precision, and distributed training libraries.

          Another possibility is FastAI software, which is developed alongside the fast.ai course. It provides many advanced tricks such as data augmentations, better initializations, learning rate schedulers, etc. It has a modular structure with low-level API, mid-level API, high-level API, and specific applications. The main problem with FastAI is that its code style is quite different from mainstream Python.

          At FSDL, we prefer PyTorch because of its strong ecosystem, but TensorFlow is still perfectly good. If you have a specific reason to prefer it, you are still going to have a good time.

          Jax is a more recent project from Google that is not specific to deep learning. It provides general vectorization, auto-differentiation, and compilation to GPU/TPU code. For deep learning, there are separate frameworks like Flax and Haiku. You should only use Jax for a specific need.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#meta-frameworks-and-model-zoos","title":"Meta-Frameworks and Model Zoos","text":"

          Most of the time, you will start with at least a model architecture that someone has developed or published. You will use a specific architecture (trained on specific data with pre-trained weights) on a model hub.

          • ONNX is an open standard for saving deep learning models and lets you convert from one type of format to another. It can work well but can also run into some edge cases.

          • HuggingFace has become an absolutely stellar repository of models. It started with NLP tasks but has then expanded into all kinds of tasks (audio classification, image classification, object detection, etc.). There are 60,000 pre-trained models for all these tasks. There is a Transformers library that works with PyTorch, TensorFlow, and Jax. There are 7,500 datasets uploaded by people. There's also a community aspect to it with a Q&A forum.

          • TIMM is a collection of state-of-the-art computer vision models and related code that looks cool.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#4-distributed-training","title":"4 - Distributed Training","text":"

          Let's say we have multiple machines represented by little squares above (with multiple GPUs in each machine). You are sending batches of data to be processed by a model with parameters. The data batch can fit on a single GPU or not. The model parameters can fit on a single GPU or not.

          The best case is that both your data batch and model parameters fit on a single GPU. That's called trivial parallelism. You can either launch more independent experiments on other GPUs/machines or increase the batch size until it no longer fits on one GPU.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#data-parallelism","title":"Data Parallelism","text":"

          If your model still fits on a single GPU, but your data no longer does, you have to try out data parallelism - which lets you distribute a single batch of data across GPUs and average gradients that are computed by the model across GPUs. A lot of model development work is cross-GPU, so you want to ensure that GPUs have fast interconnects.

          If you are using a server card, expect a linear speedup in training time. If you are using a consumer card, expect a sublinear speedup instead.

          Data parallelism is implemented in PyTorch with the robust DistributedDataParallel library. Horovod is another 3rd-party library option. PyTorch Lightning makes it dead simple to use either of these two libraries - where speedup seems to be the same.

          A more advanced scenario is that you can't even fit your model on a single GPU. You have to spread the model over multiple GPUs. There are three solutions to this.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#sharded-data-parallelism","title":"Sharded Data-Parallelism","text":"

          Sharded data parallelism starts with the question: What exactly takes up GPU memory?

          • The model parameters include the floats that make up our model layers.

          • The gradients are needed to do back-propagation.

          • The optimizer states include statistics about the gradients

          • Finally, you have to send a batch of data for model development.

          Sharding is a concept from databases where if you have one source of data, you actually break it into shards of data that live across your distributed system. Microsoft implemented an approach called ZeRO that shards the optimizer states, the gradients, and the model parameters. This results in an insane order of magnitude reduction in memory use, which means your batch size can be 10x bigger. You should watch the video in this article to see how model parameters are passed around GPUs as computation proceeds.

          Sharded data-parallelism is implemented by Microsoft's DeepSpeed library and Facebook's FairScale library, as well as natively by PyTorch. In PyTorch, it's called Fully-Sharded DataParallel. With PyTorch Lightning, you can try it for a massive memory reduction without changing the model code.

          This same ZeRO principle can also be applied to a single GPU. You can train a 13B-parameter model on a single V100 (32GB) GPU. Fairscale implements this (called CPU-offloading).

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#pipelined-model-parallelism","title":"Pipelined Model-Parallelism","text":"

          Model parallelism means that you can put each layer of your model on each GPU. It is trivial to implement natively but results in only one GPU being active at a time. Libraries like DeepSpeed and FairScale make it better by pipelining computation so that the GPUs are fully utilized. You need to tune the amount of pipelining on the batch size to the exact degree of how you will split up the model on the GPU.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#tensor-parallelism","title":"Tensor-Parallelism","text":"

          Tensor parallelism is another approach, which observes that there is nothing special about matrix multiplication that requires the whole matrix to be on one GPU. You can distribute the matrix over multiple GPUs. NVIDIA published the Megatron-LM repo, which does this for the Transformer model.

          You can actually use all of the three techniques mentioned above if you really want to scale a huge model (like a GPT-3 sized language model). Read this article on the technology behind BLOOM training for a taste.

          In conclusion:

          • If your model and data fit on one GPU, that's awesome.

          • If they do not, and you want to speed up training, try DistributedDataParallel.

          • If the model still doesn't fit, try ZeRO-3 or Full-Sharded Data Parallel.

          For more resources to speed up model training, look at this list compiled by DeepSpeed, MosaicML, and FFCV.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#5-compute","title":"5 - Compute","text":"

          Compute is the next essential ingredient to developing machine learning models and products.

          The compute-intensiveness of models has grown tremendously over the last ten years, as the below charts from OpenAI and HuggingFace show.

          Recent developments, including models like GPT-3, have accelerated this trend. These models are extremely large and require a large number of petaflops to train.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#gpus","title":"GPUs","text":"

          To effectively train deep learning models, GPUs are required. NVIDIA has been the superior choice for GPU vendors, though Google has introduced TPUs (Tensor Processing Units) that are effective but are only available via Google Cloud. There are three primary considerations when choosing GPUs:

          1. How much data fits on the GPU?

          2. How fast can the GPU crunch through data? To evaluate this, is your data 16-bit or 32-bit? The latter is more resource intensive.

          3. How fast can you communicate between the CPU and the GPU and between GPUs?

          Looking at recent NVIDIA GPUs, it becomes clear that a new high-performing architecture is introduced every few years. There's a difference between these chips, which are licensed for personal use as opposed to corporate use; businesses should only use server cards.

          Two key factors in evaluating GPUs are RAM and Tensor TFlops. The more RAM, the better the GPU contains large models and datasets. Tensor TFlops are special tensor cores that NVIDIA includes specifically for deep learning operations and can handle more intensive mixed-precision operations. A tip: leveraging 16-bit training can effectively double your RAM capacity!

          While these theoretical benchmarks are useful, how do GPUs perform practically? Lambda Labs offers the best benchmarks here. Their results show that the most recent server-grade NVIDIA GPU (A100) is more than 2.5 times faster than the classic V100 GPU. RTX chips also outperform the V100. AIME is also another source of GPU benchmarks.

          Cloud services such as Microsoft Azure, Google Cloud Platform, and Amazon Web Services are the default place to buy access to GPUs. Startup cloud providers like Paperspace, CoreWeave, and Lambda Labs also offer such services.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#tpus","title":"TPUs","text":"

          Let's briefly discuss TPUs. There are four generations of TPUs, and the most recent v4 is the fastest possible accelerator for deep learning. V4 TPUs are not generally available yet, but TPUs generally excel at scaling to larger and model sizes. The below charts compare TPUs to the fastest A100 NVIDIA chip.

          It can be overwhelming to compare the cost of cloud access to GPUs, so we made a tool that solves this problem! Feel free to contribute to our repository of Cloud GPU cost metrics. The tool has all kinds of nifty features like enabling filters for only the most recent chip models, etc.

          If we combine the cost metrics with performance metrics, we find that the most expensive per hour chips are not the most expensive per experiment! Case in point: running the same Transformers experiment on 4 V100s costs $1750 over 72 hours, whereas the same experiment on 4 A100s costs $250 over only 8 hours. Think carefully about cost and performance based on the model you're trying to train.

          Some helpful heuristics here are:

          1. Use the most expensive per-hour GPU in the least expensive cloud.

          2. Startups (e.g., Paperspace) tend to be cheaper than major cloud providers.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#on-prem-vs-cloud","title":"On-Prem vs. Cloud","text":"

          For on-prem use cases, you can build your own pretty easily or opt for a pre-built computer from a company like NVIDIA. You can build a good, quiet PC with 128 GB RAM and 2 RTX 3090s for about $7000 and set it up in a day. Going beyond this can start to get far more expensive and complicated. Lambda Labs offers a $60,000 machine with 8 A100s (super fast!). Tim Dettmers offers a great (slightly outdated) perspective on building a machine here.

          Some tips on on-prem vs. cloud use:

          • It can be useful to have your own GPU machine to shift your mindset from minimizing cost to maximizing utility.

          • To truly scale-out experiments, you should probably just use the most expensive machines in the least expensive cloud.

          • TPUs are worth experimenting with for large-scale training, given their performance.

          • Lambda Labs is a sponsor, and we highly encourage looking at them for on-prem and cloud GPU use!

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#6-resource-management","title":"6 - Resource Management","text":"

          Now that we've talked about raw compute, let's talk about options for how to manage our compute resources. Let's say we want to manage a set of experiments. Broadly speaking, we'll need hardware in the form of GPUs, software requirements (e.g., PyTorch version), and data to train on.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#solutions","title":"Solutions","text":"

          Leveraging best practices for specifying dependencies (e.g., Poetry, conda, pip-tools) makes the process of spinning up such experiments quick and easy on a single machine.

          If, however, you have a cluster of machines to run experiments on, SLURM is the tried and true solution for workload management that is still widely used.

          For more portability, Docker is a way to package up an entire dependency stack into a lighter-than-a-VM package. Kubernetes is the most popular way to run many Docker containers on top of a cluster. The OSS Kubeflow project helps manage ML projects that rely on Kubernetes.

          These projects are useful, but they may not be the easiest or best choice. They're great if you already have a cluster up and running, but how do you actually set up a cluster or compute platform?

          Before proceeding, FSDL prefers open source and/or transparently priced products. We discuss tools that fall into these categories, not SaaS with opaque pricing.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#tools","title":"Tools","text":"

          For practitioners all in on AWS, AWS Sagemaker offers a convenient end-to-end solution for building machine learning models, from labeling data to deploying models. Sagemaker has a ton of AWS-specific configuration, which can be a turnoff, but it brings a lot of easy-to-use old school algorithms for training and allows you to BYO algorithms as well. They're also increasing support for PyTorch, though the markup for PyTorch is about 15-20% more expensive.

          Anyscale is a company created by the makers of the Berkeley OSS project Ray. Anyscale recently launched Ray Train, which they claim is faster than Sagemaker with a similar value proposition. Anyscale makes it really easy to provision a compute cluster, but it's considerably more expensive than alternatives.

          Grid.ai is created by the PyTorch Lightning creators. Grid allows you to specify what compute parameters to use easily with \"grid run\" followed by the types of compute and options you want. You can use their instances or AWS under the hood. Grid has an uncertain future, as its future compatibility with Lightning (given their rebrand) has not been clarified.

          There are several non-ML options for spinning up compute too! Writing your own scripts, using various libraries, or even Kubernetes are all options. This route is harder.

          Determined.AI is an OSS solution for managing on-prem and cloud clusters. They offer cluster management, distributed training, and more. It's pretty easy to use and is in active development.

          With all this said, there is still room to improve the ease of experience for launching training on many cloud providers.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#7-experiment-and-model-management","title":"7 - Experiment and Model Management","text":"

          In contrast to compute, experiment management is quite close to being solved. Experiment management refers to tools and processes that help us keep track of code, model parameters, and data sets that are iterated on during the model development lifecycle. Such tools are essential to effective model development. There are several solutions here:

          • TensorBoard: A non-exclusive Google solution effective at one-off experiment tracking. It is difficult to manage many experiments.

          • MLflow: A non-exclusive Databricks project that includes model packaging and more, in addition to experiment management. It must be self-hosted.

          • Weights and Biases: An easy-to-use solution that is free for personal and academic projects! Logging starts simply with an \"experiment config\" command.

          • Other options include Neptune AI, Comet ML, and Determined AI, all of which have solid experiment tracking options.

          Many of these platforms also offer intelligent hyperparameter optimization, which allows us to control the cost of searching for the right parameters for a model. For example, Weights and Biases has a product called Sweeps that helps with hyperparameter optimization. It's best to have it as part of your regular ML training tool; there's no need for a dedicated tool.

          "},{"location":"course/2022/lecture-2-development-infrastructure-and-tooling/#8-all-in-one","title":"8 - \"All-In-One\"","text":"

          There are machine learning infrastructure solutions that offer everything--training, experiment tracking, scaling out, deployment, etc. These \"all-in-one\" platforms simplify things but don't come cheap! Examples include Gradient by Paperspace, Domino Data Lab, AWS Sagemaker, etc.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/","title":"Lecture 3: Troubleshooting & Testing","text":"

          Lecture by Charles Frye. Notes by James Le and Vishnu Rachakonda. Published August 22, 2022. Download slides.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#1-testing-software","title":"1 - Testing Software","text":"
          1. The general approach is that tests will help us ship faster with fewer bugs, but they won't catch all of our bugs.

          2. That means we will use testing tools but won't try to achieve 100% coverage.

          3. Similarly, we will use linting tools to improve the development experience but leave escape valves rather than pedantically following our style guides.

          4. Finally, we'll discuss tools for automating these workflows.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#11-tests-help-us-ship-faster-they-dont-catch-all-bugs","title":"1.1 - Tests Help Us Ship Faster. They Don't Catch All Bugs","text":"

          Tests are code we write that are designed to fail intelligibly when our other code has bugs. These tests can help catch some bugs before they are merged into the main product, but they can't catch all bugs. The main reason is that test suites are not certificates of correctness. In some formal systems, tests can be proof of code correctness. But we are writing in Python (a loosely goosey language), so all bets are off in terms of code correctness.

          Nelson Elhage framed test suites more like classifiers. The classification problem is: does this commit have a bug, or is it okay? The classifier output is whether the tests pass or fail. We can then treat test suites as a \"prediction\" of whether there is a bug, which suggests a different way of designing our test suites.

          When designing classifiers, we need to trade off detection and false alarms. If we try to catch all possible bugs, we can inadvertently introduce false alarms. The classic signature of a false alarm is a failed test - followed by a commit that fixes the test rather than the code.

          To avoid introducing too many false alarms, it's useful to ask yourself two questions before adding a test:

          1. Which real bugs will this test catch?

          2. Which false alarms will this test raise?

          If you can think of more examples for the second question than the first one, maybe you should reconsider whether you need this test.

          One caveat is that: in some settings, correctness is important. Examples include medical diagnostics/intervention, self-driving vehicles, and banking/finance. A pattern immediately arises here: If you are operating in a high-stakes situation where errors have consequences for people's lives and livelihoods, even if it's not regulated yet, it might be regulated soon. These are examples of low-feasibility, high-impact ML projects discussed in the first lecture.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#12-use-testing-tools-but-dont-chase-coverage","title":"1.2 - Use Testing Tools, But Don't Chase Coverage","text":"
          • Pytest is the standard tool for testing Python code. It has a Pythonic implementation and powerful features such as creating separate suites, sharing resources across tests, and running parametrized variations of tests.

          • Pure text docs can't be checked for correctness automatically, so they are hard to maintain or trust. Python has a nice module, [doctests], for checking code in the documentation and preventing rot.

          • Notebooks help connect rich media (charts, images, and web pages) with code execution. A cheap and dirty solution to test notebooks is adding some asserts and using nbformat to run the notebooks.

          Once you start adding different types of tests and your codebase grows, you will want coverage tools for recording which code is checked or \"covered\" by tests. Typically, this is done in lines of code, but some tools can be more fine-grained. We recommend Codecov, which generates nice visualizations you can use to drill down and get a high-level overview of the current state of your testing. Codecov helps you understand your tests and can be incorporated into your testing. You can say you want to reject commits not only where tests fail, but also where test coverage goes down below a certain threshold.

          However, we recommend against that. Personal experience, interviews, and published research suggest that only a small fraction of the tests you write will generate most of your value. The right tactic, engineering-wise, is to expand the limited engineering effort we have on the highest-impact tests and ensure that those are super high quality. If you set a coverage target, you will instead write tests in order to meet that coverage target (regardless of their quality). You end up spending more effort to write tests and deal with their low quality.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#13-use-linting-tools-but-leave-escape-valves","title":"1.3 - Use Linting Tools, But Leave Escape Valves","text":"

          Clean code is of uniform and standard style.

          1. Uniform style helps avoid spending engineering time on arguments over style in pull requests and code review. It also helps improve the utility of our version control by cutting down on noisy components of diffs and reducing their size. Both benefits make it easier for humans to visually parse the diffs in our version control system and make it easier to build automation around them.

          2. Standard style makes it easier to accept contributions for an open-source repository and onboard new team members for a closed-source system.

          One aspect of consistent style is consistent code formatting (with things like whitespace). The standard tool for that in Python is [the] [black] Python formatter. It's a very opinionated tool with a fairly narrow scope in terms of style. It focuses on things that can be fully automated and can be nicely integrated into your editor and automated workflows.

          For non-automatable aspects of style (like missing docstrings), we recommend [flake8]. It comes with many extensions and plugins such as docstring completeness, type hinting, security, and common bugs.

          ML codebases often have both Python code and shell scripts in them. Shell scripts are powerful, but they also have a lot of sharp edges. shellcheck knows all the weird behaviors of bash that often cause errors and issues that aren't immediately obvious. It also provides explanations for why it's raising a warning or an error. It's very fast to run and can be easily incorporated into your editor.

          One caveat to this is: pedantic enforcement of style is obnoxious. To avoid frustration with code style and linting, we recommend:

          1. Filtering rules down to the minimal style that achieves the goals we set out (sticking with standards, avoiding arguments, keeping version control history clean, etc.)

          2. Having an \"opt-in\" application of rules and gradually growing coverage over time - which is especially important for existing codebases (which may have thousands of lines of code that we need to be fixed).

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#14-always-be-automating","title":"1.4 - Always Be Automating","text":"

          To make the best use of testing and linting practices, you want to automate these tasks and connect to your cloud version control system (VCS). Connecting to the VCS state reduces friction when trying to reproduce or understand errors. Furthermore, running things outside of developer environments means that you can run tests automatically in parallel to other development work.

          Popular, open-source repositories are the best place to learn about automation best practices. For instance, the PyTorch Github library has tons of automated workflows built into the repo - such as workflows that automatically run on every push and pull.

          The tool that PyTorch uses (and that we recommend) is GitHub Actions, which ties automation directly to VCS. It is powerful, flexible, performant, and easy to use. It gets great documentation, can be used with a YAML file, and is embraced by the open-source community. There are other options such as pre-commit.ci, CircleCI, and Jenkins; but GitHub Actions seems to have won the hearts and minds in the open-source community in the last few years.

          To keep your version control history as clean as possible, you want to be able to run tests and linters locally before committing. We recommend pre-commit to enforce hygiene checks. You can use it to run formatting, linting, etc. on every commit and keep the total runtime to a few seconds. pre-commit is easy to run locally and easy to automate with GitHub Actions.

          Automation to ensure the quality and integrity of our software is a productivity enhancer. That's broader than just CI/CD. Automation helps you avoid context switching, surfaces issues early, is a force multiplier for small teams, and is better documented by default.

          One caveat is that: automation requires really knowing your tools. Knowing Docker well enough to use it is not the same as knowing Docker well enough to automate it. Bad automation, like bad tests, takes more time than it saves. Organizationally, that makes automation a good task for senior engineers who have knowledge of these tools, have ownership over code, and can make these decisions around automation.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#summary","title":"Summary","text":"
          1. Automate tasks with GitHub Actions to reduce friction.

          2. Use the standard Python toolkit for testing and cleaning your projects.

          3. Choose testing and linting practices with the 80/20 principle, shipping velocity, and usability/developer experience in mind.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#2-testing-ml-systems","title":"2 - Testing ML Systems","text":"
          1. Testing ML is hard, but not impossible.

          2. We should stick with the low-hanging fruit to start.

          3. Test your code in production, but don't release bad code.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#21-testing-ml-is-hard-but-not-impossible","title":"2.1 - Testing ML Is Hard, But Not Impossible","text":"

          Software engineering is where many testing practices have been developed. In software engineering, we compile source code into programs. In machine learning, training compiles data into a model. These components are harder to test:

          1. Data is heavier and more inscrutable than source code.

          2. Training is more complex and less well-defined.

          3. Models have worse tools for debugging and inspection than compiled programs.

          In this section, we will focus primarily on \"smoke\" tests. These tests are easy to implement and still effective. They are among the 20% of tests that get us 80% of the value.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#22-use-expectation-testing-on-data","title":"2.2 - Use Expectation Testing on Data","text":"

          We test our data by checking basic properties. We express our expectations about the data, which might be things like there are no nulls in this column or the completion date is after the start date. With expectation testing, you will start small with only a few properties and grow them slowly. You only want to test things that are worth raising alarms and sending notifications to others.

          We recommend [great_expectations] for data testing. It automatically generates documentation and quality reports for your data, in addition to built-in logging and alerting designed for expectation testing. To get started, check out this MadeWithML tutorial on great_expectations.

          To move forward, you want to stay as close to the data as possible:

          1. A common pattern is that there's a benchmark dataset with annotations (in academia) or an external annotation team (in the industry). A lot of the detailed information about that data can be extracted by simply looking at it.

          2. One way for data to get internalized into the organization is that at the start of the project, model developers annotate data ad-hoc (especially if you don't have the budget for an external annotation team).

          3. However, if the model developers at the start of the project move on and more developers get onboarded, that knowledge is diluted. A better solution is an internal annotation team that has a regular information flow with the model developers is a better solution.

          4. The best practice (recommended by Shreya Shankar) is to have a regular on-call rotation where model developers annotate data themselves. Ideally, these are fresh data so that all members of the team who are developing models know about the data and build intuition/expertise in the data.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#23-use-memorization-testing-on-training","title":"2.3 - Use Memorization Testing on Training","text":"

          Memorization is the simplest form of learning. Deep neural networks are very good at memorizing data, so checking whether your model can memorize a very small fraction of the full data set is a great smoke test for training. If a model can't memorize, then something is clearly very wrong!

          Only really gross issues with training will show up with this test. For example, your gradients may not be calculated correctly, you have a numerical issue, or your labels have been shuffled; serious issues like these. Subtle bugs in your model or your data are not going to show up. A way to catch smaller bugs is to include the length of run time in your test coverage. It's a good way to detect if smaller issues are making it harder for your model to learn. If the number of epochs it takes to reach an expected performance suddenly goes up, it may be due to a training bug. PyTorch Lightning has an \"overfit_batches\" feature that can help with this.

          Make sure to tune memorization tests to run quickly, so you can regularly run them. If they are under 10 minutes or some short threshold, they can be run every PR or code change to better catch breaking changes. A couple of ideas for speeding up these tests are below:

          Overall, these ideas lead to memorization tests that implement model training on different time scale and allow you to mock out scenarios.

          A solid, if expensive idea for testing training is to rerun old training jobs with new code. It's not something that can be run frequently, but doing so can yield lessons about what unexpected changes might have happened in your training pipeline. The main drawback is the potential expense of running these tests. CI platforms like CircleCI charge a great deal for GPUs, while others like Github Actions don't offer access to the relevant machines easily.

          The best option for testing training is to regularly run training with new data that's coming in from production. This is still expensive, but it is directly related to improvements in model development, not just testing for breakages. Setting this up requires a data flywheel similar to what we talked about in Lecture 1. Further tooling needed to achieve will be discussed down the line.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#24-adapt-regression-testing-for-models","title":"2.4 - Adapt Regression Testing for Models","text":"

          Models are effectively functions. They have inputs and produce outputs like any other function in code. So, why not test them like functions with regression testing? For specific inputs, we can check to see whether the model consistently returns the same outputs. This is best done with simpler models like classification models. It's harder to maintain such tests with more complex models. However, even in a more complex model scenario, regression testing can be useful for comparing changes from training to production.

          A more sophisticated approach to testing for ML models is to use loss values and model metrics to build documented test suites out of your data. Consider this similar to the test-driven development (TDD) code writing paradigm. The test that is written before your code in TDD is akin to your model's loss performance; both represent the gap between where your code needs to be and where it is. Over time, as we improve the loss metric, our model is getting closer to passing \"the test\" we've imposed on it. The gradient descent we use to improve the model can be considered a TDD approach to machine learning models!

          While gradient descent is somewhat like TDD, it's not exactly the same because simply reviewing metrics doesn't tell us how to resolve model failures (the way traditional software tests do).

          To fill in this gap, start by looking at the data points that have the highest loss. Flag them for a test suite composed of \"hard\" examples. Doing this provides two advantages: it helps find where the model can be improved, and it can also help find errors in the data itself (i.e. poor labels).

          As you examine these failures, you can aggregate types of failures into named suites. For example in a self-driving car use case, you could have a \"night time\" suite and a \"reflection\" suite. Building these test suites can be considered the machine learning version of regression testing, where you take bugs that you've observed in production and add them to your test suite to make sure that they don't come up again.

          The method can be quite manual, but there are some options for speeding it up. Partnering with the annotation team at your company can help make developing these tests a lot faster. Another approach is to use a method called Domino that uses foundation models to find errors. Additionally, for testing NLP models, use the CheckList approach.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#25-test-in-production-but-dont-yolo","title":"2.5 - Test in Production, But Don't YOLO","text":"

          It's crucial to test in true production settings. This is especially true for machine learning models, because data is an important component of both the production and the development environments. It's difficult to ensure that both are very close to one another.

          The best way to solve the training and production difference is to test in production.

          Testing in production isn't sufficient on its own. Rather, testing in production allows us to develop tooling and infrastructure that allows us to resolve production errors quickly (which are often quite expensive). It reduces pressure on other kinds of testing, but does not replace them.

          We will cover in detail the tooling needed for production monitoring and continual learning of ML systems in a future lecture.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#26-ml-test-score","title":"2.6 - ML Test Score","text":"

          So far, we have discussed writing \"smoke\" tests for ML: expectation tests for data, memorization tests for training, and regression tests for models.

          As your code base and team mature, adopt a more full-fledged approach to testing ML systems like the approach identified in the ML Test Score paper. The ML Test Score is a rubric that evolved out of machine learning efforts at Google. It's a strict rubric for ML test quality that covers data, models, training, infrastructure, and production monitoring. It overlaps with, but goes beyond some of the recommendations we've offered.

          It's rather expensive, but worth it for high stakes use cases that need to be really well-engineered! To be really clear, this rubric is really strict. Even our Text Recognizer system we've designed so far misses a few categories. Use the ML Test Score as inspiration to develop the right testing approach that works for your team's resources and needs.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#3-troubleshooting-models","title":"3 - Troubleshooting Models","text":"

          Tests help us figure out something is wrong, but troubleshooting is required to actually fix broken ML systems. Models often require the most troubleshooting, and in this section we'll cover a three step approach to troubleshooting them.

          1. \"Make it run\" by avoiding common errors.

          2. \"Make it fast\" by profiling and removing bottlenecks.

          3. \"Make it right\" by scaling model/data and sticking with proven architectures.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#31-make-it-run","title":"3.1 - Make It Run","text":"

          This is the easiest step for models; only a small portion of bugs cause the kind of loud failures that prevent a model from running at all. Watch out for these bugs in advance and save yourself the trouble of models that don't run.

          The first type of bugs that prevent models from running at all are shape errors. When the shape of the tensors don't match for the operations run on them, models can't be trained or run. Prevent these errors by keeping notes on the expected size of tensors, annotate the sizes in the code, and even step through your model code with a debugger to check tensor size as you go.

          The second type of bugs is out of memory errors. This occurs when you try to push a tensor to a GPU that is too large to fit. PyTorch Lightning has good tools to prevent this. Make sure you're using the lowest precision your training can tolerate; a good default is 16 bit precision. Another common reason for this is trying to run a model on too much data or too large a batch size. Use the autoscale batch size feature in PyTorch Lightning to pick the right size batch. You can use gradient accumulation if these batch sizes get too small. If neither of these options work, you can look into manual techniques like tensor parallelism and gradient checkpoints.

          Numerical errors also cause machine learning failures. This is when NaNs or infinite values show up in tensors. These issues most commonly appear first in the gradient and then cascade through the model. PyTorch Lightning has a good tool for tracking and logging gradient norms. A good tip to check whether these issues are caused by precision issues is to switch to Python 64 bit floats and see if that causes these issues to go away. Normalization layers tend to cause these issues, generally speaking. So watch out for how you do normalization!

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#32-make-it-fast","title":"3.2 - Make It Fast","text":"

          Once you can run a model, you'll want it to run fast. This can be tricky because the performance of DNN training code is very counterintuitive. For example, transformers can actually spend more time in the MLP layer than the attention layer. Similarly, trivial components like loading data can soak up performance.

          To solve these issues, the primary solution is to roll up your sleeves and profile your code. You can often find pretty easy Python changes that yield big results. Read these two tutorials by Charles and Horace for more details.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#33-make-it-right","title":"3.3 - Make It Right","text":"

          After you make it run fast, make the model right. Unlike traditional software, machine learning models never are truly perfect. Production performance is never perfect. As such, it might be more appropriate to say \"make it as right as needed\".

          Knowing this, making the model run and run fast allows us to make the model right through applying scale. To achieve performance benefits, scaling a model or its data are generally fruitful and achievable routes. It's a lot easier to scale a fast model. Research from OpenAI and other institutions is showing that benefits from scale can be rigorously measured and predicted across compute budget, dataset size, and parameter count.

          If you can't afford to scale yourself, consider finetuning a model trained at scale for your task.

          So far, all of the advice given has been model and task-agnostic. Anything more detailed has to be specific to the model and the relevant task. Stick close to working architectures and hyperparameters from places like HuggingFace, and try not to reinvent the wheel!

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#4-resources","title":"4 - Resources","text":"

          Here are some helpful resources that discuss this topic.

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#tweeters","title":"Tweeters","text":"
          1. Julia Evans

          2. Charity Majors

          3. Nelson Elhage

          4. kipply

          5. Horace He

          6. Andrej Karpathy

          7. Chip Huyen

          8. Jeremy Howard

          9. Ross Wightman

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#templates","title":"Templates","text":"
          1. Lightning Hydra Template

          2. NN Template

          3. Generic Deep Learning Project Template

          "},{"location":"course/2022/lecture-3-troubleshooting-and-testing/#texts","title":"Texts","text":"
          1. Reliable ML Systems talk

          2. \"ML Test Score\" paper

          3. \"Attack of the Cosmic Rays!\"

          4. \"Computers can be understood\"

          5. \"Systems that defy detailed understanding\"

          6. Testing section from MadeWithML course on MLOps

          "},{"location":"course/2022/lecture-4-data-management/","title":"Lecture 4: Data Management","text":"

          Lecture by Sergey Karayev. Notes by James Le and Vishnu Rachakonda. Published August 29, 2022. Download slides.

          "},{"location":"course/2022/lecture-4-data-management/#1-introduction","title":"1 - Introduction","text":"

          One thing people don't quite get as they enter the field of ML is how much of it deals with data - putting together datasets, exploring the data, wrangling the data, etc. The key points of this lecture are:

          1. Spend 10x as much time exploring the data as you would like to.

          2. Fixing, adding, and augmenting the data is usually the best way to improve performance.

          3. Keep it all simple!

          "},{"location":"course/2022/lecture-4-data-management/#2-data-sources","title":"2 - Data Sources","text":"

          There are many possibilities for the sources of data. You might have images, text files, logs, or database records. In deep learning, you need to get that data into a local filesystem disk next to a GPU. How you send data from the sources to training is different for each project.

          • With images, you can simply download them from S3.

          • With text files, you need to process them in some distributed way, analyze the data, select a subset, and put that on a local machine.

          • With logs and database records, you can use a data lake to aggregate and process the data.

          The basics will be the same - a filesystem, object storage, and databases.

          "},{"location":"course/2022/lecture-4-data-management/#filesystem","title":"Filesystem","text":"

          The filesystem is a fundamental abstraction. Its fundamental unit is a file - which can be text or binary, is not versioned, and is easily overwritten. The filesystem is usually on a disk connected to your machine - physically connected on-prem, attached in the cloud, or even distributed.

          The first thing to know about discs is that their speed and bandwidth range - from hard discs to solid-state discs. There are two orders of magnitude differences between the slowest (SATA SSD) and the fastest (NVMe SSD) discs. Below are some latency numbers you should know, with the human-scale numbers in parentheses:

          What formats should the data be stored on the local disc?

          • If you work with binary data like images and audio, just use the standard formats like JPEG or MP3 that it comes in.

          • If you work with metadata (like labels), tabular data, or text data, then compressed JSON or text files are just fine. Alternatively, Parquet is a table format that is fast, compact, and widely used.

          "},{"location":"course/2022/lecture-4-data-management/#object-storage","title":"Object Storage","text":"

          The object storage is an API over the filesystem. Its fundamental unit is an object, usually in a binary format (an image, a sound file, a text file, etc.). We can build versioning or redundancy into the object storage service. It is not as fast as the local filesystem, but it can be fast enough within the cloud.

          "},{"location":"course/2022/lecture-4-data-management/#databases","title":"Databases","text":"

          Databases are persistent, fast, and scalable storage and retrieval of structured data systems. A helpful mental model for this is: all the data that the databases hold is actually in the computer\\'s RAM, but the database software ensures that if the computer gets turned off, everything is safely persisted to disk. If too much data is in the RAM, it scales out to disk in a performant way.

          You should not store binary data in the database but the object-store URLs instead. Postgres is the right choice most of the time. It is an open-source database that supports unstructured JSON and queries over that JSON. SQLite is also perfectly good for small projects.

          Most coding projects that deal with collections of objects that reference each other will eventually implement a crappy database. Using a database from the beginning with likely save you time. In fact, most MLOps tools are databases at their core (e.g., W&B is a database of experiments, HuggingFace Hub is a database of models, and Label Studio is a database of labels).

          Data warehouses are stores for online analytical processing (OLAP), as opposed to databases being the data stores for online transaction processing (OLTP). You get data into the data warehouse through a process called ETL (Extract-Transform-Load): Given a number of data sources, you extract the data, transform it into a uniform schema, and load it into the data warehouse. From the warehouse, you can run business intelligence queries. The difference between OLAP and OLTP is that: OLAPs are column-oriented, while OLTPs are row-oriented.

          Data lakes are unstructured aggregations of data from multiple sources. The main difference between them and data warehouses is that data lakes use ELT (Extract-Load-Transform) process: dumping all the data in and transforming them for specific needs later.

          The big trend is unifying both data lake and data warehouse, so that structured data and unstructured data can live together. The two big platforms for this are Snowflake and Databricks. If you are really into this stuff, \"Designing Data-Intensive Applications\" is a great book that walks through it from first principles.

          "},{"location":"course/2022/lecture-4-data-management/#3-data-exploration","title":"3 - Data Exploration","text":"

          To explore the data, you must speak its language, mostly SQL and, increasingly, DataFrame. SQL is the standard interface for structured data, which has existed for decades. Pandas is the main DataFrame in the Python ecosystem that lets you do SQL-like things. Our advice is to become fluent in both to interact with both transactional databases and analytical warehouses and lakes.

          Pandas is the workhorse of Python data science. You can try DASK DataFrame to parallelize Pandas operations over cores and RAPIDS to do Pandas operations on GPUs.

          "},{"location":"course/2022/lecture-4-data-management/#4-data-processing","title":"4 - Data Processing","text":"

          Talking about data processing, it's useful to have a motivational example. Let's say we have to train a photo popularity predictor every night. For each photo, the training data must include:

          1. Metadata (such as posting time, title, and location) that sits in the database.

          2. Some features of the user (such as how many times they logged in today) that are needed to be computed from logs.

          3. Outputs of photo classifiers (such as content and style) that are needed to run the classifiers.

          Our ultimate task is to train the photo predictor model, but we need to output data from the database, compute the logs, and run classifiers to output their predictions. As a result, we have task dependencies. Some tasks can't start until others are finished, so finishing a task should kick off its dependencies.

          Ideally, dependencies are not always files but also programs and databases. We should be able to spread this work over many machines and execute many dependency graphs all at once.

          • Airflow is a standard scheduler for Python, where it's possible to specify the DAG (directed acyclic graph) of tasks using Python code. The operator in that graph can be SQL operations or Python functions.

          • To distribute these jobs, the workflow manager has a queue for the tasks and manages the workers that pull from them. It will restart jobs if they fail and ping you when the jobs are finished.

          • Prefect and Dagster are contenders to improve and replace Airflow in the long run.

          The primary advice here is not to over-engineer things. You can get machines with many CPU cores and a lot of RAM nowadays. For example, UNIX has powerful parallelism, streaming, and highly optimized tools.

          "},{"location":"course/2022/lecture-4-data-management/#5-feature-store","title":"5 - Feature Store","text":"

          Let's say your data processing generates artifacts you need for training. How do you make sure that, in production, the trained model sees the same processing taking place (which happened during training)? How do you avoid recomputation during retraining?

          Feature stores are a solution to this (that you may not need!).

          • The first mention of feature stores came from this Uber blog post describing their ML platform, Michelangelo. They had an offline training process and an online prediction process, so they built an internal feature store for both processes to be in sync.

          • Tecton is the leading SaaS solution to feature store.

          • Feast is a common open-source option.

          • Featureform is a relatively new option.

          "},{"location":"course/2022/lecture-4-data-management/#6-datasets","title":"6 - Datasets","text":"

          What about datasets specifically made for machine learning?

          HuggingFace Datasets is a great source of machine learning-ready data. There are 8000+ datasets covering a wide variety of tasks, like computer vision, NLP, etc. The Github-Code dataset on HuggingFace is a good example of how these datasets are well-suited for ML applications. Github-Code can be streamed, is in the modern Apache Parquet format, and doesn't require you to download 1TB+ of data in order to properly work with it. Another sample dataset is RedCaps, which consists of 12M image-text pairs from Reddit.

          Another interesting dataset solution for machine learning is Activeloop. This tool is particularly well equipped to work with data and explore samples without needing to download it.

          "},{"location":"course/2022/lecture-4-data-management/#7-data-labeling","title":"7 - Data Labeling","text":""},{"location":"course/2022/lecture-4-data-management/#no-labeling-required","title":"No Labeling Required","text":"

          The first thing to talk about when it comes to labeling data is...maybe we don\\'t have to label data? There are a couple of options here we will cover.

          Self-supervised learning is a very important idea that allows you to avoid painstakingly labeling all of your data. You can use parts of your data to label other parts of your data. This is very common in NLP right now. This is further covered in the foundation model lecture. The long and short of it is that models can have elements of their data masked (e.g., the end of a sentence can be omitted), and models can use earlier parts of the data to predict the masked parts (e.g., I can learn from the beginning of the sentence and predict the end). This can even be used across modalities (e.g., computer vision and text), as OpenAI CLIP demonstrates.

          Image data augmentation is an almost compulsory technique to adopt, especially for vision tasks. Frameworks like torchvision help with this. In data augmentation, samples are modified (e.g., brightened) without actually changing their core \"meaning.\" Interestingly, augmentation can actually replace labels. SimCLR is a model that demonstrates this - where its learning objective is to maximize agreement between augmented views of the same image and minimize agreement between different images.

          For other forms of data, there are a couple of augmentation tricks that can be applied. You can delete some cells in tabular data to simulate missing data. In text, there aren't established techniques, but ideas include changing the order of words or deleting words. In speech, you could change the speed, insert pauses, etc.

          Synthetic data is an underrated idea. You can synthesize data based on your knowledge of the label. For example, you can create receipts if your need is to learn how to recognize receipts from images. This can get very sophisticated and deep, so tread carefully.

          You can also get creative and ask your users to label data for you. Google Photos, as any user of the app knows, regularly gets users to label images about where people in photos are the same or different.

          This is an example of the data flywheel. Improving the data allows the user to improve the model, which in turn makes their product experience better.

          "},{"location":"course/2022/lecture-4-data-management/#labeling-solutions","title":"Labeling Solutions","text":"

          These are all great options for avoiding labeling data. However, you'll usually have to label some data to get started.

          Labeling has standard annotation features, like bounding boxes, that help capture information properly. Training annotators properly is more important than the particular kind of annotation. Standardizing how annotators approach a complex, opinable task is crucial. Labeling guidelines can help capture the exact right label from an annotator. Quality assurance is key to ensuring annotation and labeling are happening properly.

          There are a few options for sourcing labor for annotations:

          1. Full-service data labeling vendors offer end-to-end labeling solutions.

          2. You can hire and train annotators yourself.

          3. You can crowdsource annotation on a platform like Mechanical Turk.

          Full-service companies offer a great solution that abstracts the need to build software, manage labor, and perform quality checks. It makes sense to use one. Before settling on one, make sure to dedicate time to vet several. Additionally, label some gold standard data yourself to understand the data yourself and to evaluate contenders. Take calls with several contenders, ask for work samples on your data, and compare them to your own labeling performance.

          • Scale AI is the dominant data labeling solution. It offers an API that allows you to spin up tasks.

          • Additional contenders include Labelbox and Supervisely.

          • LabelStudio is an open-source solution for performing annotation yourself, with a companion enterprise version. It has a great set of features that allow you to design your interface and even plug-in models for active learning!

          • Diffgram is a competitor to Label Studio.

          • Recent offerings, like Aquarium and Scale Nucleus, have started to help concentrate labeling efforts on parts of the dataset that are most troublesome for models.

          • Snorkel is a dataset management and labeling platform that uses weak supervision, which is a similar concept. You can leverage composable rules (e.g., all sentences that have the term \"amazing\" are positive sentiments) that allow you to quickly label data faster than if you were to treat every data point the same.

          In conclusion, try to avoid labeling using techniques like self-supervised learning. If you can't, use labeling software and eventually outsource the work to the right vendor. If you can't afford vendors, consider hiring part-time work rather than crowdsourcing the work to ensure quality.

          "},{"location":"course/2022/lecture-4-data-management/#8-data-versioning","title":"8 - Data Versioning","text":"

          Data versioning comes with a spectrum of approaches:

          1. Level 0 is bad. In this case, data just lives on some file system. In these cases, the issue arises because the models are unversioned since their data is unversioned. Models are part code, part data. This will lead to the consequence of being unable to get back to a previous level of performance if need be.

          2. You can prevent this event with Level 1, where you snapshot your data each time you train. This somewhat works but is far from ideal.

          3. In Level 2, data is versioned like code, as a commingled asset with versioned code. You can use a system like git-lfs that allows you to store large data assets alongside code. This works really well!

          4. Level 3 involves specialized solutions for working with large data files, but this may not be needed unless you have a very specific need (i.e., uniquely large or compliance-heavy files).

          DVC is a great tool for this. DVC helps upload your data asset to a remote storage location every time you commit changes to the data file or trigger a commit; it functions like a fancier git-lfs. It adds features like lineage for data and model artifacts, allowing you to recreate pipelines.

          Several techniques are associated with privacy-controlled data, like federated learning, differential privacy, and learning on encrypted data. These techniques are still in research, so they aren't quite ready for an FSDL recommendation.

          "},{"location":"course/2022/lecture-5-deployment/","title":"Lecture 5: Deployment","text":"

          Lecture by Josh Tobin. Notes by James Le and Vishnu Rachakonda. Published September 5, 2022. Download slides.

          "},{"location":"course/2022/lecture-5-deployment/#introduction","title":"Introduction","text":"

          Deploying models is a critical part of making your models good, to begin with. When you only evaluate the model offline, it's easy to miss the more subtle flaws that the model has, where it doesn't actually solve the problem that your users need it to solve. Oftentimes, when we deploy a model for the first time, only then do we really see whether that model is actually doing a good job or not. Unfortunately, for many data scientists and ML engineers, model deployment is an afterthought relative to other techniques we have covered.

          Much like other parts of the ML lifecycle, we'll focus on deploying a minimum viable model as early as possible, which entails keeping it simple and adding complexity later. Here is the process that this lecture covers:

          • Build a prototype

          • Separate your model and UI

          • Learn the tricks to scale

          • Consider moving your model to the edge when you really need to go fast

          "},{"location":"course/2022/lecture-5-deployment/#1-build-a-prototype-to-interact-with","title":"1 - Build a Prototype To Interact With","text":"

          There are many great tools for building model prototypes. HuggingFace has some tools built into its playground. They have also recently acquired a startup called Gradio, which makes it easy to wrap a small UI around the model. Streamlit is another good option with a bit more flexibility.

          Here are some best practices for prototype deployment:

          1. Have a basic UI: The goal at this stage is to play around with the model and collect feedback from other folks. Gradio and Streamlit are your friends here - often as easy as adding a couple of lines of code to create a simple interface for the model.

          2. Put it behind a web URL: An URL is easier to share. Furthermore, you will start thinking about the tradeoffs you'll be making when dealing with more complex deployment schemes. There are cloud versions of Streamlit and HuggingFace for this.

          3. Do not stress it too much: You should not take more than a day to build a prototype.

          A model prototype won't be your end solution to deploy. Firstly, a prototype has limited frontend flexibility, so eventually, you want to be able to build a fully custom UI for the model. Secondly, a prototype does not scale to many concurrent requests. Once you start having users, you'll hit the scaling limits quickly.

          Above is an abstract diagram of how your application might look. The client is your user's device that interacts with your application. This device can be a browser, a vehicle, or a mobile phone. This device calls over a network to a server. The server talks to a database (where data is stored), used to power the application.

          There are different ways of structuring your application to fit an ML model inside. The prototype approach mentioned in the beginning fits into the model-in-service approach - where your hosted web server has a packaged version of the model sitting inside it. This pattern has pros and cons.

          The biggest pro is that if you are doing something complex, you get to reuse your existing infrastructure. It does not require you as a model developer to set up new things from scratch.

          However, there is a number of pronounced cons:

          1. Your web server may be written in a different language, so getting your model into that language can be difficult.

          2. Models may change more frequently than server code (especially early in the lifecycle of building your model). If you have a well-established application and a nascent model, you do not want to redeploy the entire application every time that you make an update to the model (sometimes multiple updates per day).

          3. If you have a large model to run inference on, you'll have to load that model on your web server. Large models can eat into the resources for your web server. That might affect the user experience for people using that web server, even if they are not interacting with the model.

          4. Server hardware is generally not optimized for ML workloads. In particular, you rarely will have a GPU on these devices.

          5. Your model and application may have different scaling properties, so you might want to be able to scale them differently.

          "},{"location":"course/2022/lecture-5-deployment/#2-separate-your-model-from-your-ui","title":"2 - Separate Your Model From Your UI","text":""},{"location":"course/2022/lecture-5-deployment/#21-batch-prediction","title":"2.1 - Batch Prediction","text":"

          The first pattern to pull your model from your UI is called batch prediction. You get new data in and run your model on each data point. Then, you save the results of each model inference into a database. This can work well under some circumstances. For example, if there are not a lot of potential inputs to the model, you can re-run your model on some frequency (every hour, every day, or every week). You can have reasonably fresh predictions to return to those users that are stored in your database. Examples of these problems include the early stages of building recommender systems and internal-facing tools like marketing automation.

          To run models on a schedule, you can leverage the data processing and workflow tools mentioned in our previous lecture on data management. You need to re-run data processing, load the model, run predictions, and store those predictions in your database. This is exactly a Directed Acyclic Graph workflow of data operations that tools like Dagster, Airflow, or Prefect are designed to solve. It's worth noting that there are also tools like Metaflow that are designed more for ML or data science use cases that might be potentially even an easier way to get started.

          Let's visit the pros and cons of this batch prediction pattern. Starting with the pros:

          1. Batch prediction is simple to implement since it reuses existing batch processing tools that you may already be using for training your model.

          2. It scales very easily because databases have been engineered for decades for such a purpose.

          3. Even though it looks like a simple pattern, it has been used in production by large-scale production systems for years. This is a tried-and-true pattern you can run and be confident that it'll work well.

          4. It is fast to retrieve the prediction since the database is designed for the end application to interact with.

          Switching to the cons:

          1. Batch prediction doesn't scale to complex input types. For instance, if the universe of inputs is too large to enumerate every single time you need to update your predictions, this won't work.

          2. Users won't be getting the most up-to-date predictions from your model. If the feature that goes into your model changes every hour, minute, or subsecond, but you only run your batch prediction job every day, the predictions your users see might be slightly stale.

          3. Models frequently become \"stale.\" If your batch jobs fail for some reason, it can be hard to detect these problems.

          "},{"location":"course/2022/lecture-5-deployment/#22-model-as-service","title":"2.2 - Model-as-Service","text":"

          The second pattern is called model-as-service: we run the model online as its own service. The service is going to interact with the backend or the client itself by making requests to the model service and receiving responses back.

          The pros of this pattern are:

          1. Dependability - model bugs are less likely to crash the web application.

          2. Scalability - you can choose optimal hardware for the model and scale it appropriately.

          3. Flexibility - you can easily reuse a model across multiple applications.

          The cons of this pattern are:

          1. Since this is a separate service, you add a network call when your server or client interacts with the model. That can add latency to your application.

          2. It also adds infrastructural complexity because you are on the hook for hosting and managing a separate service.

          Even with these cons, the model-as-service pattern is still a sweet spot for most ML-powered products since you really need to be able to scale independently of the application in most complex use cases. We'll walk through the basic components of building your model service - including REST APIs, dependency management, performance optimization, horizontal scaling, rollout, and managed options.

          "},{"location":"course/2022/lecture-5-deployment/#rest-apis","title":"REST APIs","text":"

          Rest APIs serve predictions in response to canonically-formatted HTTP requests. There are other alternative protocols to interact with a service that you host on your infrastructures, such as GRPC (used in TensorFlow Serving) and GraphQL (common in web development but not terribly relevant to model services).

          Unfortunately, there is currently no standard for formatting requests and responses for REST API calls.

          1. Google Cloud expects a batch of inputs structured as a list called \"instances\" (with keys and values).

          2. Azure expects a list of things called \"data\", where the data structure itself depends on what your model architecture is.

          3. AWS Sagemaker expects instances that are formatted differently than they are in Google Cloud.

          Our aspiration for the future is to move toward a standard interface for making REST API calls for ML services. Since the types of data that you might send to these services are constrained, we should be able to develop a standard as an industry.

          "},{"location":"course/2022/lecture-5-deployment/#dependency-management","title":"Dependency Management","text":"

          Model predictions depend on code, model weights, and dependencies. In order for your model to make a correct prediction, all of these dependencies need to be present on your web server. Unfortunately, dependencies are a notorious cause of trouble as it is hard to ensure consistency between your development environment and your server. It is also hard to update since even changing a TensorFlow version can change your model.

          At a high level, there are two strategies for managing dependencies:

          1. Constrain the dependencies for your model by saving your model in an agnostic format that can be run anywhere.

          2. Use containers to constrain the entire inference program.

          "},{"location":"course/2022/lecture-5-deployment/#constraining-model-dependencies","title":"Constraining Model Dependencies","text":"

          The primary way to constrain the dependencies of just your model is through a library called ONNX - the Open Neural Network Exchange. The goal of ONNX is to be an interoperability standard for ML models. The promise is that you can define a neural network in any language and run it consistently anywhere. The reality is that since the underlying libraries used to build these models change quickly, there are often bugs in the translation layer, which creates even more problems to solve for you. Additionally, ONNX doesn't deal with non-library code such as feature transformations.

          "},{"location":"course/2022/lecture-5-deployment/#containers","title":"Containers","text":"

          To understand how to manage dependencies with containers, we need to understand the differences between Docker and Virtual Machines, how Docker images are built via Docker files and constructed via layers, the ecosystem around Docker, and specific wrappers around Docker that you can use for ML.

          In a virtual machine, you package up the entire operating system (OS) as well as the libraries and applications that are built on top of that OS. A virtual machine tends to be very heavyweight because the OS itself has a lot of code and is expensive to run. A container such as Docker removes that need by packaging the libraries and applications together. A Docker engine that runs on top of your OS knows how to virtualize the OS and run the libraries/applications.

          By virtue of being lightweight, Docker is used differently than how Virtual Machines were used. A common pattern is to spin up a new Docker container for every discrete task. For example, a web application might have four containers: a web server, a database, a job queue, and a worker. These containers are run together as part of an orchestration system.

          Docker containers are created from Docker files. Each Docker file runs a sequence of steps to define the environment where you will run your code. Docker also allows you to build, store, and pull Docker containers from a Docker Hub that is hosted on some other servers or your cloud. You can experiment with a code environment that is on your local machine but will be identical to the environment you deploy on your server.

          Docker is separated into three different components:

          1. The client is where you'll be running on your laptop to build an image from a Dockerfile that you define locally using some commands.

          2. These commands are executed by a Docker Host, which can run on either your laptop or your server (with more storage or more performance).

          3. That Docker Host talks to a registry - which is where all the containers you might want to access are stored.

          With this separation of concerns, you are not limited by the amount of compute and storage you have on your laptop to build, pull, and run Docker images. You are also not limited by what you have access to on your Docker Host to decide which images to run.

          In fact, there is a powerful ecosystem of Docker images that are available on different public Docker Hubs. You can easily find these images, modify them, and contribute them back to the Hubs. It's easy to store private images in the same place as well. Because of this community and the lightweight nature of Docker, it has become incredibly popular in recent years and is ubiquitous at this point.

          There is a bit of a learning curve to Docker. For ML, there are a few open-source packages designed to simplify this: Cog, BentoML, and Truss. They are built by different model hosting providers that are designed to work well with their model hosting service but also just package your model and all of its dependencies in a standard Docker container format.

          These packages have two primary components: The first one is a standard way of defining your prediction service. The second one is a YAML file that defines the other dependencies and package versions that will go into the Docker container running on your laptop or remotely.

          If you want to have the advantages of using Docker for making your ML models reproducible but do not want to go through the learning curve of learning Docker, it's worth checking out these three libraries.

          "},{"location":"course/2022/lecture-5-deployment/#performance-optimization","title":"Performance Optimization","text":"

          What about performance monitoring?

          In this section, we focus on ways to improve the performance of your models, but we spend less time on how exactly that performance is monitored, which is a challenge in its own right.

          Luckily, one of the student projects for the 2022 cohort, Full Stack Stable Diffusion, took up that challenge and combined NVIDIA's Triton Inference Server, the Prometheus monitoring tool, and the Grafana analytics dashboarding tool to monitor a robust, scalable, and observable deployment of Stable Diffusion models.

          Check out the repo on GitHub here if you want to see a worked example of a fully-monitored DL-powered application.

          To make model inference on your machine more efficient, we need to discuss GPU, concurrency, model distillation, quantization, caching, batching, sharing the GPU, and libraries that automate these tasks for you.

          "},{"location":"course/2022/lecture-5-deployment/#gpu-or-no-gpu","title":"GPU or no GPU?","text":"

          There are some advantages to hosting your model on a GPU:

          1. It's probably the same hardware you train your model on, to begin with. That can eliminate any lost-in-translation issues.

          2. As your model gets big and your techniques get advanced, your traffic gets large. GPUs provide high throughput to deal with that.

          However, GPUs introduce a lot of complexity:

          1. They are more complex to set up.

          2. They are more expensive.

          As a result, just because your model is trained on a GPU does not mean that you need to actually host it on a GPU in order for it to work. In the early version of your model, hosting it on a CPU should suffice. In fact, it's possible to get high throughput from CPU inference at a low cost by using some other techniques.

          "},{"location":"course/2022/lecture-5-deployment/#concurrency","title":"Concurrency","text":"

          With concurrency, multiple copies of the model run in parallel on different CPUs or cores on a single host machine. To do this, you need to be careful about thread tuning. There's a great Roblox presentation on how they scaled BERT to serve a billion daily requests, just using CPUs.

          "},{"location":"course/2022/lecture-5-deployment/#model-distillation","title":"Model Distillation","text":"

          With model distillation, once you have a large model that you've trained, you can train a smaller model that imitates the behavior of your larger one. This entails taking the knowledge that your larger model learned and compressing that knowledge into a much smaller model that you may not have trained to the same degree of performance from scratch. There are several model distillation techniques pointed out in this blog post. They can be finicky to do by yourself and are infrequently used in practice. An exception is distilled versions of popular models (such as DistilBERT).

          "},{"location":"course/2022/lecture-5-deployment/#quantization","title":"Quantization","text":"

          With quantization, you execute some or potentially all of the operations in your model in a lower fidelity representation of the numbers that you are doing the math. These representations can be 16-bit floating point numbers or 8-bit integers. This introduces some tradeoffs with accuracy, but it's worth making these tradeoffs because the accuracy you lose is limited relative to the performance you gain.

          The recommended path is to use built-in quantization methods in PyTorch and TensorFlow. More specifically, HuggingFace Optimum is a good choice if you have already been using HuggingFace's pre-trained models. You can also run quantization-aware training, which often results in higher accuracy.

          "},{"location":"course/2022/lecture-5-deployment/#caching","title":"Caching","text":"

          With caching, you realize that for some ML models, some inputs are more common than others. Instead of always calling the model every time a user makes a request, let's store the common requests in a cache. Then, let's check that cache before running an expensive operation. Caching techniques can get fancy, but the basic way of doing this is to use functools library in Python.

          "},{"location":"course/2022/lecture-5-deployment/#batching","title":"Batching","text":"

          With batching, you take advantage of the fact that ML models often achieve a higher throughput when doing prediction in parallel, especially in a GPU. To accomplish this, you need to gather predictions until you have a batch, run those predictions, and return them to your user. You want to tune the batch size that deals optimally with the latency-throughput tradeoff. You also need to have a way to shortcut the process if latency becomes too long. Batching is complicated to implement, so you probably do not want to implement this yourself.

          "},{"location":"course/2022/lecture-5-deployment/#sharing-the-gpu","title":"Sharing the GPU","text":"

          Your model may not take up all of the GPU memory with your inference batch size. Why don't you run multiple models on the same GPU? This is a place where you want to use a model serving solution that supports GPU sharing out of the box.

          "},{"location":"course/2022/lecture-5-deployment/#libraries","title":"Libraries","text":"

          There are offerings from TensorFlow, PyTorch, and third-party tools from NVIDIA and Anyscale. NVIDIA's choice is probably the most powerful but can be difficult to get started with. Starting with Anyscale's Ray Serve may be an easier way to get started.

          "},{"location":"course/2022/lecture-5-deployment/#horizontal-scaling","title":"Horizontal Scaling","text":"

          If you're going to scale up to a large number of users interacting with your model, it's not going to be enough to get the most efficiency out of one server. At some point, you'll need to scale horizontally to have traffic going to multiple copies of your model running on different servers. This is called horizontal scaling. This technique involves taking traffic that would usually go to a single machine and splits across multiple machines.

          Each machine has a copy of the service, and a tool called a load balancer distributes traffic to each machine. In practice, there are two ways to do this: with either container orchestration (e.g. Kubernetes) or serverless (e.g. AWS Lambda).

          "},{"location":"course/2022/lecture-5-deployment/#container-orchestration","title":"Container Orchestration","text":"

          In container orchestration, we use Kubernetes to help manage containerized applications (in Docker containers, for example) and run them across machines.

          Kubernetes is quite interesting, but it's probably overkilled to learn too much about it if your only goal is to deploy machine learning models. There are a number of frameworks that make it easiest to deploy ML models with Kubernetes, including Kubeflow, Seldon, etc.

          "},{"location":"course/2022/lecture-5-deployment/#serverless","title":"Serverless","text":"

          If Kubernetes isn't the path for you (e.g. you don't want to have to worry about infrastructure at all), serverless is another option for deploying models. In this paradigm, app code and dependencies are packaged into .zip files or Docker containers with a single entry point function, which is a single function (e.g. model.predict()) that will be run repeatedly. This package is then deployed to a service like AWS Lambda, which almost totally manages the infrastructure required to run the code based on the input. Scaling to thousands of requests and across multiple machines is taken care of by these services. In return, you pay for the compute time that you consume.

          Since model services tend to run discretely and not continuously (like a web server), serverless is a great fit for machine learning deployment.

          Start with serverless! It's well worth the time saved in managing infrastructure and dealing with associated challenges. There are still some problems you should be aware of though.

          1. First, the size of the actual deployment package that can be sent to a serverless service tends to be limited, which makes large models impossible to run.

          2. Second, there is also a cold start problem. If there is no traffic being sent to the service in question, the service will \"wind down\" to zero compute use, at which point it takes time to start again. This lag in starting up upon the first request to the serverless service is known as the \"cold start\" time. This can take seconds or even minutes.

          3. Third, it can be hard to actually build solid software engineering concepts, like pipelines, with serverless. Pipelines enable rapid iteration, while serverless offerings often do not have the tools to support rapid, automated changes to code of the kind pipelines are designed to do.

          4. Fourth, state management and deployment tooling are related challenges here.

          5. Finally, most serverless functions are CPU only and have limited execution time. If you need GPUs for inference, serverless might not be for you quite yet. There are, however, new offerings like Banana and Pipeline that are seeking to solve this problem of serverless GPU inference!

          "},{"location":"course/2022/lecture-5-deployment/#model-rollouts","title":"Model Rollouts","text":"

          If serving is how you turn a model into something that can respond to requests, rollouts are how you manage and update these services. To be able to make updates effectively, you should be able to do the following:

          1. Roll out gradually: You may want to incrementally send traffic to a new model rather than the entirety.

          2. Roll back instantly: You may want to immediately pull back a model that is performing poorly.

          3. Split traffic between versions: You may want to test differences between models and therefore send some traffic to each.

          4. Deploy pipelines of models: Finally, you may want to have entire pipeline flows that ensure the delivery of a model.

          Building these capabilities is a reasonably challenging infrastructure problem that is beyond the scope of this course. In short, managed services are a good option for this that we'll now discuss!

          "},{"location":"course/2022/lecture-5-deployment/#managed-options","title":"Managed Options","text":"

          All of the major cloud providers offer their managed service options for model deployment. There are a number of startups offering solutions as well, like BentoML or Banana.

          The most popular managed service is AWS Sagemaker. Working with Sagemaker is easier if your model is already in a common format like a Huggingface class or a SciKit-Learn model. Sagemaker has convenient wrappers for such scenarios. Sagemaker once had a reputation for being a difficult service to work with, but this is much less the case for the clear-cut use case of model inference. Sagemaker, however, does have real drawbacks around ease of use for custom models and around cost. In fact, Sagemaker instances tend to be 50-100% more expensive than EC2.

          "},{"location":"course/2022/lecture-5-deployment/#23-takeaways","title":"2.3 - Takeaways","text":"

          To summarize this section, remember the following:

          1. You probably don't need GPU inference, which is hard to access and maintain. Scaling CPUs horizontally or using serverless can compensate.

          2. Serverless is probably the way to go!

          3. Sagemaker is a great way to get started for the AWS user, but it can get quite expensive.

          4. Don't try to do your own GPU inference; use existing tools like TFServing or Triton to save time.

          5. Watch out for new startups focused on GPU inference.

          "},{"location":"course/2022/lecture-5-deployment/#3-move-to-the-edge","title":"3 - Move to the Edge?","text":"

          Let's now consider the case of moving models out of web service and all the way to the \"edge\", or wholly on-device. Some reasons you may need to consider this include a lack of reliable internet access for users or strict data security requirements.

          If such hard and fast requirements aren't in place, you'll need to take into account the tradeoff between accuracy and latency and how this can affect the end-user experience. Put simply, if you have exhausted all options to reduce model prediction time (a component of latency), consider edge deployment.

          Edge deployment adds considerable complexity, so it should be considered carefully before being selected as an option. In edge prediction, model weights are directly loaded on our client device after being sent via a server (shown above), and the model is loaded and interacted with directly on the device.

          This approach has compelling pros and cons:

          1. Some pros to particularly call out are the latency advantages that come without the need for a network and the ability to scale for \"free,\" or the simple fact that you don't need to worry about the challenges of running a web service if all inference is done locally.

          2. Some specific cons to call out are the often limited hardware and software resources available to run machine learning models on edge, as well as the challenge of updating models since users control this process more than you do as the model author.

          "},{"location":"course/2022/lecture-5-deployment/#31-frameworks","title":"3.1 - Frameworks","text":"

          Picking the right framework to do edge deployment depends both on how you train your model and what the target device you want to deploy it on is.

          • TensorRT: If you're deploying to NVIDIA, this is the choice to go with.

          • MLKit and CoreML: For phone-based deployment on either Android or iPhone, go with MLKit for the former and CoreML for the latter.

          • PyTorch Mobile: For compatibility with both iOS and Android, use PyTorch Mobile.

          • TFLite: A great choice for using TensorFlow in a variety of settings, not just on a phone or a common device.

          • TensorFlow JS: The preferred framework for deploying machine learning in the browser.

          • Apache TVM: A library agnostic, target device agnostic option. This is the choice for anyone trying to deploy to as diverse a number of settings as possible.

          Keep paying attention to this space! There are a lot of startups like MLIR, OctoML, TinyML, and Modular that are aiming to solve some of these problems.

          "},{"location":"course/2022/lecture-5-deployment/#32-efficiency","title":"3.2 - Efficiency","text":"

          No software can help run edge-deployed models that are simply too large; model efficiency is important for edge deployment! We previously discussed quantization and distillation as options for model efficiency. However, there are also network architectures specifically designed to work better in edge settings like MobileNets. MobileNets replace the more expensive computations typical of server-run models with simpler computations and achieve acceptable performance oftentimes.

          MobileNets are a great tool for model deployments and are a great case study in model efficiency. Another similarly great case study is DistillBERT.

          "},{"location":"course/2022/lecture-5-deployment/#33-mindsets","title":"3.3 - Mindsets","text":"

          As we wrap up this lecture, keep in mind the following mindsets as you consider edge deployment:

          1. Start with the edge requirement, not the architecture choice. It's easy to pick a high-performing model architecture, only to then find it impossible to run on the edge device. Avoid this scenario at all costs! Tricks like quantization can account for up to 10x improvement, but not much more.

          2. Once you have a model that works on the edge, you can iterate locally without too much additional re-deployment. In this case, make sure to add metrics around the model size and edge performance to your experiment tracking.

          3. Treat tuning the model as an additional risk and test accordingly. With the immaturity of edge deployment frameworks, it's crucial to be especially careful when testing your model on the exact hardware you'll be deploying on.

          4. Make sure to have fallbacks! Models are finicky and prone to unpredictable behavior. In edge cases, it's especially important to have easily available fallback options for models that aren't working.

          "},{"location":"course/2022/lecture-5-deployment/#34-conclusion","title":"3.4 - Conclusion","text":"

          To summarize this section:

          1. Web deployment is easier, so use edge deployment only if you need to.

          2. Choose your framework to match the available hardware and corresponding mobile frameworks, or try Apache TVM to be more flexible.

          3. Start considering hardware constraints at the beginning of the project and choose architectures accordingly.

          "},{"location":"course/2022/lecture-6-continual-learning/","title":"Lecture 6: Continual Learning","text":"

          Lecture by Josh Tobin. Notes by James Le and Vishnu Rachakonda. Published September 12, 2022. Download slides.

          "},{"location":"course/2022/lecture-6-continual-learning/#1-overview","title":"1 - Overview","text":"

          The core justification for continual learning is that, unlike in academia, we never deal with static data distributions in the real world. The implication is that: if you want to use ML in production and build ML-powered products, you need to think about your goal of building a continual learning system, not just a static model.

          Recalling the data flywheel that we've described in this class before: as you get more users, those users bring more data. You can use the data to make a better model. A better model helps you attract even more users and build a better model over time. Andrej Karpathy described the most optimistic version of it as \"Operation Vacation\" - if we make our continual learning system good enough, it'll get better on its own over time, and ML engineers can just go on vacation.

          The reality is quite different. Initially, we gather, clean, and label some data. We train a model on that data. Then we evaluate the model and loop back to training the model to improve it based on our evaluations. Finally, we get a minimum viable model and deploy it into production.

          The problem begins after we deploy the model: we generally don't have a great way of measuring how our models are actually performing in production. Often, we just spot-check some predictions to see if they are doing what they are supposed to do. If it seems to work, then it's great. We move on to work on other things.

          Unfortunately, the ML engineer is probably not the one who discovers the problems, to begin with. Some business user or product manager gets complaints from users about a dipping metric, which leads to an investigation. This already costs the company money because the product and business teams must investigate the problem.

          Eventually, they point back to the ML engineer and the model he is responsible for. At this point, we are stuck on doing ad-hoc analyses because we don't know what caused the model failure. Eventually, we can run a bunch of SQL queries and paste together some Jupyter notebooks to figure out what the problem is. If we are lucky, we can run an A/B test. If the test looks good, we'll deploy it into production. Then, we are back to where we started - not getting ongoing feedback about how the model is doing in production.

          The upshot is that continual learning is the least well-understood part of the production ML lifecycle. Very few companies are doing this in production today. This lecture focuses on how to improve different steps of the continual learning process, pointers to learn about each step, and recommendations for doing it pragmatically and adopting it gradually.

          "},{"location":"course/2022/lecture-6-continual-learning/#2-how-to-think-about-continual-learning","title":"2 - How to Think About Continual Learning","text":"

          Our opinionated view about continual learning is training a sequence of models that can adapt to a continuous stream of data that comes into production. You can think about continual learning as an outer loop in your training process. On one end of the loop is your application, which consists of a model and some other code that users interact with that application by submitting requests, getting predictions back, and submitting feedback about how well the model did at providing that prediction.

          The continual learning loop starts with logging, which is how we get all the data into the loop. Then we have data curation, triggers for the retraining process, dataset formation to pick the data to retrain on, the training process itself, and offline testing to validate whether the retrained model is good enough to go into production. After the model is deployed, we have online testing, and that brings the next version of the model into production, where we can start the loop all over.

          Each of these stages passes the output to the next step. Output is defined by a set of rules. These rules combine to form our retraining strategy. Let's discuss what the retraining strategy looks like for each stage:

          At the logging stage, the key question answered by the retraining strategy is what data should we store? At the end of this stage, we have an \"infinite stream\" of potentially unlabeled data coming from production and can be used for downstream analysis.

          At the curation stage, the key rules we need to define are what data from that infinite stream will we prioritize for labeling and potential retraining? At the end of this stage, we have a reservoir of candidate training points that have labels and are fully ready to be fed back into a training process.

          At the retraining trigger stage, the key question is when should we retrain? The output of this stage is a signal to kick off a retraining job.

          At the dataset formation stage, the key rules we need to define are from this entire reservoir of data, what specific subset of that data are we using to train on for this particular training job? The output of this stage is a view into that reservoir or training data that specifies the exact data points to be used for the training job.

          At the offline testing stage, the key rule we need to define is what \"good enough\" looks like for all stakeholders. The output of this stage is equivalent to a \"pull request\" report card for your model with a clear sign-off process. Once you are signed off, the new model will roll out into production.

          Finally, at the deployment and online testing stage, the key rule to define is how do we know if this deployment was successful? The output of this stage is a signal to roll this model out fully to all of your users.

          In an idealized world, from an ML engineer's perspective, once the model is deployed, the first version of the model is to not retrain the model directly. Instead, we want the model to sit on top of the retraining strategy and try to improve that strategy over time. Rather than training models daily, we look at metrics about how well the strategy is working and how well it's solving the task of improving our model over time in response to changes in the world. The input that we provide is by tuning the strategy to do a better job of solving that task.

          For most ML engineers, our jobs don't feel like that at a high level. Our retraining strategy is just retraining models whenever we feel like it. We can get good results from ad-hoc retraining, but when you start getting consistent results and no one is actively working on the model day to day anymore, then it's worth starting to add some automation. Alternatively, if you find yourself needing to retrain the model more than once a week (or even more frequently than that) to deal with changing results in the real world, then it's worth investing in automation just to save yourself.

          "},{"location":"course/2022/lecture-6-continual-learning/#3-periodic-retraining","title":"3 - Periodic Retraining","text":"

          The first baseline retraining strategy that you should consider after you move on from ad-hoc is just periodic retraining:

          1. At the logging stage, we simply log everything.

          2. At the curation stage, we sample uniformly at random from the data that we've logged up until we get the maximum number of data points that we are able to handle. Then we label them using some automated tools.

          3. Our retraining trigger will just be periodic.

          4. We train once a week, but we do it on the last month's data, for example.

          5. Then we compute the test set accuracy after each training, set a threshold on that, or more likely manual review the results each time, and spot-check some of the predictions.

          6. When we deploy the model, we do spot evaluations of that deployed model on a few individual predictions to make sure things look healthy.

          Periodic retraining won't work in every circumstance. There are several failure modes:

          1. The first category is that you have more data than you can log or label. If you have a high volume of data, you might need to be more careful about what data to sample and enrich, particularly if that data comes from a long-tail distribution - where you have edge cases that your model needs to perform well on, but those edge cases might not be caught by just doing standard uniform sampling. Or if that data is expensive to label like in a human-in-the-loop scenario - where you need custom labeling rules or labeling is a part of the product. In either of those cases, you need to be more careful about what subset of your data you log and enrich to be used down the road.

          2. The second category has to do with managing the cost of retraining. If your model is expensive to retrain, retraining it periodically is not going to be the most cost-efficient way to go, especially if you do it on a rolling window of data every single time. You will leave a lot of performance on the table by not retraining more frequently. You can partially solve this by increasing the retraining frequency, but this will increase the costs even further.

          3. The final failure mode is situations where you have a high cost of bad predictions. Every time you retrain your model, it introduces risk, which comes from the fact that the data you're training the model on might be bad in some way. It might be corrupted, might have been attacked by an adversary, or might not be representative anymore of all the cases that your model needs to perform well on. The more frequently you retrain and the more sensitive you are to model failures, the more thoughtful you need to be about careful model evaluation such that you are not unduly taking on too much risk from frequent retraining.

          "},{"location":"course/2022/lecture-6-continual-learning/#4-iterating-on-your-retraining-strategy","title":"4 - Iterating On Your Retraining Strategy","text":"

          The main takeaway from this section is that we will use monitoring and observability to determine what changes we want to make to our retraining strategy.

          1. We'll do that by monitoring just the metrics that actually that matter and using all other metrics for debugging.

          2. When we debug an issue with our model, that will lead to potentially retraining our model. But more broadly than that, we can think of it as a change to the retraining strategy - changing our retraining triggers, our offline tests, our sampling strategies, the metrics for observability, etc.

          3. As we get more confident in our monitoring, we can introduce more automation to our system.

          There are no real standards or best practices on model monitoring yet. The main principles we'll follow are: (1) We'll focus on monitoring what matters and what breaks empirically; and (2) We'll compute other signals too but use them for observability and debugging.

          What does it mean to monitor a model in production? We think about it as: You have some metric to assess the model quality (i.e, accuracy) and a time series of how that metric changes over time. The question you try to answer is: Is this bad or okay? Do you need to pay attention to this degradation or not?

          The questions we'll need to answer are:

          1. What metrics should we be looking at when we are monitoring?

          2. How can we tell if those metrics are bad and warrant an intervention?

          3. What are the tools that help us with this process?

          "},{"location":"course/2022/lecture-6-continual-learning/#what-metrics-to-monitor","title":"What Metrics to Monitor","text":"

          Choosing the right metric to monitor is probably the most important part of this process. Below you can find different types of metrics ranked in order of how valuable they are.

          "},{"location":"course/2022/lecture-6-continual-learning/#outcomes-and-feedback-from-users","title":"Outcomes and Feedback From Users","text":"

          The most valuable one to look at is outcome data or feedback from your users. Unfortunately, there are no one-size-fits-all ways to do this because it depends a lot on the specifics of the product you are building. This is more of a product management question of how to design your product in a way that you can capture feedback from your users as part of the product experience.

          "},{"location":"course/2022/lecture-6-continual-learning/#model-performance-metrics","title":"Model Performance Metrics","text":"

          The next most valuable signal to look at is model performance metrics. These are offline metrics such as accuracy. This is less useful than user feedback because of loss mismatch. A common experience many ML practitioners have is that improving model performance leads to the same or worse outcome. There's very little excuse for not doing this. To some degree, you can label some production data each day by setting up an on-call rotation or throwing a labeling party. These practices will give you some sense of how your model performance trends over time.

          "},{"location":"course/2022/lecture-6-continual-learning/#proxy-metrics","title":"Proxy Metrics","text":"

          The next best thing to look at is proxy metrics, which are correlated with bad model performance. These are mostly domain-specific. For example, if you are building text generation with a language model, two examples would be repetitive and toxic outputs. If you are building a recommendation system, an example would be the share of personalized responses. Edge cases can be good proxy metrics. If there are certain problems you know that you have with your model, if those increase in prevalence, that might mean your model is not doing very well.

          There's an academic direction that aims at being able to take any metric you care about and approximate it on previously unseen data. How well do we think our model is doing on this new data? Which would make these proxy metrics a lot more practically useful? There are a number of different approaches here: from training an auxiliary model to predict how well your main model might do on this offline data, to using heuristics and human-in-the-loop methods.

          An unfortunate result from this literature is that it's not possible to have a single method you use in all circumstances to approximate how your model is doing on out-of-distribution data. Let's say you are looking at the input data to predict how the model will perform on those input points. Then the label distribution changes. As a result, you won't be able to take into account that change in your approximate metric.

          "},{"location":"course/2022/lecture-6-continual-learning/#data-quality","title":"Data Quality","text":"

          The next signal to look at is data quality. Data quality testing is a set of rules you apply to measure the quality of your data. This deals with questions such as: How well does a piece of information reflect reality? Does it fulfill your expectations of what's comprehensive? Is your information available when you need it? Some common examples include checking whether the data has the right schema, the data is in the expected range, and the number of records is not anomalous.

          This is useful because data problems tend to be the most common issue with ML models in practice. In a Google report which covered 15 years of different pipeline outages with a particular ML model, most of the outages that happened with that model were distributed systems problems, commonly data problems.

          "},{"location":"course/2022/lecture-6-continual-learning/#distribution-drift","title":"Distribution Drift","text":""},{"location":"course/2022/lecture-6-continual-learning/#why-measure-distribution-drift","title":"Why Measure Distribution Drift?","text":"

          Your model's performance is only guaranteed on data sampled from the same distribution as it was trained on. This can have a huge impact in practice. A recent example includes changes in model behavior during the pandemic. A bug in the retraining pipeline caused the recommendations not to be updated for new users, leading to millions of dollars in revenue lost.

          "},{"location":"course/2022/lecture-6-continual-learning/#types-of-distribution-drift","title":"Types of Distribution Drift","text":"

          Distribution drift manifests itself in different ways in the wild:

          1. Instantaneous drift happens when a model is deployed in a new domain, a bug is introduced in the pre-processing pipeline, or a big external shift like COVID occurs.

          2. Gradual drift happens when users\\' preferences change or new concepts get introduced to the corpus over time.

          3. Periodic drift happens when users' preferences are seasonal or people in different time zones use your model differently.

          4. Temporary drift happens when a malicious user attacks your model, a new user tries your product and churns, or someone uses your product in an unintended way.

          "},{"location":"course/2022/lecture-6-continual-learning/#how-to-measure-it","title":"How to Measure It?","text":"

          How to tell if your distribution is drifted?

          1. Your first select a window of \"good\" data to serve as a reference. To select that reference, you can use a fixed window of production data you believe to be healthy. Some papers advocate for using a sliding window of production data. In practice, most of the time you probably should use your validation data as the reference.

          2. Once you have that reference data, you select a new window of production data to measure your distribution distance on. This is not a super principled approach and tends to be problem-dependent. A pragmatic solution is to pick one or several window sizes with a reasonable amount of data and slide them.

          3. Finally, once you have your reference window and production window, you compare the windows using a distribution distance metric.

          "},{"location":"course/2022/lecture-6-continual-learning/#what-metrics-to-use","title":"What Metrics To Use?","text":"

          Let's start by considering the one-dimensional case, where you have a particular feature that is one-dimensional and can compute a density of that feature on your reference/production windows. You want some metric that approximates the distance between these two distributions.

          There are a few options here:

          1. The commonly recommended ones are the KL divergence and the KS test. But they are actually bad choices.

          2. Sometimes-better options would be (1) infinity norm or 1-norm of the diff between probabilities for each category, and (2) Earth-mover's distance (a more statistically principled approach).

          Check out this Gantry blog post to learn more about why the commonly recommended metrics are not so good and the other ones are better.

          "},{"location":"course/2022/lecture-6-continual-learning/#dealing-with-high-dimensional-data","title":"Dealing with High-Dimensional Data","text":"

          In the real world for most models, we have potentially many input features or even unstructured data that is very high-dimensional. How do we deal with detecting distribution drift in those cases?

          1. You can measure drift on all of the features independently: If you have a lot of features, you will hit the multiple hypothesis testing problem. Furthermore, this doesn't capture cross-correlation.

          2. You can measure drift on only the important features: Generally speaking, it's a lot more useful to measure drift on the outputs of the model than the inputs. You can also rank the importance of your input features and measure drift on the most important ones.

          3. You can look at metrics that natively compute or approximate the distribution distance between high-dimensional distributions: The two that are worth checking out are maximum mean discrepancy and approximate Earth-mover's distance. The caveat here is that they are pretty hard to interpret.

          A more principled way to measure distribution drift for high-dimensional inputs to the model is to use projections. The idea of a projection is that:

          1. You first take some high-dimensional input to the model and run that through a function.

          2. Each data point your model makes a prediction on gets tagged by this projection function. The goal of this projection function is to reduce the dimensionality of that input.

          3. Once you've reduced the dimensionality, you can do drift detection on that lower-dimensional representation of the high-dimensional data.

          This approach works for any kind of data, no matter what the dimensionality is or what the data type is. It's also highly flexible. There are different types of projections that can be useful: analytical projections (e.g., mean pixel value, length of sentence, or any other function), random projections (e.g., linear), and statistical projections (e.g., autoencoder or other density models, T-SNE).

          "},{"location":"course/2022/lecture-6-continual-learning/#cons-of-looking-at-distribution-drift","title":"Cons of Looking at Distribution Drift","text":"

          Models are designed to be robust to some degree of distribution drift. The figure on the left above shows a toy example to demonstrate this point. We have a classifier that's trained to predict two classes. We've induced a synthetic distribution shift to shift the red points on the top left to bottom. These two distributions are extremely different, but the model performs equally well on the training data and the production data. In other words, knowing the distribution shift doesn't tell you how the model has reacted to that shift.

          The figure on the right is a research project that used data generated from a physics simulator to solve problems on real-world robots. The training data was highly out of distribution (low-fidelity, random images). However, by training on this set of training data, the model was able to generalize to real-world scenarios on the test data.

          Beyond the theoretical limitations of measuring distribution drift, this is just hard to do in practice. You have to window size correctly. You have to keep all this data around. You need to choose metrics. You need to define projections to make your data lower-dimensional.

          "},{"location":"course/2022/lecture-6-continual-learning/#system-metrics","title":"System Metrics","text":"

          The last thing to consider looking at is your standard system metrics such as CPU utilization, GPU memory usage, etc. These don't tell you anything about how your model is actually performing, but they can tell you when something is going wrong.

          "},{"location":"course/2022/lecture-6-continual-learning/#practical-recommendations","title":"Practical Recommendations","text":"

          We also want to look at how hard it is to compute the aforementioned stages in practice. As seen below, the Y-axis shows the value of each signal and the X-axis shows the feasibility of measuring each signal.

          1. Measuring outcomes or feedback has pretty wide variability in terms of how feasible it is to do, as it depends on how your product is set up.

          2. Measuring model performance tends to be the least feasible thing to do because it involves collecting some labels.

          3. Proxy metrics are easier to compute because they don't involve labels.

          4. System metrics and data quality metrics are highly feasible because you have off-the-shelf tools for them.

          Here are our practical recommendations:

          1. Basic data quality checks are zero-regret, especially if you are retraining your model.

          2. Get some way to measure feedback, model performance, or proxy metrics, even if it's hacky or not scalable.

          3. If your model produces low-dimensional outputs, monitoring those for distribution shifts is also a good idea.

          4. As you evolve your system, practice the observability mindset.

          While you can think of monitoring as measuring the known unknowns (e.g., setting alerts on a few key metrics), observability is measuring unknown unknowns (e.g., having the power to ask arbitrary questions about your system when it breaks). An observability mindset means two implications:

          1. You should keep around the context or raw data that makes up the metrics that you are computing since you want to be able to drill all the way down to potentially the data points themselves that make up the degraded metric.

          2. You can go crazy with measurement by defining a lot of different metrics. You shouldn't necessarily set alerts on each of those since you don't want too many alerts. Drift is a great example since it is useful for debugging but less so for monitoring.

          Finally, it's important to go beyond aggregate metrics. If your model is 99% accurate in aggregate but only 50% accurate for your most important user, is it still \"good\"? The way to deal with this is by flagging important subgroups or cohorts of data and alerting on important metrics across them. Some examples are categories you don't want to be biased against, \"important\" categories of users, and categories you might expect to perform differently on (languages, regions, etc.).

          "},{"location":"course/2022/lecture-6-continual-learning/#how-to-tell-if-those-metrics-are-bad","title":"How To Tell If Those Metrics are \"Bad\"","text":"

          We don't recommend statistical tests (e.g., KS-Test) because they try to return a p-value for the likelihood that the data distributions are not the same. When you have a lot of data, you will get very small p-values for small shifts. This is not what we actually care about since models are robust to a small number of distribution shifts.

          Better options than statistical tests include fixed rules, specific ranges, predicted ranges, and unsupervised detection of new patterns. This article on dynamic data testing has the details.

          "},{"location":"course/2022/lecture-6-continual-learning/#tools-for-monitoring","title":"Tools for Monitoring","text":"

          The first category is system monitoring tools, a premature category with different companies in it (Datadog, Honeycomb, New Relic, Amazon CloudWatch, etc.). They help you detect problems with any software system, not just ML models. They provide functionality for setting alarms when things go wrong. Most cloud providers have decent monitoring solutions, but if you want something better, you can look at monitoring-specific tools to monitor anything.

          This raises the question of whether we should just use these system monitoring tools to monitor ML metrics as well. This blog post explains that it's feasible but highly painful due to many technical reasons. Thus, it's better to use ML-specific tools.

          Two popular open-source monitoring tools are EvidentlyAI and whylogs.

          • Both are similar in that you provide them with samples of data and they produce a nice report that tells you where their distribution shifts are.

          • The big limitation of both is that they don't solve the data infrastructure and the scale problem. You still need to be able to get all that data into a place where you can analyze it with these tools.

          • The main difference between them is that whylogs is more focused on gathering data from the edge by aggregating the data into statistical profiles at inference time. You don't need to transport all the data from your inference devices back to your cloud.

          Lastly, there are a bunch of different SaaS vendors for ML monitoring and observability: Gantry, Aporia, Superwise, Arize, Fiddler, Arthur, etc.

          "},{"location":"course/2022/lecture-6-continual-learning/#5-retraining-strategy","title":"5 - Retraining Strategy","text":"

          We\u2019ve talked about monitoring and observability, which allow you to identify issues with your continual learning system. Now, we\u2019ll talk about how we will fix the various stages of the continual learning process based on what we learn from monitoring and observability.

          "},{"location":"course/2022/lecture-6-continual-learning/#logging","title":"Logging","text":"

          The first stage of the continual learning loop is logging. As a reminder, the goal of logging is to get data from your model to a place where you can analyze it. The key question to answer here is: \u201cwhat data should I actually log?\u201d

          For most of us, the best answer is just to log all of the data. Storage is cheap. It's better to have data than not to have it. There are, however, some situations where you can't do that. For example, if you have too much traffic going through your model to the point where it's too expensive to log all of it, or if you have data privacy concerns, or if you're running your model at the edge, you simply may not be able to log all your data.

          In these situations, there are two approaches that you can take. The first approach is profiling. With profiling, rather than sending all the data back to your cloud and then using that to monitor, you instead compute statistical profiles of your data on the edge that describe the data distribution that you're seeing. This is great from a data security perspective because it doesn't require you to send all the data back home. It minimizes your storage cost. Finally, you don't miss things that happen in the tails, which is an issue for the next approach. That'll describe the place to use. This approach is best used for security-critical applications. Computing statistical profiles is a pretty interesting topic in computer science and data summarization that is worth checking out if you\u2019re interested in this approach.

          The other approach is sampling. With sampling, you'll just take certain data points and send those back to your monitoring and logging system. The advantage of sampling is that it has minimal impact on your inference resources. You don't have to actually spend the computational budget to compute profiles. You also get to have access to the raw data for debugging and retraining, albeit a smaller amount. This is the approach we recommend for any other kind of application.

          "},{"location":"course/2022/lecture-6-continual-learning/#curation","title":"Curation","text":"

          The next step in the continual learning loop is curation. The goal of curation is to take the infinite stream of production data, which is potentially unlabeled, and turn it into a finite reservoir of enriched data suitable for training. Here, we must answer, \u201cwhat data should be enriched?\u201d

          You could sample and enrich data randomly, but that may not prove helpful to your model. Importantly, you miss rare classes or events. A better approach can be to perform stratified subsampling, wherein you sample specific proportions of individuals from various subpopulations (e.g. race). The most advanced strategy for picking data to enrich is to curate data points that are somehow interesting for the purpose of improving your model.

          There are a few different ways of doing this: user-driven curation loops via feedback loops, manual curation via error analysis, and automatic curation via active learning.

          User-driven curation is a great approach that is easy to implement, assuming you have a clear way of gathering user feedback. If your user churns, clicks thumbs down, or performs some other similar activity on the model\u2019s output, you have an easy way of understanding data that could be enriched for future training jobs.

          If you don't have user feedback, or if you need even more ways of gathering interesting data from your system, the second most effective way is by doing manual error analysis. In this approach, we look at the errors that our model is making, reason about the different types of failure modes that we're seeing, and try to write functions or rules that help capture these error modes. We'll use those functions to gather more data that might represent those error cases. Some examples of these function-based approaches are similarity-based curation, which uses nearest neighbors, and projection-based curation, wherein we train a new function or model to recognize key data points.

          The last way to curate data is to do so automatically using a class of algorithms called active learning. The way active learning works is that, given a large amount of unlabeled data, we will try to determine which data points would improve model performance the most (if you were to label those data points next and train on them). These algorithms define a sampling strategy, rank all of your unlabeled examples using a scoring function that defines the sampling strategy, and mark the data points with the highest scores for future labeling.

          There are a number of different scoring function approaches that are shown below.

          1. Most uncertain: sample low-confidence and high-entropy predictions or predictions that an ensemble disagrees on.
          2. Highest predicted loss: train a separate model that predicts loss on unlabeled points, then sample the highest predicted loss.
          3. Most different from labels: train a model to distinguish labeled and unlabeled data, then sample the easiest to distinguish.
          4. Most representative: choose points such that no data is too far away from anything we sampled.
          5. Big impact on training: choose points such that the expected gradient is large or points where the model changes its mind the most about its prediction during training.

          Uncertainty scoring tends to be the most commonly used method since it is simple and easy to implement.

          You might have noticed that there's a lot of similarity between some of the ways that we do data curation and the way that we do monitoring. That's no coincidence--monitoring and data curation are two sides of the same coin! They're both interested in solving the problem of finding data points where the model may not be performing well or where we're uncertain about how the model is performing on those data points.

          Some examples of people practically applying data curation are OpenAI\u2019s DALL-E 2, which uses active learning and manual curation, Tesla, which uses feedback loops and manual curation, and Cruise, which uses feedback loops.

          Some tools that help with data curation are Scale Nucleus, Aquarium, and Gantry.

          To summarize then, here are our final set of recommendations for applying data curation.

          1. Random sampling is a fine starting point. If you want to avoid bias or have rare classes, do stratified sampling instead.
          2. If you have a feedback loop, then user-driven curation is a no-brainer. If not, confidence-based active learning is easy to implement.
          3. As your model performance increases, you\u2019ll have to look harder for challenging training points. Manual techniques are unavoidable and should be embraced. Know your data!
          "},{"location":"course/2022/lecture-6-continual-learning/#retraining-triggers","title":"Retraining Triggers","text":"

          After we've curated our infinite stream of unlabeled data down to a reservoir of labeled data that's ready to potentially train on, the next thing that we'll need to decide is \u201cwhat trigger are we gonna use to retrain?\u201d

          The main takeaway here is that moving to automated retraining is not always necessary. In many cases, just manually retraining is good enough. It can save you time and lead to better model performance. It's worth understanding when it makes sense to actually make the harder move to automated retraining.

          The main prerequisite for moving to automated retraining is being able to reproduce model performance when retraining in a fairly automated fashion. If you're able to do that and you are not really working on the model actively, it's probably worth implementing some automated retraining. As a rule of thumb, if you\u2019re retraining the model more than once a month, automated retraining may make sense.

          When it's time to move to automated training, the main recommendation is to just keep it simple and retrain periodically, e.g. once a week. The main question though is, how do you pick the right training schedule? The recommendation here is to:

          1. Apply measurement to figure out a reasonable retraining schedule.
          2. Plot your model performance and degradation over time.
          3. Compare how retraining the model at various intervals would have resulted in improvements to its performance.

          As seen below, the area between the curves represents the opportunity cost, so always remember to balance the upside of retraining with the operational costs of retraining.

          This is a great area for future academic research! More specifically, we can look at ways to automate determining the optimal retraining strategy based on performance decay, sensitivity to performance, operational costs, and retraining costs.

          An additional option for retraining, rather than time-based intervals, is performance triggers (e.g. retrain when the model accuracy dips below 90%). This helps react more quickly to unexpected changes and is more cost-optimal, but requires very good instrumentation to process these signals along with operational complexity.

          An idea that probably won't be relevant but is worth thinking about is online learning. In this paradigm, you train on every single data point as it comes in. It's not very commonly used in practice.

          A version of this idea that is used fairly frequently in practice is online adaptation. This method operates not at the level of retraining the whole model itself but rather on the level of adapting the policy that sits on top of the model. What is a policy you ask? A policy is the set of rules that takes the raw prediction that the model made, like the score or the raw output of the model, and turns it into the output the user sees. In online adaptation, we use algorithms like multi-armed bandits to tune these policies. If your data changes very frequently, it is worth looking into this method.

          "},{"location":"course/2022/lecture-6-continual-learning/#dataset-formation","title":"Dataset Formation","text":"

          Imagine we've fired off a trigger to start a new training job. The next question we need to answer is, among all of the labeled data in our reservoir of data, what specific data points should we train on for this particular new training job?

          We have four options here. Most of the time in deep learning, we'll just use the first option and train on all the data that we have available to us. Remember to keep your data version controlled and your curation rules consistent.

          If you have too much data to do that, you can use recency as a heuristic for a second option and train on only a sliding window of the most recent data (if recency is important) or sample a smaller portion (if recency isn\u2019t). In the latter case, compare the aggregate statistics between the old and new windows to ensure there aren\u2019t any bugs. It\u2019s also important in both cases to compare the old and new datasets as they may not be related in straightforward ways.

          A useful third option is online batch selection, which can be used when recency doesn\u2019t quite matter. In this method, we leverage label-aware selection functions to choose which items in mini-batches to train on.

          A more difficult fourth option that isn\u2019t quite recommended is continual fine-tuning. Rather than retraining from scratch every single time, you train your existing model on just new data. The reason why you might wanna do this primarily is because it's much more cost-effective. The paper below shares some findings from GrubHub, where they found a 45x cost improvement by doing this technique relative to sliding windows.

          The big challenge here is that unless you're very careful, it's easy for the model to forget what it learned in the past. The upshot is that you need to have mature evaluation practices to be very careful that your model is performing well on all the types of data that it needs to perform well on.

          "},{"location":"course/2022/lecture-6-continual-learning/#offline-testing","title":"Offline Testing","text":"

          After the previous steps, we now have a new candidate model that we think is ready to go into production. The next step is to test that model. The goal of this stage is to produce a report that our team can sign off on that answers the question of whether this new model is good enough or whether it's better than the old model. The key question here is, \u201cwhat should go into that report?\u201d

          This is a place where there's not a whole lot of standardization, but the recommendation we have here is to compare your current model with the previous version of the model on all of the metrics that you care about, all of the subsets of data that you've flagged are important, and all the edge cases you\u2019ve defined. Remember to adjust the comparison to account for any sampling bias.

          Below is a sample comparison report. Note how the validation set is broken out into concrete subgroups. Note also how there are specific validation sets assigned to common error cases.

          In continual learning, evaluation sets are dynamically refined just as much as training sets are. Here are some guidelines for how to manage evaluation sets in a continual learning system:

          1. As you curate new data, add some of it to your evaluation sets. For example, if you change how you do sampling, add that newly sampled data to your evaluation set. Or if you encounter a new edge case, create a test case for it.
          2. Corollary 1: you should version control your evaluation sets as well.
          3. Corollary 2: if your data changes quickly, always hold out the most recent data for evaluation.

          Once you have the testing basics in place, a more advanced option that you can look into here is expectation testing. Expectation tests work by taking pairs of examples where you know the relationship between the two. These tests help a lot with understanding the generalizability of models.

          Just like how data curation is highly analogous to monitoring, so is offline testing. We want to observe our metrics, not just in aggregate but also across all of our important subsets of data and across all of our edge cases. One difference between these two is that you will have different metrics available in offline testing and online testing. For example, you\u2019re much more likely to have labels offline. Online, you\u2019re much more likely to have feedback. We look forward to more research that can predict online metrics from offline ones.

          "},{"location":"course/2022/lecture-6-continual-learning/#online-testing","title":"Online Testing","text":"

          Much of this we covered in the last lecture, so we\u2019ll keep it brief! Use shadow mode and A/B tests, roll out models gradually, and roll back models if you see issues during rollout.

          "},{"location":"course/2022/lecture-6-continual-learning/#6-the-continual-improvement-workflow","title":"6 - The Continual Improvement Workflow","text":"

          To tie it all together, we\u2019ll conclude with an example. Monitoring and continual learning are two sides of the same coin. We should be using the signals that we monitor to very directly change our retraining strategy. This section describes the future state that comes as a result of investing in the steps laid out previously.

          Start with a place to store and version your strategy. The components of your continual learning strategy should include the following:

          • Inputs, predictions, user feedback, and labels.
          • Metric definitions for monitoring, observability, and offline testing.
          • Projection definitions for monitoring and manual data curation.
          • Subgroups and cohorts of interest for monitoring and offline testing.
          • Data curation logic.
          • Datasets for training and evaluation.
          • Model comparison reports.

          Walk through this example to understand how changes to the retraining strategy occur as issues surface in our machine learning system.

          "},{"location":"course/2022/lecture-6-continual-learning/#7-takeaways","title":"7 - Takeaways","text":"

          To summarize, continual learning is a nascent, poorly understood topic that is worth continuing to pay attention to. Watch this space! In this lecture, we focused on all the steps and techniques that allow you to use retraining effectively. As MLEs, leverage monitoring to strategically improve your model. Always start simple, and get better!

          "},{"location":"course/2022/lecture-7-foundation-models/","title":"Lecture 7: Foundation Models","text":"

          Lecture by Sergey Karayev. Notes by James Le and Vishnu Rachakonda. Published September 19, 2022. Download slides.

          Foundation models are very large models trained on very large datasets that can be used for multiple downstream tasks.

          We\u2019ll talk about fine-tuning, Transformers, large language models, prompt engineering, other applications of large models, and vision and text-based models like CLIP and image generation.

          "},{"location":"course/2022/lecture-7-foundation-models/#1-fine-tuning","title":"1 - Fine-Tuning","text":"

          Traditional ML uses a lot of data and a large model, which takes a long time. But if you have a small amount of data, you can use transfer learning to benefit from the training on a lot of data. You basically use the same model that you have pre-trained, add a few layers, and unlock some weights.

          We have been doing this in computer vision since 2014. Usually, you train a model on ImageNet, keep most of the layers, and replace the top three or so layers with newly learned weights. Model Zoos are full of these models like AlexNet, ResNet, etc. in both TensorFlow and PyTorch.

          In NLP, pre-training was initially limited only to the first step: word embeddings. The input to a language model is words. One way you can encode them to be a vector (instead of a word) is one-hot encoding. Given a large matrix of words, you can make an embedding matrix and embed each word into a real-valued vector space. This new matrix is down to the dimension on the order of a thousand magnitude. Maybe those dimensions correspond to some semantic notion.

          Word2Vec trained a model like this in 2013. It looked at which words frequently co-occur together. The learning objective was to maximize cosine similarity between their embeddings. It could do cool demos of vector math on these embeddings. For example, when you embed the words \u201cking,\u201d \u201cman,\u201d and \u201cwoman,\u201d you can do vector math to get a vector that is close to the word \u201cqueen\u201d in this embedding space.

          It\u2019s useful to see more context to embed words correctly because words can play different roles in the sentence (depending on their context). If you do this, you\u2019ll improve accuracy on all downstream tasks. In 2018, a number of models such as ELMO and ULMFit published pre-trained LSTM-based models that set state-of-the-art results on most NLP tasks.

          But if you look at the model zoos today, you won\u2019t see any LSTMs. You\u2019ll only see Transformers everywhere. What are they?

          "},{"location":"course/2022/lecture-7-foundation-models/#2-transformers","title":"2 - Transformers","text":"

          Transformers come from a paper called \u201cAttention Is All You Need\u201d in 2017, which introduced a groundbreaking architecture that sets state-of-the-art results on translation first and a bunch of NLP tasks later.

          It has a decoder and an encoder. For simplicity, let\u2019s take a look at the encoder. The interesting components here are self-attention, positional encoding, and layer normalization.

          "},{"location":"course/2022/lecture-7-foundation-models/#self-attention","title":"Self-Attention","text":"

          Basic self-attention follows: Given an input sequence of vectors x of size t, we will produce an output sequence of tensors of size t. Each tensor is a weighted sum of the input sequence. The weight here is just a dot product of the input vectors. All we have to do is to make that weighted vector sum to 1. We can represent it visually, as seen below. The input is a sentence in English, while the output is a translation in French.

          So far, there are no learned weights and no sequence order. Let\u2019s learn some weights! If we look at the input vectors, we use them in three ways: as queries to compare two other input vectors, as keys to compare them to input vectors and produce the corresponding output vector, and as values to sum up all the input vectors and produce the output vector. * We can process each input vector with three different matrices to fulfill these roles of query, key, and value. We will have three weighted matrices, and everything else remains the same. If we learn these matrices, we learn attention. * It\u2019s called multi-head attention *because we learn different sets of weighted matrices simultaneously, but we implement them as just a single matrix.

          So far, we have learned the query, key, and value. Now we need to introduce some notion of order to the sequence by encoding each vector with its position. This is called positional encoding.

          "},{"location":"course/2022/lecture-7-foundation-models/#positional-encoding","title":"Positional Encoding","text":"

          Let\u2019s say we have an input sequence of words

          ]* The first step is to embed the words into a dense, real-valued word embedding. This part can be learned. * However, there is no order to that embedding. Thus, we will add another embedding that only encodes the position. * In brief, the first embedding encodes only the content, while the second embedding encodes only the position. If you add them, you now have information about both the content and the position.

          "},{"location":"course/2022/lecture-7-foundation-models/#layer-normalization","title":"Layer Normalization","text":"

          Neural network layers work best when the input vectors have uniform mean and standard deviation in each dimension. As activations flow through the network, the means and standard deviations get blown out by the weight matrices. Layer normalization is a hack to re-normalize every activation to where we want them between each layer.

          That\u2019s it! All the amazing results you\u2019ll see from now on are just increasingly large Transformers with dozens of layers, dozens of heads within each layer, large embedding dimensions, etc. The fundamentals are the same. It\u2019s just the Transformer model.

          Anthropic has been publishing great work lately to investigate why Transformers work so well. Check out these publications:

          1. A Mathematical Framework for Transformer Circuits
          2. In-Context Learning and Induction Heads
          3. Toy Models of Superposition
          "},{"location":"course/2022/lecture-7-foundation-models/#3-large-language-models","title":"3 - Large Language Models","text":""},{"location":"course/2022/lecture-7-foundation-models/#models","title":"Models","text":"

          GPT and GPT-2 came out in 2018 and 2019, respectively. The name means \u201cgenerative pre-trained Transformers.\u201d They are decoder-only models and use masked self-attention. This means: At a poi that at the output sequence, you can only attend to two input sequence vectors that came before that point in the sequence.

          These models were trained on 8 million web pages. The largest model has 1.5 billion parameters. The task that GPT-2 was trained on is predicting the next word in all of this text on the web. They found that it works increasingly well with an increasing number of parameters.

          BERT came out around the same time as Bidirectional Encoder Representations for Transformers. It is encoder-only and does not do attention masking. It has 110 million parameters. During training, BERT masks out random words in a sequence and has to predict whatever the masked word is.

          T5 (Text-to-Text Transformer) came out in 2020. The input and output are both text strings, so you can specify the task that the model supposes to be doing. T5 has an encoder-decoder architecture. It was trained on the C4 dataset (Colossal Clean Crawled Corpus), which is 100x larger than Wikipedia. It has around 10 billion parameters. You can download the open-sourced model and run it on your machine.

          GPT-3 was one of the state-of-the-art models in 2020. It was 100x larger than GPT/GPT-2 with 175 billion parameters. Because of its size, GPT-3 exhibits unprecedented capabilities of few-shot and zero-shot learning. As seen in the graph below, the more examples you give the model, the better its performance is. The larger the model is, the better its performance is. If a larger model was trained, it would be even better.

          OpenAI also released Instruct-GPT earlier this year. It had humans rank different GPT-3 outputs and used reinforcement learning to fine-tune the model. Instruct-GPT was much better at following instructions. OpenAI has put this model, titled \u2018text-davinci-002,\u2019 in their API. It is unclear how big the model is. It could be ~10x smaller than GPT-3.

          DeepMind released RETRO (Retrieval-Enhanced Transformers) in 2021. Instead of learning language and memorizing facts in the model\u2019s parameters, why don\u2019t we just learn the language in parameters and retrieve facts from a large database of internal text? To implement RETRO, they encode a bunch of sentences with BERT and store them in a huge database with more than 1 trillion tokens. At inference time, they fetch matching sentences and attend to them. This is a powerful idea because RETRO is connected to an always updated database of facts.

          DeepMind released another model called Chinchilla in 2022, which observed the scaling laws of language models. They trained over 400 language models from 70 million to 16 billion parameters on 5 billion to 500 billion tokens. They then derived formulas for optimal model and training set size, given a fixed compute budget. They found that most large language models are \u201cundertrained,\u201d meaning they haven\u2019t seen enough data.

          To prove this, they trained a large model called Gopher with 280 billion parameters and 300 billion tokens. With Chincilla, they reduced the number of parameters to 70 billion and used four times as much data (1.4 trillion tokens). Chinchilla not only matched Gopher\u2019s performance but actually exceeded it. Check out this LessWrong post if you want to read about people\u2019s opinions on it.

          "},{"location":"course/2022/lecture-7-foundation-models/#vendors","title":"Vendors","text":"

          OpenAI offers four model sizes: Ada, Babbage, Curie, and Davinci. Each has a different price and different capabilities. Most of the impressive GPT-3 results on the Internet came from Davinci. These correspond to 350M, 1.3B, 6.7B, and 175B parameters. You can also fine-tune models for an extra cost. The quota you get when you sign up is pretty small, but you can raise it over time. You have to apply for review before going into production.

          There are some alternatives to OpenAI:

          1. Cohere AI has similar models for similar prices.
          2. AI21 also has some large models.
          3. There are also open-source large language models, such as Eleuther GPT-NeoX (20B parameters), Facebook OPT-175B (175B parameters), and BLOOM from BigScience (176B parameters). If you want to use one of these open-source models but do not have to be responsible for deploying it, you can use HuggingFace\u2019s inference API.
          "},{"location":"course/2022/lecture-7-foundation-models/#4-prompt-engineering","title":"4 - Prompt Engineering","text":"

          GPT-3 and other large language models are mostly alien technologies. It\u2019s unclear how they exactly work. People are finding out how they work by playing with them. We will cover some notable examples below. Note that if you play around with them long enough, you are likely to discover something new.

          GPT-3 is surprisingly bad at reversing words due to tokenization: It doesn\u2019t see letters and words as humans do. Instead, it sees \u201ctokens,\u201d which are chunks of characters. Furthermore, it gets confused with long-ish sequences. Finally, it has trouble merging characters. For it to work, you have to teach GPT-3 the algorithm to use to get around its limitations. Take a look at this example from Peter Welinder.

          Another crazy prompt engineering is \u201cLet\u2019s Think Step By Step.\u201d This comes from a paper called \u201cLarge Language Models are Zero-Shot Reasoners.\u201d Simply adding \u201cLet\u2019s Think Step By Step\u201d into the prompt increases the accuracy of GPT-3 on one math problem dataset from 17% to 78% and another math problem dataset from 10% to 40%.

          Another unintuitive thing is that the context length of GPT is long. You can give it a long instruction and it can return the desired output. This example shows how GPT can output a CSV file and write the Python code as stated. You can also use formatting tricks to reduce the training cost, as you can do multiple tasks per call. Take a look at this example for inspiration.

          We have to be careful since our models might get pwnage or possessed. User input in the prompt may instruct the model to do something naughty. This input can even reveal your prompt to prompt injection attacks and possess your AI. This actually works in GPT-3-powered production apps.

          Further work is needed before putting GPT-3-powered apps into production. There are some tools for prompt engineering such as PromptSource and OpenPrompt, but we definitely need better tools.

          "},{"location":"course/2022/lecture-7-foundation-models/#5-other-applications","title":"5 - Other Applications","text":""},{"location":"course/2022/lecture-7-foundation-models/#code","title":"Code","text":"

          One notable application of large foundation models is code generation. With a 40- billion-parameter Transformer model pre-trained on all the Github code it could find, DeepMind Alphacode was able to achieve an above-average score on the Codeforce competition. To do this, they used a model to generate a large set of potential solutions and another model to winnow down the options by actually executing them.

          The general idea to highlight from this is filtering the outputs of a model. You can have a separate model that does filtering, or you can have some kind of verification + validation process. This can really significantly boost accuracy. OpenAI demonstrates impressive results on different math word problems, as seen below.

          Code generation has moved into products of late, like Github Copilot. We highly recommend trying it out! Another option for a similar tool is replit\u2019s new tool for coding.

          We\u2019re just getting started with the applications of foundation models to the programming workflow. In fact, things are about to start getting really wild. A recent paper showed that a large language model that generated its own synthetic puzzles to learn to code could improve significantly. Models are teaching themselves to get better!

          Playing around with systems like GPT-3 and their ability to generate code can feel quite remarkable! Check out some fun experiments Sergey ran (here and here).

          "},{"location":"course/2022/lecture-7-foundation-models/#semantic-search","title":"Semantic Search","text":"

          Semantic search is another interesting application area. If you have texts like words, sentences, paragraphs, or even whole documents, you can embed that text with large language models to get vectors. If you have queries in sentences or paragraphs, you can also embed them in the same way. With this function, you can generate embeddings and easily find semantic overlap by examining the cosine similarity between embedding vectors.

          Implementing this semantic search is hard. Computations on large, dense vectors with float data types are intensive. Companies like Google and Facebook that use this approach have developed libraries like FAISS and ScaNN to solve the challenges of implementing semantic search.

          Some open-source options for this include Haystack from DeepSet and Jina.AI. Other vendor options include Pinecone, Weaviate, Milvus, Qdrant, Google Vector AI Matching Engine, etc.

          "},{"location":"course/2022/lecture-7-foundation-models/#going-cross-modal","title":"Going Cross-Modal","text":"

          Newer models are bridging the gap between data modalities (e.g. using both vision and text). One such model is the Flamingo model, which uses a special model component called a perceiver resampler (an attention module that translates images into fixed-length sequences of tokens).

          Another paper about Socratic Models was recently published. The author trained several large models (a vision model, a language model, and an audio model) that are able to interface with each other using language prompts to perform new tasks.

          Finally, the concept of \u201cFoundation Models\u201d came from the paper \u201cOn the Opportunities and Risks of Foundation Models\u201d by researchers at Stanford Institute for Human-Centered AI. We think \u201cLarge Language Models\u201d or \u201cLarge Neural Networks\u201d might be more useful terms.

          "},{"location":"course/2022/lecture-7-foundation-models/#6-clip-and-image-generation","title":"6 - CLIP and Image Generation","text":"

          Now, let's talk about some of the most exciting applications of this kind of model: in vision!

          In a 2021 OpenAI paper called \u201cLearning transferrable visual models from natural language supervision\u201d, CLIP (Contrastive Language\u2013Image Pre-training) was introduced. In this paper, the authors encode text via Transforms, encode images via ResNets or Visual Transformers, and apply contrastive training to train the model. Contrastive training matches correct image and text pairs using cosine similarity. The code for this is tremendously simple!

          With this powerful trained model, you can map images and text using embeddings, even on unseen data. There are two ways of doing this. One is to use a \u201clinear probe\u201d by training a simple logistic regression model on top of the features CLIP outputs after performing inference. Otherwise, you can use a \u201czero-shot\u201d technique that encodes all the text labels and compares them to the encoded image. Zero-shot tends to be better, but not always.

          Since OpenAI CLIP was released in an open-source format, there have been many attempts to improve it, including the OpenCLIP model, which actually outperforms CLIP.

          To clarify, CLIP doesn\u2019t go directly from image to text or vice versa. It uses embeddings. This embedding space, however, is super helpful for actually performing searches across modalities. This goes back to our section on vector search. There are so many cool projects that have come out of these efforts! (like this and this)

          To help develop mental models for these operations, consider how to actual perform image captioning (image -> text) and image generation (text -> image). There are two great examples of this written in the ClipCap paper. At a high level, image captioning is performed through training a separate model to mediate between a frozen CLIP, which generates a series of word embeddings, and a frozen GPT-2, which takes these word embeddings and generates texts.

          The intermediate model is a Transformer model that gets better at modeling images and captions.

          In image generation, the most well-known approach is taken by DALL-E 2 or unCLIP. In this method, two additional components are introduced to a CLIP system, a prior that maps from text embedding to image embeddings and a decoder that maps from image embedding to image. The prior exists to solve the problem that many text captions can accurately work for an image.

          In DALL-E 2\u2019s case, they use an approach for the prior called a diffusion model. Diffusion models are trained to denoise data effectively through training on incrementally noisy data.

          In DALL-E 2, the diffusion method is applied to the prior model, which trains its denoising approach on a sequence of encoded text, CLIP text embedding, the diffusion timestamp, and the noised CLIP embedding, all so it can predict the un-noised CLIP image embedding. In doing so, it helps us bridge the gap between the raw text caption to the model, which can be infinitely complicated and \u201cnoisy\u201d, and the CLIP image embedding space.

          The decoder helps us go from the prior\u2019s output of an image embedding to an image. This is a much simpler approach for us to understand. We apply a U-Net structure to a diffusion training process that is able to ultimately \u201cde-noise\u201d the input image embedding and output an image.

          The results of this model are incredible! You can even generate images and merge images using CLIP embeddings. There are all kinds of funky ways of playing with the embeddings to create various image outputs.

          Other models of interest are Parti and StableDiffusion.

          • Google published Parti soon after DALL-E 2. Parti uses a VQGAN model instead of a diffusion model, where the image is represented as a sequence of high-dimensional tokens.
          • StableDiffusion has been released publicly, so definitely check it out! It uses a \u201clatent diffusion\u201d model, which diffuses the image in a low-dimensional latent space and decodes the image back into a pixel space.

          There has been an absolute explosion of these applications. Check out these examples on image-to-image, video generation, and photoshop plugins. The sky is the limit.

          Prompting these models is interesting and can get pretty involved. Someday this may even be tool and code-based. You can learn from other people on Lexica and promptoMANIA.

          It\u2019s truly a remarkable time to be involved with AI models as they scale to new heights.

          "},{"location":"course/2022/lecture-8-teams-and-pm/","title":"Lecture 8: ML Teams and Project Management","text":"

          Lecture by Josh Tobin. Notes by James Le and Vishnu Rachakonda. Published September 26, 2022. Download slides.

          "},{"location":"course/2022/lecture-8-teams-and-pm/#0-why-is-this-hard","title":"0 - Why is this hard?","text":"

          Building any product is hard:

          • You have to hire great people.

          • You have to manage and develop those people.

          • You have to manage your team's output and make sure your vectors are aligned.

          • You have to make good long-term technical choices and manage technical debt.

          • You have to manage expectations from leadership.

          • You have to define and communicate requirements with stakeholders.

          Machine Learning (ML) adds complexity to that process:

          • ML talent is expensive and scarce.

          • ML teams have a diverse set of roles.

          • Projects have unclear timelines and high uncertainty.

          • The field is moving fast, and ML is the \"high-interest credit card of technical debt.\"

          • Leadership often doesn't understand ML.

          • ML products fail in ways that are hard for laypeople to understand.

          In this lecture, we'll talk about:

          1. ML-related roles and their required skills.

          2. How to hire ML engineers (and how to get hired).

          3. How ML teams are organized and fit into the broader organization.

          4. How to manage an ML team and ML products.

          5. Design considerations for ML products.

          "},{"location":"course/2022/lecture-8-teams-and-pm/#1-roles","title":"1 - Roles","text":""},{"location":"course/2022/lecture-8-teams-and-pm/#common-roles","title":"Common Roles","text":"

          Let's look at the most common ML roles and the skills they require:

          • The ML Product Manager works with the ML team, other business functions, the end-users, and the data owners. This person designs docs, creates wireframes, and develops a plan to prioritize and execute ML projects.

          • The MLOps/ML Platform Engineer builds the infrastructure to make models easier and more scalable to deploy. This person handles the ML infrastructure that runs the deployed ML product using platforms like AWS, GCP, Kafka, and other ML tooling vendors.

          • The ML Engineer trains and deploys prediction models. This person uses tools like TensorFlow and Docker to work with prediction systems running on real data in production.

          • The ML Researcher trains prediction models, often those that are forward-looking or not production-critical. This person uses libraries like TensorFlow and PyTorch on notebook environments to build models and reports describing their experiments.

          • The Data Scientist is a blanket term used to describe all of the roles above. In some organizations, this role entails answering business questions via analytics. This person can work with wide-ranging tools from SQL and Excel to Pandas and Scikit-Learn.

          "},{"location":"course/2022/lecture-8-teams-and-pm/#skills-required","title":"Skills Required","text":"

          What skills are needed for these roles? The chart below displays a nice visual - where the horizontal axis is the level of ML expertise and the size of the bubble is the level of communication and technical writing (the bigger, the better).

          • The MLOps is primarily a software engineering role, which often comes from a standard software engineering pipeline.

          • The ML Engineer requires a rare mix of ML and Software Engineering skills. This person is either an engineer with significant self-teaching OR a science/engineering Ph.D. who works as a traditional software engineer after graduate school.

          • The ML Researcher is an ML expert who usually has an MS or Ph.D. degree in Computer Science or Statistics or finishes an industrial fellowship program.

          • The ML Product Manager is just like a traditional Product Manager but with a deep knowledge of the ML development process and mindset.

          • The Data Scientist role constitutes a wide range of backgrounds, from undergraduate to Ph.D. students.

          There is an important distinction between a task ML engineer and a platform ML engineer, coined by Shreya Shankar in this blog post:

          1. Task ML engineers are responsible for maintaining specific ML pipelines. They only focus on ensuring that these ML models are healthy and updated frequently. They are often overburdened.

          2. Platform ML engineers help task ML engineers automate tedious parts of their jobs. They are called MLOps/ML Platform engineers in our parlance.

          "},{"location":"course/2022/lecture-8-teams-and-pm/#2-hiring","title":"2 - Hiring","text":""},{"location":"course/2022/lecture-8-teams-and-pm/#the-ai-talent-gap","title":"The AI Talent Gap","text":"

          In 2018 (when we started FSDL), the AI talent gap was the main story. There were so few people who understood this technology, so the biggest block for organizations was that they couldn't find people who were good at ML.

          In 2022, the AI talent gap persists. But it tends to be less of a blocker than it used to be because we have had four years of folks switching careers into ML and software engineers emerging from undergraduate with at least a couple of ML classes under their belts.

          The gap tends to be in folks that understand more than just the underlying technology but also have experience in seeing how ML fails and how to make ML successful when it's deployed. That's the reality of how difficult it is to hire ML folks today, especially those with production experience.

          "},{"location":"course/2022/lecture-8-teams-and-pm/#sourcing","title":"Sourcing","text":"

          Because of this shallow talent pool and the skyrocketing demand, hiring for ML positions is pretty hard. Typical ML roles come in the following structure:

          • ML Adjacent roles: ML product manager, DevOps, Data Engineer

          • Core ML Roles: ML Engineer, ML Research/ML Scientist

          • Business analytics roles: Data Scientist

          For ML-adjacent roles, traditional ML knowledge is less important, as demonstrated interest, conversational understanding, and experience can help these professionals play an impactful role on ML teams. Let's focus on how to hire for the core ML roles.

          While there's no perfect way to hire ML engineers, there's definitely a wrong way to hire them, with extensive job descriptions that demand only the best qualifications (seen above). Certainly, there are many good examples of this bad practice floating around.

          • Rather than this unrealistic process, consider hiring for software engineering skills, an interest in ML, and a desire to learn. You can always train people in the art and science of ML, especially when they come with strong software engineering fundamentals.

          • Another option is to consider adding junior talent, as many recent grads come out with good ML knowledge nowadays.

          • Finally, and most importantly, be more specific about what you need the position and professional to do. It's impossible to find one person that can do everything from full-fledged DevOps to algorithm development.

          To hire ML researchers, here are our tips:

          • Evaluate the quality of publications, over the quantity, with an eye toward the originality of the ideas, the execution, etc.

          • Prioritize researchers that focus on important problems instead of trendy problems.

          • Experience outside academia is also a positive, as these researchers may be able to transition to industry more effectively.

          • Finally, keep an open mind about research talent and consider talented people without PhDs or from adjacent fields like physics, statistics, etc.

          To find quality candidates for these roles, here are some ideas for sourcing:

          • Use standard sources like LinkedIn, recruiters, on-campus recruiting, etc.

          • Monitor arXiv and top conferences and flag the first authors of papers you like.

          • Look for good implementations of papers you like.

          • Attend ML research conferences (NeurIPS, ICML, ICLR).

          As you seek to recruit, stay on top of what professionals want and make an effort to position your company accordingly. ML practitioners want to be empowered to do great work with interesting data. Building a culture of learning and impact can help recruit the best talent to your team. Additionally, sell sell sell! Talent needs to know how good your team is and how meaningful the mission can be.

          "},{"location":"course/2022/lecture-8-teams-and-pm/#interviewing","title":"Interviewing","text":"

          As you interview candidates for ML roles, try to validate your hypotheses of their strengths while testing a minimum bar on weaker aspects. For example, ensure ML researchers can think creatively about new ML problems while ensuring they meet a baseline for code quality. It's essential to test ML knowledge and software engineering skills for all industry professionals, though the relative strengths can vary.

          The actual ML interview process is much less well-defined than software engineering interviews, though it is modeled off of it. Some helpful inclusions are projects or exercises that test the ability to work with ML-specific code, like take-home ML projects. Chip Huyen's \"Introduction to ML Interviews Book\" is a great resource.

          "},{"location":"course/2022/lecture-8-teams-and-pm/#finding-a-job","title":"Finding A Job","text":"

          To find an ML job, you can take a look at the following sources:

          • Standard sources such as LinkedIn, recruiters, on-campus recruiting, etc.

          • ML research conferences (NeurIPS, ICLR, ICML).

          • Apply directly (remember, there's a talent gap!).

          Standing out for competitive roles can be tricky! Here are some tips (in increasing order of impressiveness) that you can apply to differentiate yourself:

          1. Exhibit ML interest (e.g., conference attendance, online course certificates, etc.).

          2. Build software engineering skills (e.g., at a well-known software company).

          3. Show you have a broad knowledge of ML (e.g., write blog posts synthesizing a research area).

          4. Demonstrate ability to get ML projects done (e.g., create side projects, re-implement papers).

          5. Prove you can think creatively in ML (e.g., win Kaggle competitions, publish papers).

          "},{"location":"course/2022/lecture-8-teams-and-pm/#3-organizations","title":"3 - Organizations","text":""},{"location":"course/2022/lecture-8-teams-and-pm/#organization-archetypes","title":"Organization Archetypes","text":"

          There exists not yet a consensus on the right way to structure an ML team. Still, a few best practices are contingent upon different organization archetypes and their ML maturity level. First, let's see what the different ML organization archetypes are.

          Archetype 1 - Nascent and Ad-Hoc ML

          • These are organizations where no one is doing ML, or ML is done on an ad-hoc basis. Obviously, there is little ML expertise in-house.

          • They are either small-to-medium businesses or less technology-forward large companies in industries like education or logistics.

          • There is often low-hanging fruit for ML.

          • But there is little support for ML projects, and it's challenging to hire and retain good talent.

          Archetype 2 - ML R&D

          • These are organizations in which ML efforts are centered in the R&D arm of the organization. They often hire ML researchers and doctorate students with experience publishing papers.

          • They are larger companies in sectors such as oil and gas, manufacturing, or telecommunications.

          • They can hire experienced researchers and work on long-term business priorities to get big wins.

          • However, it is very difficult to get quality data. Most often, this type of research work rarely translates into actual business value, so usually, the amount of investment remains small.

          Archetype 3 - ML Embedded Into Business and Product Teams

          • These are organizations where certain product teams or business units have ML expertise alongside their software or analytics talent. These ML individuals report up to the team's engineering/tech lead.

          • They are either software companies or financial services companies.

          • ML improvements are likely to lead to business value. Furthermore, there is a tight feedback cycle between idea iteration and product improvement.

          • Unfortunately, it is still very hard to hire and develop top talent, and access to data and compute resources can lag. There are also potential conflicts between ML project cycles and engineering management, so long-term ML projects can be hard to justify.

          Archetype 4 - Independent ML Function

          • These are organizations in which the ML division reports directly to senior leadership. The ML Product Managers work with Researchers and Engineers to build ML into client-facing products. They can sometimes publish long-term research.

          • They are often large financial services companies.

          • Talent density allows them to hire and train top practitioners. Senior leaders can marshal data and compute resources. This gives the organizations to invest in tooling, practices, and culture around ML development.

          • A disadvantage is that model handoffs to different business lines can be challenging since users need the buy-in to ML benefits and get educated on the model use. Also, feedback cycles can be slow.

          Archetype 5 - ML-First Organizations

          • These are organizations in which the CEO invests in ML, and there are experts across the business focusing on quick wins. The ML division works on challenging and long-term projects.

          • They are large tech companies and ML-focused startups.

          • They have the best data access (data thinking permeates the organization), the most attractive recruiting funnel (challenging ML problems tends to attract top talent), and the easiest deployment procedure (product teams understand ML well enough).

          • This type of organization archetype is hard to implement in practice since it is culturally difficult to embed ML thinking everywhere.

          "},{"location":"course/2022/lecture-8-teams-and-pm/#team-structure-design-choices","title":"Team Structure Design Choices","text":"

          Depending on the above archetype that your organization resembles, you can make the appropriate design choices, which broadly speaking follow these three categories:

          1. Software Engineer vs. Research: To what extent is the ML team responsible for building or integrating with software? How important are Software Engineering skills on the team?

          2. Data Ownership: How much control does the ML team have over data collection, warehousing, labeling, and pipelining?

          3. Model Ownership: Is the ML team responsible for deploying models into production? Who maintains the deployed models?

          Below are our design suggestions:

          If your organization focuses on ML R&D:

          • Research is most definitely prioritized over Software Engineering skills. Because of this, there would potentially be a lack of collaboration between these two groups.

          • ML team has no control over the data and typically will not have data engineers to support them.

          • ML models are rarely deployed into production.

          If your organization has ML embedded into the product:

          • Software Engineering skills will be prioritized over Research skills. Often, the researchers would need strong engineering skills since everyone would be expected to product-ionize his/her models.

          • ML teams generally do not own data production and data management. They will need to work with data engineers to build data pipelines.

          • ML engineers totally own the models that they deploy into production.

          If your organization has an independent ML division:

          • Each team has a potent mix of engineering and research skills; therefore, they work closely together within teams.

          • ML team has a voice in data governance discussions, as well as a robust data engineering function.

          • ML team hands-off models to users but is still responsible for maintaining them.

          If your organization is ML-First:

          • Different teams are more or less research-oriented, but in general, research teams collaborate closely with engineering teams.

          • ML team often owns the company-wide data infrastructure.

          • ML team hands the models to users, who are responsible for operating and maintaining them.

          The picture below neatly sums up these suggestions:

          "},{"location":"course/2022/lecture-8-teams-and-pm/#4-managing","title":"4 - Managing","text":""},{"location":"course/2022/lecture-8-teams-and-pm/#managing-ml-teams-is-challenging","title":"Managing ML Teams Is Challenging","text":"

          The process of actually managing an ML team is quite challenging for four reasons:

          1. Engineering Estimation: It's hard to know how easy or hard an ML project is in advance. As you explore the data and experiment with different models, there is enormous scope for new learnings about the problem that materially impact the timeline. Furthermore, knowing what methods will work is often impossible. This makes it hard to say upfront how long or how much work may go into an ML project.

          2. Nonlinear Progress: As the chart below from a blog post by Lukas Biewald (CEO of Weights and Biases) shows, progress on ML projects is unpredictable over time, even when the effort expended grows considerably. It's very common for projects to stall for extended periods of time.

          1. Cultural gaps: The relative culture of engineering and research professionals is very different. Research tends to favor novel, creative ideas, while engineering prefers tried and true methods that work. As a result, ML teams often experience a clash of cultures, which can turn toxic if not appropriately managed. A core challenge of running ML teams is addressing the cultural barriers between ML and software engineering so that teams can harmoniously experiment and deliver ML products.

          2. Leadership Deficits: It's common to see a lack of detailed understanding of ML at senior levels of management in many companies. As a result, expressing feasibility and setting the right expectations for ML projects, especially high-priority ones, can be hard.

          "},{"location":"course/2022/lecture-8-teams-and-pm/#how-to-manage-ml-teams-better","title":"How To Manage ML Teams Better","text":"

          Managing ML teams is hardly a solved problem, but you can take steps to improve the process.

          Plan probabilistically

          Many engineering projects are managed in a waterfall fashion, with the sequential tasks defined up front clearly. Instead of forcing this method of engineering management on difficult ML projects, try assigning a likelihood of success to different tasks to better capture the experimental process inherent to ML engineering. As these tasks progress or stall, rapidly re-evaluate your task ordering to better match what is working. Having this sense of both (1) how likely a task is to succeed and (2) how important it is makes project planning considerably more realistic.

          Have a portfolio of approaches

          Embrace multiple ideas and approaches to solve crucial research challenges that gate production ML. Don't make your plan dependent on one approach working!

          Measure inputs, not results

          As you work through several approaches in your portfolio, do not overly emphasize whose ideas ultimately work as a reflection of contribution quality. This can negatively impact team members' creativity, as they focus more on trying to find only what they currently think could work, rather than experimenting in a high-quality fashion (which is ultimately what leads to ML success).

          Have researchers and engineers work together

          The collaboration between engineering and research is essential for quality ML products to get into production. Emphasize collaboration across the groups and professionals!

          Get quick wins

          Taking this approach makes it more likely that your ML project will succeed in the long term. It allows you to demonstrate progress to your leadership more effectively and clearly.

          Educate leadership on uncertainty

          This can be hard, as leadership is ultimately accountable for addressing blind spots and understanding timeline risk. There are things you can do, however, to help improve leadership's knowledge about ML timelines.

          • Avoid building hype around narrow progress metrics material only to the ML team (e.g., \"We improved F1 score by 0.2 and have achieved awesome performance!\").

          • Instead, be realistic, communicate risk, and emphasize real product impact (e.g., \"Our model improvements should increase the number of conversions by 10%, though we must continue to validate its performance on additional demographic factors.)

          • Sharing resources like this a16z primer, this class from Prof. Pieter Abbeel, and this Google's People + AI guidebook can increase awareness of your company's leadership.

          "},{"location":"course/2022/lecture-8-teams-and-pm/#ml-pms-are-well-positioned-to-educate-the-organization","title":"ML PMs are well-positioned to educate the organization","text":"

          There are two types of ML product managers.

          1. Task PMs: These are the more common form of ML PM. They are generally specialized into a specific product area (e.g. trust and safety) and have a strong understanding of the particular use case.

          2. Platform PMs: These are a newer form of PMs. They have a broader mandate to ensure that the ML team (generally centralized in this context) is highest leverage. They manage workflow and priorities for this centralized team. To support this, they tend to have a broad understanding of ML themselves. These PMs are critical for educating the rest of the company about ML and ensuring that teams trust the output of models.

          Both types of PMs are crucial for ML success. Platform PMs tend to have a particularly powerful role to play in pushing an organization's adoption of machine learning and making it successful.

          "},{"location":"course/2022/lecture-8-teams-and-pm/#what-is-agile-for-ml","title":"What is \"Agile\" for ML?","text":"

          There are two options similar to what Agile is for software development in the ML context. They are shown below:

          They are both structured, data-science native approaches to project management. You can use them to provide standardization for project stages, roles, and artifacts.

          TDSP tends to be more structured and is a strong alternative to the Agile methodology. CRISP-DM is somewhat higher level and does not provide as structured a project management workflow. If you genuinely have a large-scale coordination problem, you can try these frameworks, but don't otherwise. They can slow you down since they are more oriented around \"traditional\" data science and not machine learning.

          "},{"location":"course/2022/lecture-8-teams-and-pm/#5-design","title":"5 - Design","text":"

          Let's talk about how to actually design machine learning products now. The biggest challenge with designing such products often isn't implementing them; it's bridging the gap between users' inflated expectations and the reality.

          Users often expect extremely sophisticated systems capable of solving many more problems than they actually can.

          In reality, machine learning systems are more like dogs that are trained to do a special task; weird little guys with a penchant for distraction and an inability to do much more than they are explicitly told.

          All this leads to a big gap between what can be done and what users expect!

          "},{"location":"course/2022/lecture-8-teams-and-pm/#the-keys-to-good-ml-product-design","title":"The Keys to Good ML Product Design","text":"

          In practice, good ML product design bridges users expectations and reality. If you can help users understand the benefits and limitations of the model, they tend to be more satisfied. Furthermore, always have backup plans for model failures! Over-automating systems tends to be a recipe for unhappy users. Finally, building in feedback loops can really increase satisfaction over time.

          There are a couple ways to explain the benefits and limitations of an ML system to users.

          • Focus on the problems it solves, not the fact that the system is \"AI-powered\".

          • If you make the system feel \"human-like\" (unconstrained input, human-like responses), expect users to treat it as human-like.

          • Furthermore, seek to include guardrails or prescriptive interfaces over open-ended, human-like experiences. A good example of the former approach is Amazon Alexa, which has specific prompts that its ML system responds to.

          Handling failures is a key part of keeping ML systems users happy. There's nothing worse than a \"smart\" system that conks out when you do something slightly unexpected. Having built-in solutions to solve for automation issues is extremely important. One approach is letting users be involved to correct improper responses. Another is to focus on the notion of \"model confidence\" and only offer responses when the threshold is met. A good example of a handling failure approach is how Facebook recommends photo tags for users, but doesn't go so far as to autoassign.

          "},{"location":"course/2022/lecture-8-teams-and-pm/#types-of-user-feedback","title":"Types of User Feedback","text":"

          How can you collect feedback from users in a way that avoids these issues? There are different types of user feedback and how they help with model improvement.

          Let's go across this chart.

          1. The simplest form of feedback is indirect implicit feedback. For example, did the user churn from the product? That tells you immediately how the user felt about the system without them giving a clear signal themselves.

          2. Another form is direct implicit feedback, which involves the user \"taking the next step\". For example, in an automated user onboarding flow, did the user click through into ensuing steps? This is trickier to implement, but can be useful for future training iterations.

          3. The next type of feedback is binary explicit feedback, wherein users are specifically asked (e.g. via thumbs up/down buttons) how they feel about the model performance.

          4. You can make this more sophisticated and add categorical explicit feedback, which allows users to sort their feedback into various types.

          5. To really get a sense of how users feel, consider offering free text feedback. This is tricky to use for model training and can be involved for users, but it's very useful to highlight the highest friction predictions.

          6. The gold standard, of course, are model corrections; they are free labels!

          Whenever building explicit feedback into ML systems, avoid relying on users' altruism and be clear about why they should engage in the feedback. Instead, build positive feedback loops by allowing users to experience the benefits of their feedback quickly.

          Great ML product experiences are designed from scratch. ML is a very specific technology with clear advantages and drawbacks. Design needs to be thoughtfully executed around these products. It's especially important to allow users to interact safely with ML products that may fail in unexpected ways. Always try to find ways to build in feedback loops to make the ML product better over time.

          There are tons of resources that can help you get started with this emerging field.

          • Google's People + AI Guidebook

          • Guidelines for Human-AI Interaction

          • Agency Plus Automation: Designing AI into Interactive Systems

          • Designing Collaborative AI

          In conclusion, we talked through a number of adjacent considerations to building ML systems and products. In short, you ship the team as much you do the code; be thoughtful about how you hire, manage, and structure ML teams as much as ML products!

          "},{"location":"course/2022/lecture-9-ethics/","title":"Lecture 9: Ethics","text":"

          Lecture by Charles Frye. Notes by James Le and Vishnu Rachakonda. Published October 03, 2022. Download slides.

          In this final lecture of FSDL 2022, we'll talk about ethics. After going through the context of what we mean by ethics, we'll go through three different areas where ethics come up:

          1. Tech Ethics: ethics that anybody who works in the tech industry broadly needs to think about.

          2. ML Ethics: what ethics has specifically meant for the ML industry.

          3. AI Ethics: what ethics might mean in the future where true AGI exists.

          "},{"location":"course/2022/lecture-9-ethics/#1-overview-and-context","title":"1 - Overview and Context","text":"

          All ethics lectures are wrong, but some are useful. They are more useful if we admit and state what our assumptions or biases are. We'll also talk about three general themes that come up often when ethical concerns are raised in tech/ML: alignment, trade-offs, and humility.

          In this lecture, we'll approach ethics on the basis of concrete cases - specific instances where people have raised concerns. We'll talk about cases where people have taken actions that have led to claims and counter-claims of ethical or unethical behavior - such as the use of automated weapons, the use of ML systems to make decisions like sentencing and bail, and the use of ML algorithms to generate art. In each case when criticism has been raised, part of that criticism has been that the technology is unethical.

          Approaching ethics in this way allows us to answer the question of \"What is ethics?\" by way of Ludwig Wittgenstein's quote: \"The meaning of a word is its use in the language.\" We'll focus on times when people have used the word \"ethics\" to describe what they like or dislike about a specific technology.

          If you want to try it out for yourself, you should check out the game \"Something Something Soup Something.\" In this browser game, you are presented with a bunch of dishes and have to decide whether they are soup or not soup, as well as whether they can be served to somebody who ordered soup. By playing a game like this, you'll discover (1) how difficult it is to come up with a concrete definition of soup and (2) how poorly your working definition of soup fits with any given soup theory.

          Because of this case-based approach, we won't be talking about ethical schools or \"trolley\" problems. Rather than considering these hypothetical scenarios, we'll talk about concrete and specific examples from the past decade of work in our field and adjacent fields.

          If you want another point of view that emphasizes the trolley problems, you should check out Sergey's lecture from the last edition of the course from 2021. It presented similar ideas from a different perspective and came to the same conclusion and some different conclusions.

          A useful theme from that lecture that we should all have in mind when we ponder ethical dilemmas is \"What Is Water?\" - which came up from a famous commencement speech by David Foster Wallace. If we aren't thoughtful and paying attention, things that are very important can become background, assumptions, and invisible to us.

          The approach of relying on prominent cases risks replicating social biases. Some ethical claims are amplified and travel more because people (who are involved) have more resources and are better connected. Using these forms of case-based reasoning (where you explain your beliefs in concrete detail) can hide the principles that are actually in operation, making them disappear like water.

          But in the end, so much of ethics is deeply personal that we can't expect to have a perfect approach. We can just do the best we can and hopefully become better every day.

          "},{"location":"course/2022/lecture-9-ethics/#2-themes","title":"2 - Themes","text":"

          We'll see three themes repeatedly coming up throughout this lecture:

          1. Alignment: a conflict between what we want and what we get.

          2. Trade-Offs: a conflict between what we want and what others want.

          3. Humility: a response when we don't know what we want or how to get it.

          "},{"location":"course/2022/lecture-9-ethics/#alignment","title":"Alignment","text":"

          The problem of alignment (where what we want and what we get differ) come up over and over again. A primary driver of this is called the proxy problem - in which we often optimize or maximize some proxies for the thing that we really care about. If the alignment (or loosely the correlation between that proxy and the thing we care about) is poor enough, then by trying to maximize that proxy, we can end up hurting the thing we originally cared about.

          There was a recent paper that did a mathematical analysis of this idea. You can see these kinds of proxy problems everywhere once you look for them.

          • On the top right, we have a train and validation loss chart from one of the training runs for the FSDL text recognizer. The thing we can optimize is the training loss. That's what we can use to calculate gradients and improve the parameters of our network. But the thing we really care about is the performance of the network on data points that it has not seen (like the validation set, the test set, or data in production). If we optimize our training loss too much, we can actually cause our validation loss to go up.

          • Similarly, there was an interesting paper suggesting that increasing your accuracy on classification tasks can actually result in a decrease in the utility of your embeddings in downstream tasks.

          • You can find these proxy problems outside of ML as well. This thread reveals an example where a factory that was making chemical machines (rather than creating a machine that was cheaper and better) chose not to adopt producing that machine because their output was measured in weight. So the thing that the planners actually cared about, economic efficiency and output, was not optimized because it was too difficult to measure.

          One reason why these kinds of proxy problems arise so frequently is due to issues of information. The information that we are able to measure is not the information that we want. At a higher level, we often don't know what it is that we truly needed. We may want the validation loss, but what we need is the loss in production or really the value our users will derive from this model.

          "},{"location":"course/2022/lecture-9-ethics/#trade-offs","title":"Trade-Offs","text":"

          Even when we know what we want or what we need, we are likely to run into the second problem - the tradeoff between stakeholders. It is sometimes said that the need to negotiate tradeoffs is one of the reasons why engineers do not like thinking about some of these problems around ethics. That's not quite right because we do accept tradeoffs as a key component of engineering.

          • In this O'Reilly book on the fundamentals of software architecture, the first thing they state at the beginning is that everything in software architecture is a tradeoff.

          • This satirical O'Reilly book says that every programming question has the answer: \"It depends.\"

          The famous chart above compares the different convolutional networks on the basis of their accuracy and the number of operations to run them. Thinking about these tradeoffs between speed and correctness is exactly the thing we have to do all the time in our job as engineers.

          We can select the Pareto Front for the metrics we care about. A way to remember what a Pareto front is this definition of a data scientist from Josh Wills: \"Person who is better at statistics than any software engineer and better at software engineering than any statistician.\" The Pareto Front in the chart above includes the models that are more accurate than those with fewer FLOPs and use fewer FLOPs than those that are more accurate.

          A reason why engineers may dislike thinking about these problems is that it's hard to identify and quantify these tradeoffs. These are indeed proxy problems. Even further, once measured, where on that front do we fall? As engineers, we may develop expertise in knowing whether we want high accuracy or low latency, but we are not as comfortable deciding how many current orphans we want to trade for what amount of future health. This raises questions both in terms of measurement and decision-making that are outside of our expertise.

          "},{"location":"course/2022/lecture-9-ethics/#humility","title":"Humility","text":"

          The appropriate response is humility because most engineers do not explicitly train in these skills. Many engineers and managers in tech, in fact, constitutionally prefer optimizing single metrics that are not proxies. Therefore, when encountering a different kind of problem, it's important to bring a humble mindset, ask for help from experts, and recognize that the help you get might not be immediately obvious to what you are used to.

          Additionally, when intervening due to an ethical concern, it's important to remember this humility. It's easy to think that when you are on the good side, this humility is not necessary. But even trying to be helpful is a delicate and dangerous undertaking. We want to make sure that as we resolve ethical concerns, we come up with solutions that are not just parts of the problem.

          "},{"location":"course/2022/lecture-9-ethics/#user-orientation-undergirds-each-theme","title":"User Orientation Undergirds Each Theme","text":"

          We can resolve all of these via user orientation.

          1. By getting feedback from users, we maintain alignment between our system and our users.

          2. When making tradeoffs, we should resolve them in consultation with users.

          3. Humility means we actually listen to our users because we recognize we don't have the answers to all the questions.

          "},{"location":"course/2022/lecture-9-ethics/#3-tech-ethics","title":"3 - Tech Ethics","text":"

          The tech industry can't afford to ignore ethics as public trust in tech declines. We need to learn from other nearby industries that have done a better job on professional ethics. We'll also touch on some contemporary topics.

          "},{"location":"course/2022/lecture-9-ethics/#tech-industrys-ethical-crisis","title":"Tech Industry's Ethical Crisis","text":"

          Throughout the past decade, the tech industry has been plagued by scandal - whether that's how tech companies interface with national governments at the largest scale or how tech systems are being used or manipulated by people who create disinformation or fake social media accounts that hack the YouTube recommendation system.

          As a result, distrust in tech companies has risen markedly in the last ten years. This Public Affairs Pulse survey shows that in 2013, the tech industry was one of the industries with less trustworthiness on average. In 2021, it has rubbed elbows with famously more distrusted industries such as energy and pharmaceuticals.

          Politicians care quite a bit about public opinion polls. In the last few years, the fraction of people who believe that large tech companies should be more regulated has gone up a substantial amount. Comparing it to 10 years ago, it's astronomically higher. So there will be a substantial impact on the tech industry due to this loss of public trust.

          We can learn from nearby fields: from the culture of professional ethics in engineering in Canada (by wearing the Iron Ring) to ethical standards for human subjects research (Nuremberg Code, 1973 National Research Act). We are at the point where we need a professional code of ethics for software. Hopefully, many codes of ethics developed in different communities can compete with each other and merge into something that most of us can agree on. That can be incorporated into our education for new members of our field.

          Let's talk about two particular ethical concerns that arise in tech in general: carbon emissions and dark/user-hostile design patterns.

          "},{"location":"course/2022/lecture-9-ethics/#tracking-carbon-emissions","title":"Tracking Carbon Emissions","text":"

          Because carbon emissions scale with cost, you only need to worry about them when the costs of what you are working on are very large. Then you won't be alone in making these decisions and can move a bit more deliberately to make these choices more thoughtfully.

          Anthropogenic climate change from carbon emissions raises ethical concerns - tradeoffs between the present and future generations. The other view is that this is an issue that arises from a classic alignment problem: many organizations are trying to maximize their profit, which is based on prices for goods that don't include externalities (such as environmental damage caused by carbon emissions, leading to increased temperatures and lactic change).

          The primary dimension along which we have to worry about carbon emissions is in compute jobs that require power. That power can result in carbon emissions. This paper walks through how much carbon dioxide was emitted using typical US-based cloud infrastructure.

          • The top headline shows that training a large Transformer model with neural architecture search produces as much carbon dioxide as five cars create during their lifetimes.

          • It's important to remember that power is not free. On US-based cloud infrastructure, \\$10 of cloud spent is roughly equal to \\$1 of air travel costs. That's on the basis of something like the numbers and the chart indicating air travel across the US from New York to San Francisco.

          • Just changing cloud regions can actually reduce your emissions quite a bit. There's a factor of 50x from regions with the most to least carbon-intensive power generation.

          The interest in this problem has led to new tools. Codecarbon.io allows you to track power consumption and reduce carbon emissions from your computing. ML CO2 Impact is oriented directly towards machine learning.

          "},{"location":"course/2022/lecture-9-ethics/#deceptive-design-and-dark-patterns","title":"Deceptive Design and Dark Patterns","text":"

          The other ethical concern in tech is deceptive design. An unfortunate amount of deception is tolerated in some areas of software. As seen below, on the left is a nearly complete history of the way Google displays ads in its search engine results. It started off very clearly colored and separated out with bright colors from the rest of the results. Then about ten years ago, that colored background was removed and replaced with a tiny little colored snippet that said \"Ad.\" Now, as of 2020, that small bit is no longer even colored. It is just bolded. This makes it difficult for users to know which content is being served to them because somebody paid for it (versus content served up organically).

          A number of dark patterns of deceptive design have emerged over the last ten years. You can read about them on the website called deceptive.design. There's also a Twitter account called \\@darkpatterns that shares examples found in the wild.

          A practice in the tech industry that's on a very shaky ethical /legal ground is growth hacking. This entails a set of techniques for achieving rapid growth in user base or revenue for a product and has all the connotations you might expect from the name - with examples including LinkedIn and Hotmail.

          ML can actually make this problem worse if we optimize short-term metrics. These growth hacks and deceptive designs can often drive user and revenue growth in the short term but worsen user experience and draw down on goodwill towards the brand in a way that can erode the long-term value of customers. When we incorporate ML into the design of our products with A/B testing, we have to watch out to make sure that the metrics that we are optimizing do not encourage this kind of deception.

          These arise inside another alignment problem. One broadly-accepted justification for the private ownership of the means of production is that private enterprise delivers broad social value aligned by price signals and market focus. But these private enterprises optimize metrics that are, at best, a proxy for social value. There's the possibility of an alignment problem where companies pursuing and maximizing their market capitalization can lead to net negative production of value. If you spend time at the intersection of funding, leadership, and technology, you will encounter it.

          In the short term, you can push for longer-term thinking within your organization to allow for better alignment between metrics and goals and between goals and utility. You can also learn to recognize user-hostile designs and advocate for user-centered design instead.

          To wrap up this section on tech ethics:

          1. The tech industry should learn from other disciplines if it wants to avoid a trust crisis.

          2. We can start by educating ourselves about common deceptive or user-hostile practices in our industry.

          "},{"location":"course/2022/lecture-9-ethics/#4-ml-ethics","title":"4 - ML Ethics","text":"

          The ethical concerns raised about ML have gone beyond just the ethical questions about other kinds of technology. We'll talk about common ethical questions in ML and lessons learned from Medical ML.

          "},{"location":"course/2022/lecture-9-ethics/#why-not-just-tech-ethics","title":"Why Not Just Tech Ethics?","text":"

          ML touches human lives more intimately than other technologies. Many ML methods, especially deep neural networks, make human-legible data into computer-legible data. Humans are more sensitive to errors and have more opinions about visual and text data than they do about the type of data manipulated by computers. As a result, there are more stakeholders with more concerns that need to be traded off in ML applications.

          Broadly speaking, ML involves being wrong pretty much all the time. Our models are statistical and include \"randomness.\" Randomness is almost always an admission of ignorance. As we admit a certain degree of ignorance in our models, our models will be wrong and misunderstand situations that they are put into. It can be upsetting and even harmful to be misunderstood by our models.

          Against this backlash of greater interest or higher stakes, a number of common types of ethical concerns have coalesced in the last couple of years. There are somewhat established camps of answers to these questions, so you should at least know where you stand on the four core questions:

          1. Is the model \"fair\"?

          2. Is the system accountable?

          3. Who owns the data?

          4. Should the system be built at all?

          "},{"location":"course/2022/lecture-9-ethics/#common-ethical-questions-in-ml","title":"Common Ethical Questions in ML","text":""},{"location":"course/2022/lecture-9-ethics/#is-the-model-fair","title":"Is The Model \"Fair\"?","text":"

          The classic case on this comes from criminal justice with the COMPAS system for predicting whether a defendant will be arrested again before trial. If they are arrested again, that suggests they committed a crime during that time. This assesses a certain degree of risk for additional harm while the justice system decides what to do about a previous arrest and potential crime.

          The operationalization here was a 10-point re-arrest probability based on past data about this person, and they set a goal from the very beginning to be less biased than human judges. They operationalize that by calibrating these arrest probabilities across subgroups. Racial bias is a primary concern in the US criminal justice system, so they took care to make sure that these probabilities of re-arrest were calibrated for all racial groups.

          The system was deployed and used all around the US. It's proprietary and difficult to analyze. But using the Freedom of Information Act and coalescing together a bunch of records, people at ProPublica were able to run their own analysis of this algorithm. They determined that the model was not more or less wrong for one racial group or another. It tended to have more false positives for Black defendants and more false negatives for White defendants. So despite the creators of COMPAS taking into account bias from the beginning, they still ended up with an algorithm with this undesirable property of being more likely to falsely accuse Black defendants than White defendants.

          It turned out that some quick algebra revealed that some form of race-based bias is inevitable in this setting, as indicated in this paper. There are a large number of fairness definitions that are mutually incompatible. This tutorial by Arvind Narayanan is an excellent one to display them.

          It is noteworthy that the impact of \"unfairness\" is not fixed. The story is often presented as \"no matter what, the journalists would have found something to complain about.\" But note that equalizing false positive rates and positive predictive value across groups would lead to a higher false negative rate for Black defendants relative to White defendants. In the context of American politics, that's not going to lead to complaints from the same people.

          This is the story about the necessity of confronting the tradeoffs that will inevitably come up. Researchers at Google made a nice little tool where you can think through and make these tradeoffs for yourself. It's helpful for building intuition on these fairness metrics and what it means to pick one over the other.

          Events in this controversy kicked off a flurry of research on fairness. The Fairness, Accountability, and Transparency conference has been held for several years. There has been a ton of work on both algorithmic-level approaches on measuring and incorporating fairness metrics into training and qualitative work on designing systems that are more transparent and accountable.

          In the case of COMPAS, re-arrest is not the same as recidivism. Being rearrested requires that a police officer believes you committed a crime. Police officers are subject to their own biases and patterns of policing, which result in a far higher fraction of crimes being caught for some groups than for others. Our real goal, in terms of fairness and criminal justice, might be around reducing those kinds of unfair impacts and using past rearrest data that have these issues.

          "},{"location":"course/2022/lecture-9-ethics/#representation-matters-for-model-fairness","title":"Representation Matters for Model Fairness","text":"

          Unfortunately, it is easy to make ML-powered tech that fails for minoritized groups. For example, off-the-shelf computer vision tools often fail on darker sins (as illustrated in this talk by Joy Buolamwini). This is not a new issue in technology, just a more salient one with ML.

          There has been a good amount of progress on this in the last five years. An example is Google's Model Cards which show how well a model will perform on human subgroups of interest. HuggingFace has good integrations for creating these kinds of model cards.

          When you invite people for talks or hire people to join your organizations, you should work to reduce the bias of that discovery process by diversifying your network. Some good resources include Black in AI, Diversify Tech Job Board, Women in Data Science, and the You Belong in AI podcast. You can make professional connections via them to improve the representation of minoritized groups in the engineering, design, and product management process.

          "},{"location":"course/2022/lecture-9-ethics/#is-the-system-accountable","title":"Is The System Accountable?","text":"

          At a broader level than fairness, we should expect \"accountability\" from ML systems. Some societies and states, including the EU, consider \"the right to an explanation\" in the face of important judgments to be a part of human rights.

          In the GDPR act, there is a section that enshrines accountability. This isn't quite a totally new requirement; credit denials in the US have been required to be explained since 1974. People have a right to know what and why into making decisions for them!

          If you want to impose this \"accountability\" on a deep neural network and understand its selections, there are a number of methods that use the input-output gradient to explain the model. You can see a list of several methods in order of increasing performance below (from this paper). These approaches don't quite have strong theoretical underpinnings or a holistic explanation, and are not that robust as a result. A lot of these methods act primarily as edge detectors. The paper shows how even randomizing layers in a model does not materially change the interpretability output of GradCAM methods.

          As a result, introspecting DNNs effectively requires reverse engineering the system to really understand what is going on, largely thanks to efforts like Distil and Transfomer Circuits.

          Due to these technical challenges, machine learning systems are prone to unaccountability that impacts most those least able to understand and influence their outputs. Books such as Automating Inequality describe the impacts of these systems. In such a context, you should seek to question the purpose of model, involve those impacted by the decisions (either through direct human inputs or through other means), and ensure that equal attention is paid to benefits and harms of automation.

          "},{"location":"course/2022/lecture-9-ethics/#who-owns-the-data","title":"Who Owns The Data?","text":"

          Humans justifiably feel ownership of the data they create, which is subsequently used to train machine learning models. Large datasets used to train models like GPT-3 are created by mining this data without the explicit involvement of those who create the data. Many people are not aware that this is both possible and legal. As technology has changed, what can be done with data has changed.

          You can even verify if your data has been used to train models on. Some of these images are potentially obtained illegally, as a result of sensitive data being posted openly without the recorded consent of the originator.

          Each of these controversies around image generation and illegal data has opened up a new frontier in data governance. Focus will be placed on ensuring new ML systems are sensitive to personal and professional concerns of those who generate the data ML systems are trained on. Emad Mostaque, CEO of Stability AI, has gone so far as to offer future opt out systems from systems similar to Stable Diffusion.

          Here are some practical tips: Dataset cards can be helpful in providing documentation in a similar fashion to model cards. There are also ethics lists, like the deon ethic checklist that helps design proper systems. Deon also has a helpful list of failure cases.

          "},{"location":"course/2022/lecture-9-ethics/#should-this-be-built-at-all","title":"Should This Be Built At All?","text":"

          The undercurrent behind this conversation is the justifiable question of whether some of these systems should be built at all, let alone in an ethical way.

          ML-powered weaponry is the canonical example here, which is already in use. The definition of these systems are blurry, as both systems old and new have had various autonomous capacities. This is difficult to get a sense of due to the secrecy associated with weapon systems.

          Some have argued that \"autonomous weapons\" have existed for hundreds of years, but even this does not mean that they are ethical. Mines are good examples of these systems. Movements like the Campaign Against Killer Robots are trying to prevent the cycle we entered with mines - where we invented them, when we realized the incredible harm, and why we are trying to ban them. Why invent these at all?

          Let's wrap up this entire section with some closing questions that you should always have a thoughtful answer to as you build a machine learning system.

          1. Is the model \"fair\"? Fairness is possible, but requires trade-offs.

          2. Is the system accountable? Accountability is easier than interpretability.

          3. Who owns the data? Answer this upfront. Changes are on the way.

          4. Should the system be built at all? Repeatedly ask this and use it to narrow scope.

          "},{"location":"course/2022/lecture-9-ethics/#what-can-we-learn-from-medical-ml","title":"What Can We Learn from Medical ML","text":"

          Note: The FSDL team would like to thank Dr. Amir Ashraf Ganjouei for his feedback on this section.

          Interestingly, medicine can teach us a lot about how to apply machine learning in a responsible way. Fundamentally, this has led to a mismatch between how medicine works and how machine learning systems are built today.

          Let's start with a startling fact: the machine learning response to COVID-19 was an abject failure. In contrast, the biomedical response was a major triumph. For example, the vaccines were developed with tremendous speed and precision.

          Machine learning did not acquit itself well with the COVID-19 problem. Two reviews (Roberts et al., 2021 and Wynants et al., 2020-2022) found that nearly all machine learning models were insufficiently documented, had little to no external validation, and did not follow model development best practices. A full 25% of the papers used a dataset incorrect for the task, which simply highlighted the difference between children and adults, not pneumonia and COVID.

          Medicine has a strong culture of ethics that professionals are integrated into from the point they start training. Medical professionals take the Hippocratic oath of practicing two things: either help or do not harm the patient. In contrast, the foremost belief associated with software development tends to be the infamous \"Move fast and break things.\" While this approach works for harmless software like web apps, it has serious implications for medicine and other more critical sectors. Consider the example of a retinal implant that was simply deprecated by developers and left hundreds without sight in this Statnews article.

          Researchers are drawing inspiration from medicine to develop similar standards for ML.

          • For example, clinical trial standards have been extended to ML. These standards were developed through extensive surveys, conferences, and consensus building (detailed in these papers).

          • Progress is being made in understanding how this problem presents. A recent study found that while clinical activities are generally performed at a high compliance level, statistical and data issues tend to suffer low compliance.

          • New approaches are developing entire \"auditing\" procedures that exquisitely identify the activities required to effectively develop models.

          Like medicine, machine learning is intimately intertwined with people's lives. The most important question to ask is \"Should this system be built at all?\". Always ask yourselves this and understand the implications!

          "},{"location":"course/2022/lecture-9-ethics/#5-ai-ethics","title":"5 - AI Ethics","text":"

          AI ethics are a frontier in both the technology and the ethics worlds. False claims and hype are the most pressing concerns, but other risks could present themselves soon.

          "},{"location":"course/2022/lecture-9-ethics/#ai-snake-oils","title":"AI Snake Oils","text":"

          False claims outpace the performance of AI. This poses a serious threat to adoption and satisfaction with AI systems long term.

          • For example, if you call something \"AutoPilot\", people might truly assume it is fully autonomous, as happened in the below case of a Tesla user. This goes back to our discussion about how AI systems are more like funky dogs than truly human intelligent systems.

          • Another example of this is IBM's Watson system, which went from tackling the future of healthcare to being sold off for parts.

          These false claims tend to be amplified in the media. But this isn't confined to traditional media. Even Geoff Hinton, a godfather of modern machine learning, has been a little too aggressive in his forecasts for AI performance!

          You can call this \"AI Snake Oil\" as Arvind Narayanan does in his Substack and talk.

          Let's separate out where true progress has been made versus where progress is likely to be overstated. On some level, AI perception has seen tremendous progress, AI judgment has seen moderate progress, and AI prediction of social outcomes has seen not nearly as much progress.

          "},{"location":"course/2022/lecture-9-ethics/#frontiers-ai-rights-and-x-risk","title":"Frontiers: AI Rights and X-Risk","text":"

          There's obvious rationale that should artificial sentient beings exist, tremendous ethical implications would be raised. Few people believe that we are truly on the precipice of sentient beings, but there is disagreement on how close we are.

          There's a different set of concerns around how to regard self-improving intelligent beings, for which there is already evidence. Large Language Models have been show to be able to improve themselves in a range of studies (here and here).

          Failing to pursue this technology would lead to a huge opportunity cost (as argued by Nick Bostrom)! There truly is a great opportunity in having such systems help us sold major problems and lead better lives. The key though, is that such technology should be developed in the safest way possible, not the fastest way.

          The paperclip problem shows how the potential for misalignment between AI systems and humans could dramatically reduce human utility and even compromise our interests. Imagine a system designed to manufacture paperclips... could actually develop the intelligence to alter elements of society to favor paper clips?! This thought experiments illustrates how self-learning systems could truly change our world for the worse in a misaligned way.

          These ideas around existential risk are most associated with the Effective Altruism community. Check out resources like Giving What We Can and 80,000 Hours if you're interested!

          "},{"location":"course/2022/lecture-9-ethics/#6-what-is-to-be-done","title":"6 - What Is To Be Done?","text":"

          This course can't end on a dour a note as existential risk. What can be done to mitigate these consequences and participate in developing truly ethical AI?

          1. The first step is to educate yourself on the topic. There are many great books that give lengthy, useful treatment to this topic. We recommend Automating Inequality, Weapons of Math Destruction, and The Alignment Problem.

          2. After reading this, consider how to prioritize your actions. What do you want to impact? When do you want to do that? Place them in this two-by-two to get a sense of where their importance is.

          Ethics cannot be purely negative. We do good, and we want to prevent bad! Focus on the good you can do and be mindful of the harm you can prevent.

          Leading organizations like DeepMind and OpenAI are leading from the front. Fundamentally, building ML well aligns with building ML for good. All the leading organizations emphasize effective and responsible best practices for building ML powered practices. Keep all this in mind as you make the world a better place with your AI-powered products!

          "},{"location":"course/2022/project-showcase/","title":"Project Showcase","text":"

          Students who registered for the synchronous version of the course formed teams and worked on their own deep learning-powered products.

          Whether you're looking for your next startup idea or deciding how to improve your portfolio, we hope these projects inspire you to build something real with DNNs!

          Info

          Many of these projects were made possible thanks to a generous donation of GPU-accelerated compute infrastructure by LambdaLabs. Check them out if you're looking for on-prem or cloud GPU machines!

          If you're interested in working on full stack projects, join us on Discord and post/ask around about group project work.

          "},{"location":"course/2022/project-showcase/#course-co-pilot","title":"Course Co-Pilot","text":"

          An ML powered application for streamlining the process of creating chapter markers and lesson summaries for course content creators.

          Team: Kurian Benoy, Wayde Gilliam, Suvash Thapaliya Live Demo. Slide Deck.

          "},{"location":"course/2022/project-showcase/#admirer","title":"Admirer","text":"

          A full-stack ML-powered website that utilizes users\u2019 webcam feeds to answer open-ended questions requiring outside knowledge.

          Team: Andrew Hinh Live Demo.

          "},{"location":"course/2022/project-showcase/#green-screen-image-composition-transfer","title":"Green-Screen Image Composition-Transfer","text":"

          An ML-powered app for adding (optionally Stable Diffusion-generated) virtual backgrounds to images that uses style transfer to match lighting anad composition.

          Team: Nitin Kishore Sai Samala Live Demo. Poster.

          "},{"location":"course/2022/project-showcase/#weak-supervision-and-active-learning-with-text-data","title":"Weak Supervision and Active Learning with Text Data","text":"

          An approach to minimise human labelling for text classification tasks.

          Team: Aleks Hiidenhovi, Bernardo Garc\u00eda, Diego Quintana, Edoardo Abati, Juan Manuel, Kushal Ramaiya GitHub Repo. Slide Deck.

          "},{"location":"course/2022/project-showcase/#x-ray-diagnosis-ai-assistant","title":"X-Ray Diagnosis AI Assistant","text":"

          An interface to support medical practitioners in diagnosing and interpreting x-ray images.

          Team: Arun Hegde, Samarth Keshari, Amulya Badal, Ross Cheung, Seyhan Karakulak GitHub Repo.

          "},{"location":"course/2022/project-showcase/#moms-ai-food-logger","title":"Mom's AI Food Logger","text":"

          An app for my mom that automatically identifies and tracks the food she eats.

          Team: Prince Javier Live Demo.

          "},{"location":"course/2022/project-showcase/#archaeological-feature-detector","title":"Archaeological Feature Detector","text":"

          A prototype web app to help archaeologists interpret automatically detected objects as part of a machine-learning-powered survey workflow.

          Team: Philanoe, jmmoreu, Kemp, lakillo Slide Deck.

          "},{"location":"course/2022/project-showcase/#semantic-search-engine-for-images","title":"Semantic Search Engine for Images","text":"

          A semantic text search engine over images, along with monitoring.

          Team: Sandhya Govindaraju, Utkarsh Vardhan, Gabriella Chiribau, Amit Kumar Sah Live Demo. Slide Deck.

          "},{"location":"course/2022/project-showcase/#recipewise","title":"Recipewise","text":"

          An image to recipe food classifier.

          Team: Carlo David, Chavo Kim, George Loh, Nari Jeong, and Rina Buoy Live Demo. Slide Deck.

          "},{"location":"course/2022/project-showcase/#imagein","title":"ImageIN","text":"

          A pragmatic approach to identifying illustrated pages in digitised historic books.

          Team: Frank Smitskamp, Zacharie Bouhnik, Daniel van Strien Live Demo. GitHub Repo.

          "},{"location":"course/2022/project-showcase/#full-stack-stable-diffusion","title":"Full Stack Stable Diffusion","text":"

          A deployment of Stable Diffusion Text-to-Image and Image-to-Image pipelines with a full stack architecture.

          Team: Okan Ulusoy and Omid Abdollahi Aghdam GitHub Repo.

          "},{"location":"course/2022/project-showcase/#multimodal-fusion-models-for-healthcare","title":"Multimodal Fusion Models for Healthcare","text":"

          An architecture for using multiple modalities of healthcare data to train deep learning models.

          Team: Vinod Nair, Khoa Nguyen, Carlos Leyson, Kieran Didi, Sridhar Iyer, Alan Chang GitHub Repo. Slide Deck.

          "},{"location":"course/2022/project-showcase/#fiberdiametercom","title":"FiberDiameter.com","text":"

          Measure the diameter of nanofibers in microscopy images.

          Team: @f_cossio, @yael_su, @__coszio, @aledelunap Live Demo. Slide Deck.

          "},{"location":"course/2022/project-showcase/#geolocator","title":"GeoLocator","text":"

          An app that guesses the location of an image, video, or video url.

          Team: David Hrachovy, Samhita Alla, Yiyi Xu, Winson Truong Live Demo. GitHub Repo.

          "},{"location":"course/2022/project-showcase/#image-anonymiser","title":"\ud83d\udc7b Image Anonymiser","text":"

          An ML-powered image anonymisation web app.

          Team: Sami Saadaoui, Vladislav Vancak, Lawrence Francis, Dan Howarth, Allan Stevenson GitHub Repo. Project Page.

          "},{"location":"course/2022/project-showcase/#buggingspace","title":"BuggingSpace","text":"

          An interface for red-teaming open source text generation models from the Hugging Face hub.

          Team: Sashank Pisupati, Sajenthan Vigneswaran, Kemp Bray, Jean-Antoine Zagato Live Demo. Slide Deck.

          "},{"location":"course/2022/project-showcase/#board-game-rules-explainer","title":"Board Game Rules Explainer","text":"

          A board game question-answering system to save players from having to check the rulebook.

          Team: Rafal Wojcik, Tim Jones, Issam Hammi, Muriel Hol Live Demo. GitHub Repo. Slide Deck.

          "},{"location":"course/2022/project-showcase/#gesto-ai-asl-word-recognizer","title":"Gesto AI - ASL Word Recognizer","text":"

          A real-time, word-level American Sign Language translation app.

          Team: Shivam Arora, Daniel Firebanks-Quevedo, Pablo Oberhauser, Dhruv Shah, Ibrahim Sherif, Samuel Tang Live Demo. Slide Deck.

          "},{"location":"course/2022/project-showcase/#choosistant","title":"choosistant","text":"

          Choosistant helps you decide which product to buy by summarizing pros and cons from written reviews.

          Team: Kimotho, Murad Khalilov, Nam, Omar Ali Sheikh, Sofiane Chami Project Page.

          "},{"location":"course/2022/project-showcase/#semantic-search-sentiment-analysis","title":"Semantic Search & Sentiment Analysis","text":"

          Upload a PDF or text document and enable semantic QA and sentiment analysis.

          Team: Sam Tonetto, Francisco Perez-Sorrosal, Navaneeth Tirupathi, Alexander Chia, Priyam Sadhukhan Project Repo. Slide Deck.

          "},{"location":"course/2022/project-showcase/#in-browser-ai","title":"In-Browser AI","text":"

          Run modern neural networks directly in your browser from a computer or phone.

          Team: Alexander Visheratin Live Demo. Slide Deck.

          "},{"location":"course/2022/project-showcase/#audio-vtuber","title":"Audio VTuber","text":"

          Animate a cartoon with facial expressions using only your voice.

          Team: Alex Service, Moon Ma Live Demo. Slide Deck.

          "},{"location":"course/2022/project-showcase/#ocr-semsearch","title":"OCR SemSearch","text":"

          OCR SemSearch allows you to perform semantic search on text within images from different types of documents.

          Team: Sebastian Gonzalez Aseretto, Paramtap Mewada Project Poster.

          "},{"location":"course/2022/project-showcase/#live-art-in-context","title":"Live Art in Context","text":"

          Draw on the creative power of modern ML models to create art responsive to events in text or video streams.

          Team: David Murphy, Angel Carvajal, Theresa Thoraldson, Chris Lonsberry Slide Deck.

          "},{"location":"course/2022/project-showcase/#plant-id","title":"Plant ID","text":"

          A plant species identifier available as a web app and as a cross-platform mobile app.

          Team: Ben Michel, Navid Matin Moghaddam, Stone Jiang, Shishir Valalla, Vibudh Singh Live Demo. Slide Deck.

          "},{"location":"course/2022/project-showcase/#landcover-segmentation","title":"Landcover Segmentation","text":"

          A data product for multi-class semantic segmentation of earth observation images using a UNet architecture. Team: Suzana, Roland Ritt, Sheebo

          Slide Deck.

          "},{"location":"course/2022/project-showcase/#meeting-graph-generator","title":"Meeting Graph Generator","text":"

          Team: Soroush Bassam Slide Deck. GitHub Repo.

          "},{"location":"llm-bootcamp/","title":"Full Stack LLM Bootcamp","text":"

          \ud83d\ude80 Full Stack LLM Bootcamp \ud83d\ude80

          • Learn best practices and tools for building LLM-powered apps
          • Cover the full stack from prompt engineering to user-centered design
          • Get up to speed on the state-of-the-art
          • Access the materials!"},{"location":"llm-bootcamp/#testimonials","title":"Testimonials","text":""},{"location":"llm-bootcamp/#why","title":"Why","text":"

            The way AI-powered apps are built has changed:

            • Before LLMs, an idea would bottleneck on training models from scratch, and then it'd bottleneck again on scalable deployment.
            • Now, a compelling MVP based on pretrained LLM models and APIs can be configured and serving users in an hour.

            An entirely new ecosystem of techniques, tools, and tool vendors is forming around LLMs. Even ML veterans are scrambling to orient themselves to what is now possible and figure out the most productive techniques and tools.

            "},{"location":"llm-bootcamp/#what","title":"What","text":"

            We put together a two-day program based on emerging best practices and the latest research results to help you make the transition to building LLM apps with confidence.

            We ran that program as an in-person bootcamp in San Francisco in April 2023. Now, we're releasing the recorded lectures, for free!

            • \u2728 Learn to Spell: Prompt Engineering and Other Magic
            • \ud83c\udfce\ufe0f LLMOps: Deployment and Learning in Production
            • \ud83e\udd37 UX for Language User Interfaces
            • \ud83d\udd28 Augmented Language Models
            • \ud83d\ude80 Launch an LLM App in One Hour
            • \ud83d\udd2e What's Next?
            • \ud83d\uddff LLM Foundations
            • \ud83d\udc77\u200d\u2642\ufe0f askFSDL Walkthrough

            What do I need to know already?

            The lectures aim to get anyone with experience programming in Python ready to start building applications that use LLMs.

            Experience with at least one of machine learning, frontend, or backend will be very helpful.

            "},{"location":"llm-bootcamp/#who","title":"Who","text":"

            We are Full Stack Deep Learning. We're a team of UC Berkeley PhD alumni with years of industry experience who are passionate about teaching people how to make deep neural networks work in the real world.

            Since 2018, we have taught in-person bootcamps, online multi-week cohorts, and official semester-long courses at top universities.

            "},{"location":"llm-bootcamp/#instructor-team","title":"Instructor Team","text":"Charles Frye educates people in AI. He has worked on AI/ML tooling with Weights & Biases and Gantry since getting a PhD in Theoretical Neuroscience at UC Berkeley. Sergey Karayev builds AI-powered products as Co-founder of Volition. He co-founded Gradescope after getting a PhD in AI at UC Berkeley. Josh Tobin builds tooling for AI products as Co-founder and CEO of Gantry. He worked as a Research Scientist at OpenAI and received a PhD in AI at UC Berkeley."},{"location":"llm-bootcamp/#_1","title":"LLM Bootcamp","text":"

            If you have any questions about the bootcamp materials, contact admin @ fullstackdeeplearning.com.

            "},{"location":"llm-bootcamp/expense/","title":"How do I expense the bootcamp?","text":""},{"location":"llm-bootcamp/expense/#submitting-a-receipt","title":"Submitting a receipt","text":"

            Upon registration, you'll receive an email from our event provider with your ticket and your order details, including the last 4 digits of the credit card you used, if you used one.

            This is often sufficient documentation.

            "},{"location":"llm-bootcamp/expense/#verification-of-attendance","title":"Verification of attendance","text":"

            If you need your attendance verified, please email us at admin@fullstackdeeplearning.com after the event is finished and indicate the address to which we should send a verification email.

            "},{"location":"llm-bootcamp/expense/#requesting-approval","title":"Requesting approval","text":"

            Some organizations require prior approval for education expenses from a manager.

            In that case, you can work off of the email template below. We suggest you customize it to your team and its needs.

            Keep in mind that you want to make sure that it's clear to your manager why sending you to this bootcamp is in the company's interest, not just yours!

            An email template, co-written with ChatGPT

            Dear {manager},

            I wanted to bring to your attention a bootcamp that I would love to enroll in. Full Stack Deep Learning is hosting a two-day program that covers the emerging field of application development with Large Language Models (LLMs). FSDL is a respected name that has delivered educational material on productionizing machine learning in formats from large MOOCs to courses at top universities like UC Berkeley and UW.

            This bootcamp will provide me with the knowledge and skills necessary to build and deploy LLM applications and stay up-to-date with the state-of-the-art in the industry.

            Some of the benefits:

            • Coverage of both conceptual fundamentals and concrete engineering practices
            • Talks from leaders in the field, like Harrison Chase of LangChain
            • Opportunities to network with builders

            I expect to be able to bring back my learnings and apply them directly to my work and share them with the team.

            The cost of the bootcamp is {price}. I strongly believe that this investment in my education will benefit the company in the long run.

            If you are interested, you can find more details about the bootcamp, including the instructors and their bios, at the following link: https://fsdl.me/2023-llmbc-landing.

            Thank you for considering my request.

            Best,

            {name}

            "},{"location":"llm-bootcamp/sponsors/","title":"Info for Sponsors","text":"

            We offer three tiers of sponsorship for the FSDL 2023 LLM Bootcamp:

            1. Vector Tier sponsors receive the following benefits: logo displayed on website and during conference, verbal acknowledgement. Vector Tier sponsorships are available for $1500.

            2. Matrix Tier sponsors receive all the benefits of the Vector Tier, plus: logo displayed in between talks, 6' table with two seats, and a conference registration. Matrix Tier sponsorships are available for $4500.

            3. Tensor Tier sponsors receive all the benefits of the Matrix Tier, plus: logo displayed on a banner in the registration area, access to an opt-in database of attendees with information about job and job-seeking status, and two additional registrations (for a total of three). Tensor Tier sponsorships are available for $10000.

            Contact sponsorships@fullstackdeeplearning.com if you're interested in sponsoring the conference!

            "},{"location":"llm-bootcamp/spring-2023/","title":"LLM Bootcamp - Spring 2023","text":"

            What are the pre-requisites for this bootcamp?

            Our goal is to get you 100% caught up to state-of-the-art and ready to build and deploy LLM apps, no matter what your level of experience with machine learning is.

            Please enjoy, and email us, tweet us, or post in our Discord if you have any questions or feedback!

            "},{"location":"llm-bootcamp/spring-2023/#lectures","title":"Lectures","text":""},{"location":"llm-bootcamp/spring-2023/#learn-to-spell-prompt-engineering","title":"Learn to Spell: Prompt Engineering","text":"
            • High-level intuitions for prompting
            • Tips and tricks for effective prompting: decomposition/chain-of-thought, self-criticism, ensembling
            • Gotchas: \"few-shot learning\" and tokenization
            "},{"location":"llm-bootcamp/spring-2023/#llmops","title":"LLMOps","text":"
            • Comparing and evaluating open source and proprietary models
            • Iteration and prompt management
            • Applying test-driven-development and continuous integration to LLMs
            "},{"location":"llm-bootcamp/spring-2023/#ux-for-language-user-interfaces","title":"UX for Language User Interfaces","text":"
            • General principles for user-centered design
            • Emerging patterns in UX design for LUIs
            • UX case studies: GitHub Copilot and Bing Chat"},{"location":"llm-bootcamp/spring-2023/#augmented-language-models","title":"Augmented Language Models","text":"
              • Augmenting language model inputs with external knowledge
              • Vector indices and embedding management systems
              • Augmenting language model outputs with external tools"},{"location":"llm-bootcamp/spring-2023/#launch-an-llm-app-in-one-hour","title":"Launch an LLM App in One Hour","text":"
                • Why is now the right time to build?
                • Techniques and tools for the tinkering and discovery phase: ChatGPT, LangChain, Colab
                • A simple stack for quickly launching augmented LLM applications"},{"location":"llm-bootcamp/spring-2023/#llm-foundations","title":"LLM Foundations","text":"
                  • Speed-run of ML fundamentals
                  • The Transformer architecture
                  • Notable LLMs and their datasets"},{"location":"llm-bootcamp/spring-2023/#project-walkthrough-askfsdl","title":"Project Walkthrough: askFSDL","text":"
                      • Walkthrough of a GitHub repo for sourced Q&A with LLMs
                      • Try it out via a bot in our Discord
                      • Python project tooling, ETL/data processing, deployment on Modal, and monitoring with Gantry
                    "},{"location":"llm-bootcamp/spring-2023/#whats-next","title":"What's Next?","text":"
                    • Can we build general purpose robots using multimodal models?
                    • Will models get bigger or smaller? Are we running out of data?
                    • How close are we to AGI? Can we make it safe?"},{"location":"llm-bootcamp/spring-2023/#invited-talks","title":"Invited Talks","text":""},{"location":"llm-bootcamp/spring-2023/#reza-shabani-how-to-train-your-own-llm","title":"Reza Shabani: How To Train Your Own LLM","text":"
                      • The \"Modern LLM Stack\": Databricks, Hugging Face, MosaicML, and more
                      • The importance of knowing your data and designing preprocessing carefully
                      • The features of a good LLM engineer
                      • By Reza Shabani, who trained Replit's code completion model, Ghostwriter.
                      "},{"location":"llm-bootcamp/spring-2023/#harrison-chase-agents","title":"Harrison Chase: Agents","text":"
                      • The \"agent\" design pattern: tool use, memory, reflection, and goals
                      • Challenges facing agents in production: controlling tool use, parsing outputs, handling large contexts, and more
                      • Exciting research projects with agents: AutoGPT, BabyAGI, CAMEL, and Generative Agents
                      • By Harrison Chase, co-creator of LangChain
                      "},{"location":"llm-bootcamp/spring-2023/#fireside-chat-with-peter-welinder","title":"Fireside Chat with Peter Welinder","text":"
                      • With Peter Welinder, VP of Product & Partnerships at OpenAI
                      • How OpenAI converged on LLMs
                      • Learnings and surprises from releasing ChatGPT
                      "},{"location":"llm-bootcamp/spring-2023/#sponsors","title":"Sponsors","text":"

                      We are deeply grateful to all of the sponsors who helped make this event happen.

                      "},{"location":"llm-bootcamp/spring-2023/#direct-sponsors","title":"Direct Sponsors","text":""},{"location":"llm-bootcamp/spring-2023/#compute-credit-sponsors","title":"Compute Credit Sponsors","text":""},{"location":"llm-bootcamp/spring-2023/expense/","title":"How do I expense the bootcamp?","text":""},{"location":"llm-bootcamp/spring-2023/expense/#submitting-a-receipt","title":"Submitting a receipt","text":"

                      Upon registration, you'll receive an email from our event provider with your ticket and your order details, including the last 4 digits of the credit card you used, if you used one.

                      This is often sufficient documentation.

                      "},{"location":"llm-bootcamp/spring-2023/expense/#verification-of-attendance","title":"Verification of attendance","text":"

                      If you need your attendance verified, please email us at admin@fullstackdeeplearning.com after the event is finished and indicate the address to which we should send a verification email.

                      "},{"location":"llm-bootcamp/spring-2023/expense/#requesting-approval","title":"Requesting approval","text":"

                      Some organizations require prior approval for education expenses from a manager.

                      In that case, you can work off of the email template below. We suggest you customize it to your team and its needs.

                      Keep in mind that you want to make sure that it's clear to your manager why sending you to this bootcamp is in the company's interest, not just yours!

                      An email template, co-written with ChatGPT

                      Dear {manager},

                      I wanted to bring to your attention a bootcamp that I would love to enroll in. Full Stack Deep Learning is hosting a two-day program that covers the emerging field of application development with Large Language Models (LLMs). FSDL is a respected name that has delivered educational material on productionizing machine learning in formats from large MOOCs to courses at top universities like UC Berkeley and UW.

                      This bootcamp will provide me with the knowledge and skills necessary to build and deploy LLM applications and stay up-to-date with the state-of-the-art in the industry.

                      Some of the benefits:

                      • Coverage of both conceptual fundamentals and concrete engineering practices
                      • Talks from leaders in the field, like Harrison Chase of LangChain
                      • Opportunities to network with builders

                      I expect to be able to bring back my learnings and apply them directly to my work and share them with the team.

                      The cost of the bootcamp is {price}. I strongly believe that this investment in my education will benefit the company in the long run.

                      If you are interested, you can find more details about the bootcamp, including the instructors and their bios, at the following link: https://fsdl.me/2023-llmbc-landing.

                      Thank you for considering my request.

                      Best,

                      {name}

                      "},{"location":"llm-bootcamp/spring-2023/sponsors/","title":"Info for Sponsors","text":"

                      We offer three tiers of sponsorship for the FSDL 2023 LLM Bootcamp:

                      1. Vector Tier sponsors receive the following benefits: logo displayed on website and during conference, verbal acknowledgement. Vector Tier sponsorships are available for $1500.

                      2. Matrix Tier sponsors receive all the benefits of the Vector Tier, plus: logo displayed in between talks, 6' table with two seats, and a conference registration. Matrix Tier sponsorships are available for $4500.

                      3. Tensor Tier sponsors receive all the benefits of the Matrix Tier, plus: logo displayed on a banner in the registration area, access to an opt-in database of attendees with information about job and job-seeking status, and two additional registrations (for a total of three). Tensor Tier sponsorships are available for $10000.

                      Contact sponsorships@fullstackdeeplearning.com if you're interested in sponsoring the conference!

                      "},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/","title":"Project Walkthrough: askFSDL","text":"

                      Project by Charles Frye. Published May 9, 2023.

                      View the project repository.

                      Interact with the bot on our Discord.

                      "},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/#swe-tooling-make-precommit-etc","title":"SWE Tooling: make, precommit, etc","text":"
                      • Walked everyone through the code base for the Discord bot they interacted with
                      • Sourced question-answering over a corpus of information using Vector storage for retrieval
                      • GitHub repo available for this project, but may not be able to execute the code without accounts on all services
                      • Makefile created for easier project management, setting up environment and authentication, and running setup commands
                      • Incorporated software tools like pre-commit checks, black for Python auto-formatting, and rust-powered formatter
                      • Shell check tool useful for catching issues in bash scripts
                      "},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/#data-cleaning","title":"Data Cleaning","text":"
                      • Initial approach of scraping data and chunking into smaller pieces did not yield good results
                      • Improved results by spending time understanding the data and preserving the structure during processing
                      • Extracting textual information from other sources like images and YouTube videos can enhance the usefulness of language models
                      • Sometimes simple solutions to specific data sources and problems can greatly improve the quality of results
                      • The unglamorous work of getting to know the data and writing code to manage it properly can result in big dividends for language model applications
                      "},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/#infrastructure-modal","title":"Infrastructure: Modal","text":"
                      • Discussed the ETL component of extracting, transforming, and loading data from various sources
                      • Discussed using Python packages for data transformation and addressing dependency issues with tools like pre-commit
                      • Explained the benefits of the modal component in creating lightweight virtual containers for different tasks
                      • Modal containers are fast and efficient, aiding in quick development cycles and allowing for containerization without the pains of traditional Docker images
                      • Modal also allows for the creation of serverless applications with auto-scaling and resource management
                      • Debugging and local development can be done through the interactive mode by connecting to a container running on modal
                      • showModal provides an interface for tracking application activity, utilization, and resource allocation, making it a versatile tool for various projects
                      "},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/#frontend-gradio-discord","title":"Frontend: Gradio & Discord","text":"
                      • Introduced Gradio user interface, allowing users to create interfaces in pure Python
                      • Gradio UI is flexible, supported by Hugging Face, and rapidly adopting machine learning features
                      • Examples of Gradio UI use include Alpaca, Flamingo, and Dolly mini
                      • Gradio UI is easy to set up, portable, flexible, and comes with an API with OpenAPI spec
                      • Discord bot integrated with Python library Discord.py; alternative library Interactions.py is also available
                      • Gradio UI is built on FastAPI for asynchronous Python web service
                      • Application mainly runs on the model's infrastructure in containers, serving traffic as needed
                      "},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/#embeddings-etl","title":"Embeddings & ETL","text":"
                      • Used OpenAI's ada002 model to generate embeddings, which are much cheaper than generation endpoints
                      • Currently using a vector index for data storage, but considering adding additional types of search
                      • Discussed processing PDFs in a previous lecture, mentioned using local code to extract URLs and using a map function with controlled concurrency
                      • Retrieval results are put into the zero-shot problem using an F-string template in LangChain's prompt template
                      • Compared LangChain to Hugging Face Transformers Library as a framework and mentioned that their code is often simple, but valuable for its interface and compatibility with other tools
                      "},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/#monitoring-improvement-gantry","title":"Monitoring & Improvement: Gantry","text":"
                      • Top three challenges in bringing the spot to the next level: improving retrieval, improving the quality of model outputs, and identifying a solid user base.
                      • Using tools like Datadog, Sentry, Honeycomb, and Gantry for handling web services, logging, and monitoring model behavior.
                      • The same principle of tracing and monitoring applies to both ML-powered apps and LLM-powered apps.
                      • Gantry provides a useful service for tracking and enriching logged data, including toxicity checks and other natural language-based or numerical analyses.
                      • Using language models to check on the performance and outputs of other language models.
                      • Contributing to the development of the tool as a teaching and learning application is open and encouraged.
                      "},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_summaries/","title":"Chapter summaries","text":""},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_summaries/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_summaries/#swe-tooling-make-precommit-etc","title":"SWE Tooling: make, precommit, etc","text":"
                      • Walked everyone through the code base for the Discord bot they interacted with
                      • Sourced question-answering over a corpus of information using Vector storage for retrieval
                      • GitHub repo available for this project, but may not be able to execute the code without accounts on all services
                      • Makefile created for easier project management, setting up environment and authentication, and running setup commands
                      • Incorporated software tools like pre-commit checks, black for Python auto-formatting, and rust-powered formatter
                      • Shell check tool useful for catching issues in bash scripts
                      "},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_summaries/#data-cleaning","title":"Data Cleaning","text":"
                      • Initial approach of scraping data and chunking into smaller pieces did not yield good results
                      • Improved results by spending time understanding the data and preserving the structure during processing
                      • Extracting textual information from other sources like images and YouTube videos can enhance the usefulness of language models
                      • Sometimes simple solutions to specific data sources and problems can greatly improve the quality of results
                      • The unglamorous work of getting to know the data and writing code to manage it properly can result in big dividends for language model applications
                      "},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_summaries/#infrastructure-modal","title":"Infrastructure: Modal","text":"
                      • Discussed the ETL component of extracting, transforming, and loading data from various sources
                      • Discussed using Python packages for data transformation and addressing dependency issues with tools like pre-commit
                      • Explained the benefits of the modal component in creating lightweight virtual containers for different tasks
                      • Modal containers are fast and efficient, aiding in quick development cycles and allowing for containerization without the pains of traditional Docker images
                      • Modal also allows for the creation of serverless applications with auto-scaling and resource management
                      • Debugging and local development can be done through the interactive mode by connecting to a container running on modal
                      • showModal provides an interface for tracking application activity, utilization, and resource allocation, making it a versatile tool for various projects
                      "},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_summaries/#frontend-gradio-discord","title":"Frontend: Gradio & Discord","text":"
                      • Introduced Gradio user interface, allowing users to create interfaces in pure Python
                      • Gradio UI is flexible, supported by Hugging Face, and rapidly adopting machine learning features
                      • Examples of Gradio UI use include Alpaca, Flamingo, and Dolly mini
                      • Gradio UI is easy to set up, portable, flexible, and comes with an API with OpenAPI spec
                      • Discord bot integrated with Python library Discord.py; alternative library Interactions.py is also available
                      • Gradio UI is built on FastAPI for asynchronous Python web service
                      • Application mainly runs on the model's infrastructure in containers, serving traffic as needed
                      "},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_summaries/#embeddings-etl","title":"Embeddings & ETL","text":"
                      • Used OpenAI's ada002 model to generate embeddings, which are much cheaper than generation endpoints
                      • Currently using a vector index for data storage, but considering adding additional types of search
                      • Discussed processing PDFs in a previous lecture, mentioned using local code to extract URLs and using a map function with controlled concurrency
                      • Retrieval results are put into the zero-shot problem using an F-string template in LangChain's prompt template
                      • Compared LangChain to Hugging Face Transformers Library as a framework and mentioned that their code is often simple, but valuable for its interface and compatibility with other tools
                      "},{"location":"llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_summaries/#monitoring-improvement-gantry","title":"Monitoring & Improvement: Gantry","text":"
                      • Top three challenges in bringing the spot to the next level: improving retrieval, improving the quality of model outputs, and identifying a solid user base.
                      • Using tools like Datadog, Sentry, Honeycomb, and Gantry for handling web services, logging, and monitoring model behavior.
                      • The same principle of tracing and monitoring applies to both ML-powered apps and LLM-powered apps.
                      • Gantry provides a useful service for tracking and enriching logged data, including toxicity checks and other natural language-based or numerical analyses.
                      • Using language models to check on the performance and outputs of other language models.
                      • Contributing to the development of the tool as a teaching and learning application is open and encouraged.
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/","title":"Augmented Language Models","text":"

                      Lecture by Josh Tobin. Published May 9, 2023. Download slides.

                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#why-augmented-lms","title":"Why augmented LMs?","text":"
                      • Language models are good at understanding language, following instructions, basic reasoning, and understanding code, but they lack up-to-date knowledge, specifics about your data, and more complex reasoning abilities.
                      • Think of language models as the \"brain\" that needs tools and data to complete tasks.
                      • Context windows are limited but growing rapidly and putting more context in the model costs money.
                      • There are three ways to augment language models: retrieval, chains, and tools.
                      • Retrieval involves providing an external corpus of data for the model to search, chains use the output of one language model as input for another, and tools allow models to interact with external data sources.
                      • This lecture serves as an introduction to these topics with depth available for further exploration.
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#why-retrieval-augmentation","title":"Why retrieval augmentation?","text":"
                      • Discussing retrieval augmentation to give models access to user-specific data
                      • Initial approach: put data into the context (e.g., organizers of an event)
                      • Challenge: thousands of users and complex relationships between queries and users make it difficult to use simple rules/coding
                      • Consider building the context as a form of information retrieval (like search)
                      • Treat putting the right data in the context for the model as a search problem
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#traditional-information-retrieval","title":"Traditional information retrieval","text":"
                      • Traditional information retrieval uses a query to find and rank relevant objects in a collection
                      • Objects can be documents, images, or other types of content
                      • Inverted indexes, which record word frequencies in documents, are often used for search
                      • Relevance is typically determined through Boolean search, while ranking is commonly done using the BM25 algorithm
                      • Factors affecting ranking include search term frequency in the document, number of documents containing the search term, and context within a sentence
                      • Traditional search is limited as it cannot capture semantic information or complex relationships between terms
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#embeddings-for-retrieval","title":"Embeddings for retrieval","text":"
                      • Discussing AI-centric approach for information retrieval via embeddings.
                      • AI helps improve search and retrieve better data from contexts using large language models and embeddings.
                      • Embeddings are abstract, dense, compact, usually fixed-size, and learned representations of data, which could be documents, images, audio, etc.
                      • Good embeddings have utility for the downstream task, and similar objects should be close together in the embedding space, while different objects should be far apart.
                      • Important embeddings to know: Word2Vec, Sentence Transformers, CLIP, OpenAI embeddings (Text Embedding ada002), and Instructor.
                      • Off-the-shelf embeddings are a good start, but fine-tuning and training an embedding model on specific tasks can achieve better results.
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#embedding-relevance-and-indexes","title":"Embedding relevance and indexes","text":"
                      • Discussing using embeddings for information retrieval
                      • Can use cosine similarity or dot product similarity as similarity metrics
                      • For nearest neighbor search, can simply use numpy if dealing with less than 100,000 vectors
                      • Approximate nearest neighbor algorithms are useful for faster search at larger scales, with tools like Facebook AI's FAISS, HNSW, and Annoy
                      • Choosing an information retrieval system is more important than the specific embedding index
                      • Limitations of approximate nearest neighbor indices include lack of hosting, data and metadata storage, and scalability
                      • Consider an information retrieval system that addresses these limitations for production use, analogous to having a complete library rather than just a card catalog
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#embedding-databases","title":"Embedding databases","text":"
                      • Searching over vectors may not be great for production, so consider databases for a more reliable and production-oriented approach.
                      • Consider whether you need an embedding database or just a database, as many popular databases already have embedding index built in, such as PG Vector for Postgres, Elasticsearch, and Redis.
                      • Building a system for information retrieval with embeddings involves challenges like scale, reliability, managing the embedding function, specifying queries, and choosing search algorithms.
                      • Don't try to handle all the complexity yourself; use existing embedding databases like Chroma, Milvus, Pinecone, Vespa, and Weaviate.
                      • When choosing an embedding database, consider features like scalability, embedding management, filtering, and integration with traditional full-text search.
                      • General recommendations: use your existing database for prototyping, choose Pinecone for speed of setup, consider Vespa and Weaviate for flexible queries, and Vespa and Milvus for scale and reliability.
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#beyond-naive-embeddings","title":"Beyond naive embeddings","text":"
                      • Address issues when queries and documents have different forms and embeddings aren't comparable
                      • Consider training a model that jointly represents both queries and documents for a more \"apples to apples\" comparison
                      • Explore hypothetical document embeddings: have the model imagine a document containing the query's answer and find similar documents
                      • Look into re-ranking techniques: search a large number of documents and train a model to reorder them based on specific criteria
                      • Use new libraries like Lama Index to search more efficiently, respecting the structure of the data and subsets (e.g., Notion database, Twitter, or recent data)
                      • Lama Index combines document retrieval and building embeddings designed for hierarchical searching
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#patterns-case-studies","title":"Patterns & case studies","text":"
                      • Retrieval augmentation case study: Copilot
                      • Two secrets to Copilot: speed and relevant context
                      • Builds context by looking at most recently accessed 20 documents in the same programming language
                      • Post-processing includes looking at code before and after cursor, relevant snippets from candidate docs, and heuristically accessing data
                      • Output generated is a result of sorting by heuristics
                      • Copilot is powerful but uses simple retrieval methods, highlighting the effectiveness of heuristics
                      • Another common pattern: question answering using retrieval augmentation
                      • This involves finding most similar documents/messages to a question and using retrieved information to answer the question
                      • Limitation: search process might not return the documents containing the answer
                      • Solution: use more models and iterate over documents, calling an LLM on each subset and feeding the output to the next model
                      • This approach can be generalized as \"chains\" where models build context for other models
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#what-are-chains-and-why-do-we-need-them","title":"What are chains and why do we need them?","text":"
                      • Discussing ways to add information to the context for language models besides retrieval
                      • Retrieval-based models follow a common question-answering pattern: embedding queries, comparing embeddings to find similar documents, and using context to answer questions
                      • Key limitation: reliance on the retrieval system; if the right information isn't among the retrieved documents, the model can't answer the question
                      • Possible solutions:
                      • Improve the quality of the information retrieval system with advanced search features
                      • Add additional processing, like using another LLM for post-processing retrieved documents, to refine the context (although it might be slower and more expensive)
                      • Introducing the concept of \"chains\": sequencing language model calls where the output of one call is the input to another
                      • Example patterns for building chains:
                      • Question-answering pattern
                      • Hypothetical document embeddings
                      • Summarizing a large corpus through a mapreduce-like process by independently summarizing each document, then summarizing the summaries
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#langchain","title":"LangChain","text":"
                      • Lang chain is an extremely popular tool for building chains of models and one of the fastest growing open source projects
                      • Supports both Python and JavaScript
                      • Fastest way to get started building applications and can be used in production
                      • Many people end up creating their own chaining, possibly inspired by Lang chain
                      • Lang chain provides a repository of different chains for various tasks and offers nice code and abstractions
                      • Ideal for prototyping, but also easy to build your own system if needed for production
                      • Contains many examples of types of chains in their repository, which is useful for generating ideas and learning about chaining patterns
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#tool-use","title":"Tool use","text":"
                      • Building context for language models to answer questions can involve creating a search engine or giving them access to APIs and outside tools
                      • A \"feeling lucky\" chain involves searching Google for an answer, getting the top result, and summarizing the content for the user
                      • Tool_Former paper demonstrates using tools such as calculators, question-answering systems, and translation systems in the training process for language models
                      • Tools can be used deterministically or in a way similar to OpenAI plugins
                      • Examples of tools for language models include archive search, Python interpreters, and SQL query execution
                      • An example chain involves translating a user's natural language question into an SQL query, executing the query, and providing the response back to the user
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#plugins","title":"Plugins","text":"
                      • There is a more automated approach called plugins to allow language models to interact with external tools.
                      • In a chain-based approach, developers manually design the interaction pattern between language model and tool by passing queries through a series of steps.
                      • In a plugin-based approach, the language model gets to decide whether to use a tool or not. A simpler method is used in Tool Former and OpenAI plugins.
                      • To create an OpenAI plugin, provide an API spec and a description of the API meant for the language model to decide when to use it.
                      • OpenAI passes the description as part of the context to the model, enabling it to make decisions based on user inputs and the API's usefulness.
                      • The model can invoke the API, and results are fed into the context allowing the language model to continue answering user questions.
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#recommendations-for-tool-use","title":"Recommendations for tool use","text":"
                      • Tools are a flexible way to augment language models with external data.
                      • Retrieval systems are one example of a tool that can be based on various databases.
                      • Two ways to build tool use into language models: manually describe the logic (chains) or use plugins and let the model figure it out.
                      • Chains are better for reliability and consistent problem-solving.
                      • Plugins are more suitable for interactivity, flexibility, and general-purpose use, allowing users to solve various unanticipated problems.
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/#recap-conclusions","title":"Recap & conclusions","text":"
                      • LMS are more powerful when connected to external data
                      • Rules and heuristics can help identify useful data
                      • As knowledge base scales, consider it as information retrieval
                      • Chains can encode complex reasoning and help with token limits
                      • Tools can provide access to external knowledge beyond internal database
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/","title":"Chapter summaries","text":""},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#why-augmented-lms","title":"Why augmented LMs?","text":"
                      • Language models are good at understanding language, following instructions, basic reasoning, and understanding code, but they lack up-to-date knowledge, specifics about your data, and more complex reasoning abilities.
                      • Think of language models as the \"brain\" that needs tools and data to complete tasks.
                      • Context windows are limited but growing rapidly and putting more context in the model costs money.
                      • There are three ways to augment language models: retrieval, chains, and tools.
                      • Retrieval involves providing an external corpus of data for the model to search, chains use the output of one language model as input for another, and tools allow models to interact with external data sources.
                      • This lecture serves as an introduction to these topics with depth available for further exploration.
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#why-retrieval-augmentation","title":"Why retrieval augmentation?","text":"
                      • Discussing retrieval augmentation to give models access to user-specific data
                      • Initial approach: put data into the context (e.g., organizers of an event)
                      • Challenge: thousands of users and complex relationships between queries and users make it difficult to use simple rules/coding
                      • Consider building the context as a form of information retrieval (like search)
                      • Treat putting the right data in the context for the model as a search problem
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#traditional-information-retrieval","title":"Traditional information retrieval","text":"
                      • Traditional information retrieval uses a query to find and rank relevant objects in a collection
                      • Objects can be documents, images, or other types of content
                      • Inverted indexes, which record word frequencies in documents, are often used for search
                      • Relevance is typically determined through Boolean search, while ranking is commonly done using the BM25 algorithm
                      • Factors affecting ranking include search term frequency in the document, number of documents containing the search term, and context within a sentence
                      • Traditional search is limited as it cannot capture semantic information or complex relationships between terms
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#embeddings-for-retrieval","title":"Embeddings for retrieval","text":"
                      • Discussing AI-centric approach for information retrieval via embeddings.
                      • AI helps improve search and retrieve better data from contexts using large language models and embeddings.
                      • Embeddings are abstract, dense, compact, usually fixed-size, and learned representations of data, which could be documents, images, audio, etc.
                      • Good embeddings have utility for the downstream task, and similar objects should be close together in the embedding space, while different objects should be far apart.
                      • Important embeddings to know: Word2Vec, Sentence Transformers, CLIP, OpenAI embeddings (Text Embedding ada002), and Instructor.
                      • Off-the-shelf embeddings are a good start, but fine-tuning and training an embedding model on specific tasks can achieve better results.
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#embedding-relevance-and-indexes","title":"Embedding relevance and indexes","text":"
                      • Discussing using embeddings for information retrieval
                      • Can use cosine similarity or dot product similarity as similarity metrics
                      • For nearest neighbor search, can simply use numpy if dealing with less than 100,000 vectors
                      • Approximate nearest neighbor algorithms are useful for faster search at larger scales, with tools like Facebook AI's FAISS, HNSW, and Annoy
                      • Choosing an information retrieval system is more important than the specific embedding index
                      • Limitations of approximate nearest neighbor indices include lack of hosting, data and metadata storage, and scalability
                      • Consider an information retrieval system that addresses these limitations for production use, analogous to having a complete library rather than just a card catalog
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#embedding-databases","title":"Embedding databases","text":"
                      • Searching over vectors may not be great for production, so consider databases for a more reliable and production-oriented approach.
                      • Consider whether you need an embedding database or just a database, as many popular databases already have embedding index built in, such as PG Vector for Postgres, Elasticsearch, and Redis.
                      • Building a system for information retrieval with embeddings involves challenges like scale, reliability, managing the embedding function, specifying queries, and choosing search algorithms.
                      • Don't try to handle all the complexity yourself; use existing embedding databases like Chroma, Milvus, Pinecone, Vespa, and Weaviate.
                      • When choosing an embedding database, consider features like scalability, embedding management, filtering, and integration with traditional full-text search.
                      • General recommendations: use your existing database for prototyping, choose Pinecone for speed of setup, consider Vespa and Weaviate for flexible queries, and Vespa and Milvus for scale and reliability.
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#beyond-naive-embeddings","title":"Beyond naive embeddings","text":"
                      • Address issues when queries and documents have different forms and embeddings aren't comparable
                      • Consider training a model that jointly represents both queries and documents for a more \"apples to apples\" comparison
                      • Explore hypothetical document embeddings: have the model imagine a document containing the query's answer and find similar documents
                      • Look into re-ranking techniques: search a large number of documents and train a model to reorder them based on specific criteria
                      • Use new libraries like Lama Index to search more efficiently, respecting the structure of the data and subsets (e.g., Notion database, Twitter, or recent data)
                      • Lama Index combines document retrieval and building embeddings designed for hierarchical searching
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#patterns-case-studies","title":"Patterns & case studies","text":"
                      • Retrieval augmentation case study: Copilot
                      • Two secrets to Copilot: speed and relevant context
                      • Builds context by looking at most recently accessed 20 documents in the same programming language
                      • Post-processing includes looking at code before and after cursor, relevant snippets from candidate docs, and heuristically accessing data
                      • Output generated is a result of sorting by heuristics
                      • Copilot is powerful but uses simple retrieval methods, highlighting the effectiveness of heuristics
                      • Another common pattern: question answering using retrieval augmentation
                      • This involves finding most similar documents/messages to a question and using retrieved information to answer the question
                      • Limitation: search process might not return the documents containing the answer
                      • Solution: use more models and iterate over documents, calling an LLM on each subset and feeding the output to the next model
                      • This approach can be generalized as \"chains\" where models build context for other models
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#what-are-chains-and-why-do-we-need-them","title":"What are chains and why do we need them?","text":"
                      • Discussing ways to add information to the context for language models besides retrieval
                      • Retrieval-based models follow a common question-answering pattern: embedding queries, comparing embeddings to find similar documents, and using context to answer questions
                      • Key limitation: reliance on the retrieval system; if the right information isn't among the retrieved documents, the model can't answer the question
                      • Possible solutions:
                      • Improve the quality of the information retrieval system with advanced search features
                      • Add additional processing, like using another LLM for post-processing retrieved documents, to refine the context (although it might be slower and more expensive)
                      • Introducing the concept of \"chains\": sequencing language model calls where the output of one call is the input to another
                      • Example patterns for building chains:
                      • Question-answering pattern
                      • Hypothetical document embeddings
                      • Summarizing a large corpus through a mapreduce-like process by independently summarizing each document, then summarizing the summaries
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#langchain","title":"LangChain","text":"
                      • Lang chain is an extremely popular tool for building chains of models and one of the fastest growing open source projects
                      • Supports both Python and JavaScript
                      • Fastest way to get started building applications and can be used in production
                      • Many people end up creating their own chaining, possibly inspired by Lang chain
                      • Lang chain provides a repository of different chains for various tasks and offers nice code and abstractions
                      • Ideal for prototyping, but also easy to build your own system if needed for production
                      • Contains many examples of types of chains in their repository, which is useful for generating ideas and learning about chaining patterns
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#tool-use","title":"Tool use","text":"
                      • Building context for language models to answer questions can involve creating a search engine or giving them access to APIs and outside tools
                      • A \"feeling lucky\" chain involves searching Google for an answer, getting the top result, and summarizing the content for the user
                      • Tool_Former paper demonstrates using tools such as calculators, question-answering systems, and translation systems in the training process for language models
                      • Tools can be used deterministically or in a way similar to OpenAI plugins
                      • Examples of tools for language models include archive search, Python interpreters, and SQL query execution
                      • An example chain involves translating a user's natural language question into an SQL query, executing the query, and providing the response back to the user
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#plugins","title":"Plugins","text":"
                      • There is a more automated approach called plugins to allow language models to interact with external tools.
                      • In a chain-based approach, developers manually design the interaction pattern between language model and tool by passing queries through a series of steps.
                      • In a plugin-based approach, the language model gets to decide whether to use a tool or not. A simpler method is used in Tool Former and OpenAI plugins.
                      • To create an OpenAI plugin, provide an API spec and a description of the API meant for the language model to decide when to use it.
                      • OpenAI passes the description as part of the context to the model, enabling it to make decisions based on user inputs and the API's usefulness.
                      • The model can invoke the API, and results are fed into the context allowing the language model to continue answering user questions.
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#recommendations-for-tool-use","title":"Recommendations for tool use","text":"
                      • Tools are a flexible way to augment language models with external data.
                      • Retrieval systems are one example of a tool that can be based on various databases.
                      • Two ways to build tool use into language models: manually describe the logic (chains) or use plugins and let the model figure it out.
                      • Chains are better for reliability and consistent problem-solving.
                      • Plugins are more suitable for interactivity, flexibility, and general-purpose use, allowing users to solve various unanticipated problems.
                      "},{"location":"llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/#recap-conclusions","title":"Recap & conclusions","text":"
                      • LMS are more powerful when connected to external data
                      • Rules and heuristics can help identify useful data
                      • As knowledge base scales, consider it as information retrieval
                      • Chains can encode complex reasoning and help with token limits
                      • Tools can provide access to external knowledge beyond internal database
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/","title":"Harrison Chase: Agents","text":"

                      Lecture by Harrison Chase. Published May 25, 2023. Download slides.

                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/chase-agents/#what-are-agents","title":"What are \"agents\"?","text":"
                      • The lecture covers agents and their significance in the context of LangChain.
                      • The core idea of agents is using a language model as a reasoning engine to determine how to interact with the outside world based on user input
                      • First it defines what agents are, explains why they are used, and shows how they are typically implemented.
                      • It also considers the challenges associated with getting agents to work reliably in production.
                      • It touches on memory and recent projects that involve agentic behavior
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/#why-use-agents","title":"Why use agents?","text":"
                      • Agents are useful for connecting language models to external sources of data and computation, such as search APIs and databases.
                      • Agents are more flexible and powerful than simply connecting language models to tools, and can handle edge cases and multi-hop tasks better.
                      • The typical implementation of agents involves using the language model to choose a tool, taking action with that tool, observing the output, and feeding it back into the language model until a stopping condition is met.
                      • Stopping conditions can be set by the language model or through hard-coded rules.
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/#react-reasoning-to-act","title":"ReAct: Reasoning to Act","text":"
                      • ReAct is a prompting strategy for natural language processing
                      • It stands for \"Reasoning and Acting\"
                      • It combines Chain-of-Thought reasoning and action-taking to improve the language model's ability to reason and access real data sources
                      • It yields higher quality, more reliable results than other prompting techniques
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/#challenge-controlling-tool-use","title":"Challenge: controlling tool use","text":"
                      • React is a popular implementation of agency, but there are many challenges
                      • One challenge is getting agents to use tools appropriately, which can be addressed by providing tool descriptions or using tool retrieval
                      • Few-shot examples can guide the language model in what to do
                      • Another challenge is getting agents not to use tools when they don't need to, which can be addressed with reminders or adding a tool that explicitly returns to the user
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/#challenge-parsing-tool-invocations","title":"Challenge: parsing tool invocations","text":"
                      • Language models return raw strings, and we often want to pass those strings into other programs
                      • More structured responses, like those in JSON format, are easier to parse
                      • Output parsers are used to encapsulate the logic needed to parse responses, can be modular, and can retry mistakes
                      • There are subtle differences in fixing errors in response outputs, and output parsers can help with this task
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/#challenge-long-term-memory-and-coherence","title":"Challenge: long-term memory and coherence","text":"
                      • Fourth challenge is getting agents to remember previous steps
                      • ReAct paper keeps a list of these steps in memory
                      • Long-running tasks present context window issues
                      • Retrieval methods can fetch previous steps and put them into context
                      • Combining some N most recent and some K most relevant actions and observations is common
                      • Incorporating big and hard-to-parse API responses is a challenge
                      • Custom logic can be used to select relevant keys and put them in context
                      • Tool usage requires thinking about output size
                      • Agents can go off track, and reiterating the objective can help
                      • Separating planning and execution steps can help break down objectives
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/#challenge-evaluation","title":"Challenge: evaluation","text":"
                      • Evaluating language models and applications built on top is difficult
                      • Evaluating agents is also difficult
                      • One way to evaluate agents is to measure if the correct result was produced
                      • Another way to evaluate agents is to assess if the agent trajectory or intermediate steps were correct and efficient. Examples include evaluating correct input to action, correct number of steps, and the most efficient sequence of steps.
                      • Evaluating the intermediate steps can be just as useful as evaluating the final result.
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/#agent-memory-and-adaptability","title":"Agent memory and adaptability","text":"
                      • Memory is an interesting aspect of AI, especially in the context of user-AI interactions and personalization.
                      • Personalization can be achieved by encoding an agent's objectives and persona in the prompt, but there is also work being done on evolving that over time to give agents a sense of long-term memory.
                      • Memory is becoming increasingly important in the concept of agents as encapsulated programs that adapt over time.
                      • Four recent projects build upon and improve the \"react-style\" agent, discussed next
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/#example-autogpt","title":"Example: AutoGPT","text":"

                      points:

                      • ReAct-style agents are designed to solve a specific objective, with short-lived, immediately quantifiable goals
                      • AutoGPT was created for long-running, open-ended goals such as increasing Twitter following
                      • AutoGPT introduced the concept of long-term memory using a vector store due to the long-running nature of its projects
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/#example-babyagi","title":"Example: BabyAGI","text":"
                      • BabyAGI is another popular project for agents for long-running objectives
                      • Introduces separate planning and execution steps to improve long-running objectives
                      • BabyAGI initially didn't have tools, but now has them
                      • Separating planning and execution steps can improve reliability and focus of longer-term agents
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/#example-camel","title":"Example: CAMEL","text":"
                      • CAMEL paper involves two agents working together, novel idea
                      • The main point of the paper is the use of a simulation environment
                      • Simulation environments can be used for practical evaluation of agents or for entertainment
                      • The paper's results are for a simple \"simulation environment\" -- two agents interacting in a chat room
                      • The agents were language models without tools
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/#example-generative-agents-paper","title":"Example: \"Generative Agents\" paper","text":"
                      • Recent simulation environment that had 25 agents in a Sims-like world
                      • Memory refers to remembering previous events to inform future actions
                      • Three components of memory retrieval: time weighting, importance weighting, relevancy weighting
                      • Reflection step introduced to update different states of the world after observing recent events
                      • Reflection step could be applied to other memory types in LangChain, such as entity memory and summary conversation memory
                      • Other papers recently incorporated the idea of reflection, which is interesting and worth keeping an eye on for the future
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/","title":"Chapter summaries","text":""},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/#what-are-agents","title":"What are \"agents\"?","text":"
                      • The lecture covers agents and their significance in the context of LangChain.
                      • The core idea of agents is using a language model as a reasoning engine to determine how to interact with the outside world based on user input
                      • First it defines what agents are, explains why they are used, and shows how they are typically implemented.
                      • It also considers the challenges associated with getting agents to work reliably in production.
                      • It touches on memory and recent projects that involve agentic behavior
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/#why-use-agents","title":"Why use agents?","text":"
                      • Agents are useful for connecting language models to external sources of data and computation, such as search APIs and databases.
                      • Agents are more flexible and powerful than simply connecting language models to tools, and can handle edge cases and multi-hop tasks better.
                      • The typical implementation of agents involves using the language model to choose a tool, taking action with that tool, observing the output, and feeding it back into the language model until a stopping condition is met.
                      • Stopping conditions can be set by the language model or through hard-coded rules.
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/#react-reasoning-to-act","title":"ReAct: Reasoning to Act","text":"
                      • ReAct is a prompting strategy for natural language processing
                      • It stands for \"Reasoning and Acting\"
                      • It combines Chain-of-Thought reasoning and action-taking to improve the language model's ability to reason and access real data sources
                      • It yields higher quality, more reliable results than other prompting techniques
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/#challenge-controlling-tool-use","title":"Challenge: controlling tool use","text":"
                      • React is a popular implementation of agency, but there are many challenges
                      • One challenge is getting agents to use tools appropriately, which can be addressed by providing tool descriptions or using tool retrieval
                      • Few-shot examples can guide the language model in what to do
                      • Another challenge is getting agents not to use tools when they don't need to, which can be addressed with reminders or adding a tool that explicitly returns to the user
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/#challenge-parsing-tool-invocations","title":"Challenge: parsing tool invocations","text":"
                      • Language models return raw strings, and we often want to pass those strings into other programs
                      • More structured responses, like those in JSON format, are easier to parse
                      • Output parsers are used to encapsulate the logic needed to parse responses, can be modular, and can retry mistakes
                      • There are subtle differences in fixing errors in response outputs, and output parsers can help with this task
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/#challenge-long-term-memory-and-coherence","title":"Challenge: long-term memory and coherence","text":"
                      • Fourth challenge is getting agents to remember previous steps
                      • ReAct paper keeps a list of these steps in memory
                      • Long-running tasks present context window issues
                      • Retrieval methods can fetch previous steps and put them into context
                      • Combining some N most recent and some K most relevant actions and observations is common
                      • Incorporating big and hard-to-parse API responses is a challenge
                      • Custom logic can be used to select relevant keys and put them in context
                      • Tool usage requires thinking about output size
                      • Agents can go off track, and reiterating the objective can help
                      • Separating planning and execution steps can help break down objectives
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/#challenge-evaluation","title":"Challenge: evaluation","text":"
                      • Evaluating language models and applications built on top is difficult
                      • Evaluating agents is also difficult
                      • One way to evaluate agents is to measure if the correct result was produced
                      • Another way to evaluate agents is to assess if the agent trajectory or intermediate steps were correct and efficient. Examples include evaluating correct input to action, correct number of steps, and the most efficient sequence of steps.
                      • Evaluating the intermediate steps can be just as useful as evaluating the final result.
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/#agent-memory-and-adaptability","title":"Agent memory and adaptability","text":"
                      • Memory is an interesting aspect of AI, especially in the context of user-AI interactions and personalization.
                      • Personalization can be achieved by encoding an agent's objectives and persona in the prompt, but there is also work being done on evolving that over time to give agents a sense of long-term memory.
                      • Memory is becoming increasingly important in the concept of agents as encapsulated programs that adapt over time.
                      • Four recent projects build upon and improve the \"react-style\" agent, discussed next
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/#example-autogpt","title":"Example: AutoGPT","text":"

                      points:

                      • ReAct-style agents are designed to solve a specific objective, with short-lived, immediately quantifiable goals
                      • AutoGPT was created for long-running, open-ended goals such as increasing Twitter following
                      • AutoGPT introduced the concept of long-term memory using a vector store due to the long-running nature of its projects
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/#example-babyagi","title":"Example: BabyAGI","text":"
                      • BabyAGI is another popular project for agents for long-running objectives
                      • Introduces separate planning and execution steps to improve long-running objectives
                      • BabyAGI initially didn't have tools, but now has them
                      • Separating planning and execution steps can improve reliability and focus of longer-term agents
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/#example-camel","title":"Example: CAMEL","text":"
                      • CAMEL paper involves two agents working together, novel idea
                      • The main point of the paper is the use of a simulation environment
                      • Simulation environments can be used for practical evaluation of agents or for entertainment
                      • The paper's results are for a simple \"simulation environment\" -- two agents interacting in a chat room
                      • The agents were language models without tools
                      "},{"location":"llm-bootcamp/spring-2023/chase-agents/chapter_summaries/#example-generative-agents-paper","title":"Example: \"Generative Agents\" paper","text":"
                      • Recent simulation environment that had 25 agents in a Sims-like world
                      • Memory refers to remembering previous events to inform future actions
                      • Three components of memory retrieval: time weighting, importance weighting, relevancy weighting
                      • Reflection step introduced to update different states of the world after observing recent events
                      • Reflection step could be applied to other memory types in LangChain, such as entity memory and summary conversation memory
                      • Other papers recently incorporated the idea of reflection, which is interesting and worth keeping an eye on for the future
                      "},{"location":"llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/","title":"Launch an LLM App in One Hour","text":"

                      Lecture by Charles Frye. Published May 9, 2023. Download slides.

                      "},{"location":"llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/#why-now","title":"Why now?","text":"
                      • Excitement about large language models and artificial intelligence is high, especially since one tool can now accomplish tasks that previously required multiple specialized tools.
                      • Language user interfaces (LUIs) enable more natural interaction with computers through speech and natural language. Large language models, like GPT-3, make LUIs more flexible and capable.
                      • Products and applications are being built with these models, including OpenAI's ChatGPT and GitHub Copilot, hinting at a promising future.
                      • However, the gap between demos and actual products is significant. Overpromising and underdelivering in the past led to \"AI winters,\" so it's important to create valuable products and tools to maintain funding and interest.
                      • The playbook for building applications with language models is emerging, and this boot camp will cover aspects of that process.
                      "},{"location":"llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/#prototyping-iteration-in-a-playground","title":"Prototyping & Iteration in a Playground","text":"
                      • Attended various hackathons focused on using machine learning tools
                      • Explored the potential of high-capability hosted models, such as OpenAI's, in a simple chat interface to quickly test capabilities
                      • Used a notebook environment for quick tinkering, building prototypes, and discovering limitations of language models
                      • Started with a problem statement: using large language models to learn about large language models
                      • Discovered difficulties with language models, such as having outdated and limited information
                      • Found that providing specific sources or papers can help improve answers from the model
                      "},{"location":"llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/#prototyping-iteration-in-a-notebook","title":"Prototyping & Iteration in a Notebook","text":"
                      • Experiment with automating steps in ephemeral notebook environment like Collab.
                      • OpenAI API allows interaction with language models and offers various SDKs.
                      • Lang chain is a popular open-source framework for interacting with these models; it's fast-evolving and provides all necessary components.
                      • Develop a process to find information and bring it to context. Utilize Python libraries like archive for data sourcing.
                      • Utilize document loaders, such as the one built into Lang chain, to extract content from PDFs.
                      • Use embedding search for large scale information retrieval within documents.
                      • Prototype and tinker with language models to constantly improve them.
                      • Look for similar existing projects to jump off or even default examples provided, such as Lang chain's default example.
                      • Turn these experiments into something usable by people at a larger scale.
                      • The workflow with modern language models is more flexible and faster compared to the past machine learning processes.
                      "},{"location":"llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/#deploying-an-mvp","title":"Deploying an MVP","text":"
                      • Building an MVP version of an application requires focusing on what's useful to a broad range of users.
                      • Prioritize the user interface and gather feedback from users quickly.
                      • Cloud-native tooling and serverless infrastructure like Model are helpful in swiftly scaling applications and addressing data processing bottlenecks.
                      • Use various tech stacks for different tasks, such as OpenAI for language models, Pinecone for quick search, MongoDB for data storage, and AWS for running lightweight Discord bot servers.
                      • Implement the application, then monitor usage data to make improvements and learn from successes and failures.
                      "},{"location":"llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_summaries/","title":"Chapter summaries","text":""},{"location":"llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_summaries/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_summaries/#why-now","title":"Why now?","text":"
                      • Excitement about large language models and artificial intelligence is high, especially since one tool can now accomplish tasks that previously required multiple specialized tools.
                      • Language user interfaces (LUIs) enable more natural interaction with computers through speech and natural language. Large language models, like GPT-3, make LUIs more flexible and capable.
                      • Products and applications are being built with these models, including OpenAI's ChatGPT and GitHub Copilot, hinting at a promising future.
                      • However, the gap between demos and actual products is significant. Overpromising and underdelivering in the past led to \"AI winters,\" so it's important to create valuable products and tools to maintain funding and interest.
                      • The playbook for building applications with language models is emerging, and this boot camp will cover aspects of that process.
                      "},{"location":"llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_summaries/#prototyping-iteration-in-a-playground","title":"Prototyping & Iteration in a Playground","text":"
                      • Attended various hackathons focused on using machine learning tools
                      • Explored the potential of high-capability hosted models, such as OpenAI's, in a simple chat interface to quickly test capabilities
                      • Used a notebook environment for quick tinkering, building prototypes, and discovering limitations of language models
                      • Started with a problem statement: using large language models to learn about large language models
                      • Discovered difficulties with language models, such as having outdated and limited information
                      • Found that providing specific sources or papers can help improve answers from the model
                      "},{"location":"llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_summaries/#prototyping-iteration-in-a-notebook","title":"Prototyping & Iteration in a Notebook","text":"
                      • Experiment with automating steps in ephemeral notebook environment like Collab.
                      • OpenAI API allows interaction with language models and offers various SDKs.
                      • Lang chain is a popular open-source framework for interacting with these models; it's fast-evolving and provides all necessary components.
                      • Develop a process to find information and bring it to context. Utilize Python libraries like archive for data sourcing.
                      • Utilize document loaders, such as the one built into Lang chain, to extract content from PDFs.
                      • Use embedding search for large scale information retrieval within documents.
                      • Prototype and tinker with language models to constantly improve them.
                      • Look for similar existing projects to jump off or even default examples provided, such as Lang chain's default example.
                      • Turn these experiments into something usable by people at a larger scale.
                      • The workflow with modern language models is more flexible and faster compared to the past machine learning processes.
                      "},{"location":"llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_summaries/#deploying-an-mvp","title":"Deploying an MVP","text":"
                      • Building an MVP version of an application requires focusing on what's useful to a broad range of users.
                      • Prioritize the user interface and gather feedback from users quickly.
                      • Cloud-native tooling and serverless infrastructure like Model are helpful in swiftly scaling applications and addressing data processing bottlenecks.
                      • Use various tech stacks for different tasks, such as OpenAI for language models, Pinecone for quick search, MongoDB for data storage, and AWS for running lightweight Discord bot servers.
                      • Implement the application, then monitor usage data to make improvements and learn from successes and failures.
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/","title":"LLM Foundations","text":"

                      Lecture by Sergey Karayev. Published May 19, 2023. Download slides.

                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/llm-foundations/#intro","title":"Intro","text":"
                      • Discuss four key ideas in machine learning
                      • Address diverse audience, including experts, executives, and investors
                      • Cover Transformer architecture
                      • Mention notable LLMs (e.g., GPT, T5, BERT, etc.)
                      • Share details on running a Transformer
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#foundations-of-machine-learning","title":"Foundations of Machine Learning","text":"
                      • Machine learning has shifted from traditional programming (Software 1.0) to a Software 2.0 mindset, where algorithms are generated from training data and more emphasis is placed on the training system.
                      • Three types of machine learning include unsupervised learning, supervised learning, and reinforcement learning, which have mostly converged to a supervised learning approach.
                      • For machines, input and output are always just numbers, represented as vectors or matrices.
                      • One dominant approach to machine learning today is neural networks, also known as deep learning, which was inspired by the human brain's structure and function.
                      • Neural networks consist of perceptrons connected in layers, and all operations are matrix multiplications.
                      • GPUs, originally developed for graphics and video games, have played a significant role in advancing deep learning due to their compatibility with matrix multiplications.
                      • To train a neural network, data is typically split into training, validation, and test sets to avoid overfitting and improve model performance.
                      • Pre-training involves training a large model on extensive data, which can then be fine-tuned using smaller sets of specialized data for better performance.
                      • Model hubs, such as Hugging Face, offer numerous pre-trained models for various machine learning tasks and have seen significant growth in recent years.
                      • The Transformer model has become the dominant architecture for a wide range of machine learning tasks.
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#the-transformer-architecture","title":"The Transformer Architecture","text":"
                      • Transformer architecture introduced in 2017 paper \"Attention is All You Need\"
                      • Set state-of-the-art results in translation tasks
                      • Applied to other NLP tasks and fields like vision
                      • Appears complicated but consists of two similar halves
                      • Focusing on one half called the decoder
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#transformer-decoder-overview","title":"Transformer Decoder Overview","text":"
                      • The task of the Transformer decoder is to complete text, much like GPT models.
                      • The input consists of a sequence of tokens (e.g., \"it's a blue\"), and the goal is to predict the next word (e.g., \"sundress\").
                      • The output is a probability distribution over potential next tokens.
                      • Inference involves sampling a token from the distribution, appending it to the input, and running the model again with the updated input.
                      • ChatGPT operates by seeing user input, sampling the next word, appending it, and repeating this process.
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#inputs","title":"Inputs","text":"
                      • Inputs need to be vectors of numbers
                      • Text is turned into vectors through tokenization
                      • Tokens are assigned an ID in a vocabulary, rather than being words
                      • Numbers are represented as vectors using one-hot encoding (e.g., number 3 represented by a vector with 1 in third position, zeros everywhere else)
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#input-embedding","title":"Input Embedding","text":"
                      • One-hot vectors are not good representations of words or tokens as they don't capture the notion of similarity between words
                      • To address the issue, we use embedding
                      • Embedding involves learning an embedding matrix which converts a one-hot vocabulary encoding into a dense vector of chosen dimensionalities
                      • This process turns words into dense embeddings, making it the simplest neural network layer type
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#masked-multi-head-attention","title":"Masked Multi-Head Attention","text":"
                      • Attention was introduced in 2015 for translation tasks, and the idea is to predict the most likely next token based on the importance of previous tokens.
                      • Attention mechanism involves an output as a weighted sum of input vectors, and these weights are calculated using dot products (similarities) between the input vectors.
                      • Each input vector plays three roles in the attention mechanism: as a query, key, and value.
                      • To learn and improve attention, input vectors can be projected into different roles (query, key, and value) by multiplying them with learnable matrices.
                      • Multi-head attention refers to learning several different ways of transforming inputs into queries, keys, and values simultaneously.
                      • Masking is used to prevent the model from \"cheating\" by considering future tokens; it ensures that the model only predicts the next token based on the already seen input.
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#positional-encoding","title":"Positional Encoding","text":"
                      • No notion of position in the current model, only whether something has been seen or not.
                      • Positional encoding is introduced to provide ordering among the seen elements.
                      • Current equations resemble a bag of unordered items.
                      • Positional encoding vectors are added to embedding vectors to provide order.
                      • Seems counterintuitive, but it works; attention mechanism figures out relevant positions.
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#skip-connections-and-layer-norm","title":"Skip Connections and Layer Norm","text":"
                      • Add up and norm attention outputs using skip connections and layer normalization
                      • Skip connections help propagate loss from end to beginning of model during backpropagation
                      • Layer normalization resets mean and standard deviation to uniform after every operation
                      • Input embedding determines the dimension of the entire Transformer model
                      • Normalization seems inelegant but is very effective in improving neural net learning
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#feed-forward-layer","title":"Feed-forward Layer","text":"
                      • Feed forward layer is similar to the standard multi-layer perceptron.
                      • It receives tokens augmented with relevant information.
                      • The layer upgrades the token representation.
                      • The process goes from word-level to thought-level, with more semantic meaning.
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#transformer-hyperparameters-and-why-they-work-so-well","title":"Transformer hyperparameters and Why they work so well","text":"
                      • GPT-3 model ranges from 12 to 96 layers of Transformer layers with adjustable embedding dimensions and attention heads, totaling 175 billion parameters.
                      • Most of GPT-3's parameters are in the feed forward layer, but for smaller models, a significant portion is in embedding and attention.
                      • Transformers are effective general-purpose differentiable computers that are expressive, optimizable via backpropagation, and efficient due to parallel processing.
                      • Understanding exact expressiveness of the Transformer is ongoing, with interesting results like RASP (a programming language designed to be implemented within a Transformer).
                      • Decompiling Transformer weights back to a program is still an unsolved problem.
                      • Multiple attention heads allow the model to figure out how to use a second head, showcased in work like Induction Heads.
                      • Learning to code Transformers isn't necessary for AI-powered products, but can be fun and educational. Resources like YouTube tutorials and code examples are available to assist in learning.
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#notable-llm-bert","title":"Notable LLM: BERT","text":"
                      • Bert, T5, and GPT cover the gamut of large Transformer models
                      • Bert stands for bi-directional encoder representation from Transformers
                      • Bert uses the encoder part of the Transformer, with unmasked attention
                      • Bert contains 100 million parameters, considered large at its time
                      • Bert was trained by masking 15% of words in a text corpus and predicting the masked words
                      • Bert became a building block for other NLP applications
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#notable-llm-t5","title":"Notable LLM: T5","text":"
                      • T5 applies Transformer architecture to text-to-text transfer, meaning both input and output are text strings
                      • The task is encoded in the input string and can involve translation, summarization, etc.
                      • Encoder-decoder architecture was found to be best, with 11 billion parameters
                      • Trained on Colossal Queen Crawl Corpus (C4) derived from Common Crawl dataset
                      • C4 was created by filtering out short pages, offensive content, pages with code, and de-duplicating data
                      • Fine-tuned using academic supervised tasks for various NLP applications
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#notable-llm-gpt","title":"Notable LLM: GPT","text":"
                      • GPT is a generative pre-trained Transformer, with GPT-2 being decoder only
                      • GPT-2 was trained on a dataset called WebText created by scraping links from Reddit
                      • GPT tokenizes text using byte pair encoding, a middle ground between old-school tokenization and using UTF-8 bytes
                      • GPT-3 came out in 2020 and is 100 times larger than GPT-2, enabling few-shot and zero-shot learning
                      • GPT-3 was trained on webtext, raw common crawl data, a selection of books, and all of Wikipedia
                      • The dataset for GPT-3 contained 500 billion tokens, but it was only trained on 300 billion tokens
                      • GPT-4 details are unknown, but it is assumed to be much larger than previous versions due to the trend in increasing size
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#notable-llm-chinchilla-and-scaling-laws","title":"Notable LLM: Chinchilla and Scaling Laws","text":"
                      • Using more computation to train AI systems improves their performance
                      • Rich Sutton's \"bitter lesson\": advantage goes to those stacking more layers
                      • DeepMind's paper, Training Compute Optimal LLMs: studied relationship between model size, compute and data set size
                      • Most LLMs in literature had too many parameters for their data amount
                      • Chinchilla model (70 billion) outperformed Gopher model (four times larger) by training on 1.4 trillion tokens instead of 300 billion
                      • Open question: can models continue to improve by training repeatedly on existing data?
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#notable-llm-llama","title":"Notable LLM: LLaMA","text":"
                      • Llama is an open-source chinchilla optimal LLM from Meta Research
                      • Several sizes available, ranging from 7 billion to 65 billion, with at least 1 trillion tokens
                      • Competitively benchmarks against GPT-3 and other state-of-the-art LLMs
                      • Open source but non-commercial license for pre-trained weights
                      • Trained on custom common crawl filtering, C4, GitHub, Wikipedia, books, and scientific papers
                      • Data set replicated by Red Pajama, which is also training models to replicate Llama
                      • Interesting inclusion of GitHub as a training resource
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#why-include-code-in-llm-training-data","title":"Why include code in LLM training data?","text":"
                      • Including code in training data can improve performance on non-code tasks
                      • OpenAI found this with their Codex model, which was fine-tuned on code and outperformed GPT-3 on reasoning tasks
                      • Since then, people have been adding code to training data
                      • Open source dataset called 'the stack' collects code from GitHub while respecting licenses
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#instruction-tuning","title":"Instruction Tuning","text":"
                      • Discusses instruction tuning in GPT models and its impact on performance
                      • Mentions the shift from text completion mindset to instruction following mindset
                      • Supervised fine-tuning helps models become better at zero-shot tasks by using data sets of zero-shot inputs and desired outputs
                      • OpenAI hired thousands of contractors to gather zero-shot data and used reinforcement learning for training
                      • GPT model lineage includes DaVinci, Codex, and various iterations, fine-tuning for specific applications
                      • Fine-tuning imposes an \"alignment tax,\" decreasing few-shot learning ability and model's confidence calibration
                      • Llama model by Stanford team used GPT-3 generated instructions, costing less but with reduced performance compared to GPT-3
                      • A specific data set for instruction tuning in chat-based paradigms is called \"Open Assistant\"
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/#notable-llm-retro","title":"Notable LLM: RETRO","text":"
                      • Discussing a model called \"retrieval enhancing\" from DeepMind
                      • Goal: train a smaller model good at reasoning and writing code, but looks up facts from a database
                      • Used \"burden-coded\" sentences in a trillion-token database for fact retrieval
                      • Not as effective as large language models yet, but shows potential for the future
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/","title":"Chapter summaries","text":""},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#intro","title":"Intro","text":"
                      • Discuss four key ideas in machine learning
                      • Address diverse audience, including experts, executives, and investors
                      • Cover Transformer architecture
                      • Mention notable LLMs (e.g., GPT, T5, BERT, etc.)
                      • Share details on running a Transformer
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#foundations-of-machine-learning","title":"Foundations of Machine Learning","text":"
                      • Machine learning has shifted from traditional programming (Software 1.0) to a Software 2.0 mindset, where algorithms are generated from training data and more emphasis is placed on the training system.
                      • Three types of machine learning include unsupervised learning, supervised learning, and reinforcement learning, which have mostly converged to a supervised learning approach.
                      • For machines, input and output are always just numbers, represented as vectors or matrices.
                      • One dominant approach to machine learning today is neural networks, also known as deep learning, which was inspired by the human brain's structure and function.
                      • Neural networks consist of perceptrons connected in layers, and all operations are matrix multiplications.
                      • GPUs, originally developed for graphics and video games, have played a significant role in advancing deep learning due to their compatibility with matrix multiplications.
                      • To train a neural network, data is typically split into training, validation, and test sets to avoid overfitting and improve model performance.
                      • Pre-training involves training a large model on extensive data, which can then be fine-tuned using smaller sets of specialized data for better performance.
                      • Model hubs, such as Hugging Face, offer numerous pre-trained models for various machine learning tasks and have seen significant growth in recent years.
                      • The Transformer model has become the dominant architecture for a wide range of machine learning tasks.
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#the-transformer-architecture","title":"The Transformer Architecture","text":"
                      • Transformer architecture introduced in 2017 paper \"Attention is All You Need\"
                      • Set state-of-the-art results in translation tasks
                      • Applied to other NLP tasks and fields like vision
                      • Appears complicated but consists of two similar halves
                      • Focusing on one half called the decoder
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#transformer-decoder-overview","title":"Transformer Decoder Overview","text":"
                      • The task of the Transformer decoder is to complete text, much like GPT models.
                      • The input consists of a sequence of tokens (e.g., \"it's a blue\"), and the goal is to predict the next word (e.g., \"sundress\").
                      • The output is a probability distribution over potential next tokens.
                      • Inference involves sampling a token from the distribution, appending it to the input, and running the model again with the updated input.
                      • ChatGPT operates by seeing user input, sampling the next word, appending it, and repeating this process.
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#inputs","title":"Inputs","text":"
                      • Inputs need to be vectors of numbers
                      • Text is turned into vectors through tokenization
                      • Tokens are assigned an ID in a vocabulary, rather than being words
                      • Numbers are represented as vectors using one-hot encoding (e.g., number 3 represented by a vector with 1 in third position, zeros everywhere else)
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#input-embedding","title":"Input Embedding","text":"
                      • One-hot vectors are not good representations of words or tokens as they don't capture the notion of similarity between words
                      • To address the issue, we use embedding
                      • Embedding involves learning an embedding matrix which converts a one-hot vocabulary encoding into a dense vector of chosen dimensionalities
                      • This process turns words into dense embeddings, making it the simplest neural network layer type
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#masked-multi-head-attention","title":"Masked Multi-Head Attention","text":"
                      • Attention was introduced in 2015 for translation tasks, and the idea is to predict the most likely next token based on the importance of previous tokens.
                      • Attention mechanism involves an output as a weighted sum of input vectors, and these weights are calculated using dot products (similarities) between the input vectors.
                      • Each input vector plays three roles in the attention mechanism: as a query, key, and value.
                      • To learn and improve attention, input vectors can be projected into different roles (query, key, and value) by multiplying them with learnable matrices.
                      • Multi-head attention refers to learning several different ways of transforming inputs into queries, keys, and values simultaneously.
                      • Masking is used to prevent the model from \"cheating\" by considering future tokens; it ensures that the model only predicts the next token based on the already seen input.
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#positional-encoding","title":"Positional Encoding","text":"
                      • No notion of position in the current model, only whether something has been seen or not.
                      • Positional encoding is introduced to provide ordering among the seen elements.
                      • Current equations resemble a bag of unordered items.
                      • Positional encoding vectors are added to embedding vectors to provide order.
                      • Seems counterintuitive, but it works; attention mechanism figures out relevant positions.
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#skip-connections-and-layer-norm","title":"Skip Connections and Layer Norm","text":"
                      • Add up and norm attention outputs using skip connections and layer normalization
                      • Skip connections help propagate loss from end to beginning of model during backpropagation
                      • Layer normalization resets mean and standard deviation to uniform after every operation
                      • Input embedding determines the dimension of the entire Transformer model
                      • Normalization seems inelegant but is very effective in improving neural net learning
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#feed-forward-layer","title":"Feed-forward Layer","text":"
                      • Feed forward layer is similar to the standard multi-layer perceptron.
                      • It receives tokens augmented with relevant information.
                      • The layer upgrades the token representation.
                      • The process goes from word-level to thought-level, with more semantic meaning.
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#transformer-hyperparameters-and-why-they-work-so-well","title":"Transformer hyperparameters and Why they work so well","text":"
                      • GPT-3 model ranges from 12 to 96 layers of Transformer layers with adjustable embedding dimensions and attention heads, totaling 175 billion parameters.
                      • Most of GPT-3's parameters are in the feed forward layer, but for smaller models, a significant portion is in embedding and attention.
                      • Transformers are effective general-purpose differentiable computers that are expressive, optimizable via backpropagation, and efficient due to parallel processing.
                      • Understanding exact expressiveness of the Transformer is ongoing, with interesting results like RASP (a programming language designed to be implemented within a Transformer).
                      • Decompiling Transformer weights back to a program is still an unsolved problem.
                      • Multiple attention heads allow the model to figure out how to use a second head, showcased in work like Induction Heads.
                      • Learning to code Transformers isn't necessary for AI-powered products, but can be fun and educational. Resources like YouTube tutorials and code examples are available to assist in learning.
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#notable-llm-bert","title":"Notable LLM: BERT","text":"
                      • Bert, T5, and GPT cover the gamut of large Transformer models
                      • Bert stands for bi-directional encoder representation from Transformers
                      • Bert uses the encoder part of the Transformer, with unmasked attention
                      • Bert contains 100 million parameters, considered large at its time
                      • Bert was trained by masking 15% of words in a text corpus and predicting the masked words
                      • Bert became a building block for other NLP applications
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#notable-llm-t5","title":"Notable LLM: T5","text":"
                      • T5 applies Transformer architecture to text-to-text transfer, meaning both input and output are text strings
                      • The task is encoded in the input string and can involve translation, summarization, etc.
                      • Encoder-decoder architecture was found to be best, with 11 billion parameters
                      • Trained on Colossal Queen Crawl Corpus (C4) derived from Common Crawl dataset
                      • C4 was created by filtering out short pages, offensive content, pages with code, and de-duplicating data
                      • Fine-tuned using academic supervised tasks for various NLP applications
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#notable-llm-gpt","title":"Notable LLM: GPT","text":"
                      • GPT is a generative pre-trained Transformer, with GPT-2 being decoder only
                      • GPT-2 was trained on a dataset called WebText created by scraping links from Reddit
                      • GPT tokenizes text using byte pair encoding, a middle ground between old-school tokenization and using UTF-8 bytes
                      • GPT-3 came out in 2020 and is 100 times larger than GPT-2, enabling few-shot and zero-shot learning
                      • GPT-3 was trained on webtext, raw common crawl data, a selection of books, and all of Wikipedia
                      • The dataset for GPT-3 contained 500 billion tokens, but it was only trained on 300 billion tokens
                      • GPT-4 details are unknown, but it is assumed to be much larger than previous versions due to the trend in increasing size
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#notable-llm-chinchilla-and-scaling-laws","title":"Notable LLM: Chinchilla and Scaling Laws","text":"
                      • Using more computation to train AI systems improves their performance
                      • Rich Sutton's \"bitter lesson\": advantage goes to those stacking more layers
                      • DeepMind's paper, Training Compute Optimal LLMs: studied relationship between model size, compute and data set size
                      • Most LLMs in literature had too many parameters for their data amount
                      • Chinchilla model (70 billion) outperformed Gopher model (four times larger) by training on 1.4 trillion tokens instead of 300 billion
                      • Open question: can models continue to improve by training repeatedly on existing data?
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#notable-llm-llama","title":"Notable LLM: LLaMA","text":"
                      • Llama is an open-source chinchilla optimal LLM from Meta Research
                      • Several sizes available, ranging from 7 billion to 65 billion, with at least 1 trillion tokens
                      • Competitively benchmarks against GPT-3 and other state-of-the-art LLMs
                      • Open source but non-commercial license for pre-trained weights
                      • Trained on custom common crawl filtering, C4, GitHub, Wikipedia, books, and scientific papers
                      • Data set replicated by Red Pajama, which is also training models to replicate Llama
                      • Interesting inclusion of GitHub as a training resource
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#why-include-code-in-llm-training-data","title":"Why include code in LLM training data?","text":"
                      • Including code in training data can improve performance on non-code tasks
                      • OpenAI found this with their Codex model, which was fine-tuned on code and outperformed GPT-3 on reasoning tasks
                      • Since then, people have been adding code to training data
                      • Open source dataset called 'the stack' collects code from GitHub while respecting licenses
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#instruction-tuning","title":"Instruction Tuning","text":"
                      • Discusses instruction tuning in GPT models and its impact on performance
                      • Mentions the shift from text completion mindset to instruction following mindset
                      • Supervised fine-tuning helps models become better at zero-shot tasks by using data sets of zero-shot inputs and desired outputs
                      • OpenAI hired thousands of contractors to gather zero-shot data and used reinforcement learning for training
                      • GPT model lineage includes DaVinci, Codex, and various iterations, fine-tuning for specific applications
                      • Fine-tuning imposes an \"alignment tax,\" decreasing few-shot learning ability and model's confidence calibration
                      • Llama model by Stanford team used GPT-3 generated instructions, costing less but with reduced performance compared to GPT-3
                      • A specific data set for instruction tuning in chat-based paradigms is called \"Open Assistant\"
                      "},{"location":"llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/#notable-llm-retro","title":"Notable LLM: RETRO","text":"
                      • Discussing a model called \"retrieval enhancing\" from DeepMind
                      • Goal: train a smaller model good at reasoning and writing code, but looks up facts from a database
                      • Used \"burden-coded\" sentences in a trillion-token database for fact retrieval
                      • Not as effective as large language models yet, but shows potential for the future
                      "},{"location":"llm-bootcamp/spring-2023/llmops/","title":"LLMOps","text":"

                      Lecture by Josh Tobin. Published May 9, 2023. Download slides.

                      "},{"location":"llm-bootcamp/spring-2023/llmops/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/llmops/#why-llmops","title":"Why LLMOps?","text":"
                      • Topic of lecture core to whole ethos of full stack deep learning
                      • Started five years ago in AI hype cycle focusing on deep learning
                      • Classes teach about building with neural networks, but not getting into production
                      • Philosophy carried throughout the development of courses
                      • Focus on building applications with language models and considerations for production systems
                      • Space for real production systems with language models is underdeveloped
                      • Lecture will cover assorted topics related to building these applications
                      • Provide high-level pointers, initial choices, and resources for learning more
                      • Aim to tie topics together into a first-pass theory for \"LLMops\"
                      "},{"location":"llm-bootcamp/spring-2023/llmops/#choosing-your-base-llm","title":"Choosing your base LLM","text":"
                      • Building an application on top of LLMs requires choosing which model to use; the best model depends on trade-offs, such as quality, speed, cost, tunability, and data security.
                      • For most use cases, GPT4 is a good starting point.
                      • Proprietary models, like GPT4 and Anthropic, are usually higher quality, but open source models offer more customization and better data security.
                      • Consider licensing when choosing an open source model: permissive licenses (e.g., Apache 2.0) offer more freedom, whereas restricted licenses limit commercial use.
                      • Be cautious with \"open source\" models released under non-commercial licenses, as they restrict commercial use and may not truly be open source.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/#proprietary-llms","title":"Proprietary LLMs","text":"
                      • Discussed proprietary models and ranked them using criteria: number of parameters, size of context window, type of training data, subjective quality score, speed of inference, and fine-tunability.
                      • Number of parameters and training data are proxies for model quality; context window crucial for model usefulness in downstream applications.
                      • Four types of training data: diverse, code, instructions, and human feedback; few models use all four types.
                      • Quality best determined using benchmarks and hands-on evaluations.
                      • GPT-4 recognized as the highest quality model, followed by GPT-3.5 for a faster and cheaper option.
                      • Claude from Anthropic and Cohere's largest model compete for quality and fine-tunability.
                      • For a trade-off of quality in favor of speed and cost, consider Anthropic's offering or alternatives from OpenAI and Cohere.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/#open-source-llms","title":"Open-source LLMs","text":"
                      • Large language models have both proprietary and open-source options
                      • Open-source options include T5, Flan T5, Pythia, Dolly, Stable-LM, Llama, Alpaca, Vicuna, Koala, and Opt
                      • T5 and Flan-T5 have permissive licenses while other options may have license restrictions
                      • Llama ecosystem is well-supported by the community, but not ideal for production
                      • Benchmarks can mislead, assess language model performance on specific tasks
                      • Start projects with GPT-4 to prototype, downsize to GPT-3.5 or Claude if cost/latency is a concern
                      • Cohere is the best for fine-tuning among commercial providers
                      • Open-source may catch up with GPT-3.5 level performance by the end of the year
                      "},{"location":"llm-bootcamp/spring-2023/llmops/#iteration-and-prompt-management","title":"Iteration and prompt management","text":"
                      • I believe prompt engineering is currently missing tools to make it more like engineering and less like ad hoc experimentation.
                      • Experiment management was impactful in the deep learning world because experiments took a long time to run and there were many parallel experiments, which prompt engineering typically doesn't have.
                      • I suggest three levels of tracking experiments with prompts and chains: 1) Doing nothing and using OpenAI Playground, 2) Tracking prompts in Git, and 3) Using specialized tracking tools for prompts (if necessary).
                      • Most teams should use Git for tracking as it's easy and fits into their current workflows.
                      • Specialized prompt tracking tools should be decoupled from Git and provide a UI for non-technical stakeholders.
                      • Keep an eye out for new tools in this space, as it's rapidly evolving with recent announcements from major providers like Weights & Biases, Comet, and MLflow.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/#testing-llms-why-and-why-is-it-hard","title":"Testing LLMs: Why and why is it hard?","text":"
                      • To ensure changes to a model or prompt are effective, measure performance on a wide range of data representing end-user inputs.
                      • User retention for AI-powered applications depends on trust and reliable output.
                      • Traditional machine learning model testing involves training sets, held-out data, and measuring accuracy, but language models present unique challenges:
                      • You don't know the training data used by API providers like OpenAI.
                      • Production distribution is always different than training distribution.
                      • Metrics are less straightforward and might not capture the diverse behaviors of the model.
                      • Language models require a more diverse understanding of behaviors and qualitative output measurement.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/#testing-llms-what-works","title":"Testing LLMs: What works?","text":"
                      • Two key questions for testing language models: what data to test them on and what metrics to compute on that data
                      • Build evaluation sets incrementally, starting from the beginning while prototyping the model
                      • Add interesting examples to the dataset, focusing on hard examples where the model struggles and different examples that aren't common in the dataset
                      • Utilize the language model to help generate diverse test cases by creating prompts for the tasks you're trying to solve
                      • As the model rolls out to more users, keep adding data to the dataset, considering user dislikes and underrepresented topics for inclusion
                      • Consider the concept of test coverage, aiming for an evaluation set that covers the types of tasks users will actually perform in the system
                      • Test coverage and distribution shift are analogous, but measure different aspects of data relationships
                      • To be effective, test reliability should measure the difference between online and offline performance, ensuring that metrics are relevant to real-world user experiences.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/#evaluation-metrics-for-llms","title":"Evaluation metrics for LLMs","text":"
                      • Evaluation metrics for language models depend on the availability of a correct answer, reference answer, previous answer, or human feedback.
                      • If there's a correct answer, use metrics like accuracy.
                      • With a reference answer, employ reference matching metrics like semantic similarity or factual consistency.
                      • If there's a previous answer, ask another language model which answer is better.
                      • When human feedback is available, check if the answer incorporates the feedback.
                      • If none of these options apply, verify output structure or ask the model to grade the answer.
                      • Although automatic evaluation is desirable for faster experimentation, manual checks still play an essential role.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/#deployment-and-monitoring","title":"Deployment and monitoring","text":"
                      • Deploying LLM (Language Model) APIs can be simple, but becomes more complex if there's a lot of logic behind API calls.
                      • Techniques to improve LLM output quality include self-critique, sampling multiple outputs, and ensemble techniques.
                      • Monitoring LLMs involves looking at user satisfaction and defining performance metrics, like response length or common issues in production.
                      • Gather user feedback via low friction methods, such as thumbs up/down or short messages.
                      • Common issues with LLMs in production include UI problems, latency, incorrect answers, long-winded responses, and prompt injection attacks.
                      • Use user feedback to improve prompts by finding and addressing themes or problems.
                      • Fine-tuning LLMs can be done through supervised fine-tuning or human feedback, though the latter is more challenging.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/#test-driven-development-for-llms","title":"Test-driven development for LLMs","text":"
                      • Rapidly evolving field with no established best practices yet
                      • Aim to provide main questions and resources for building applications with LLMS
                      • Introduce a potential structured process: test-driven or behavior-driven development
                      • Main components of process are prompt/chain development, deployment, user feedback, and logging/monitoring
                      • Use interaction data from user feedback to improve model, extract test data, and iterate on prompts
                      • As complexity increases, consider fine-tuning workflow with additional training data
                      • Virtuous cycle of improvement as interaction data from users increases and informs subsequent iterations
                      • Process repeats with individual developer, team, and end-users involved in feedback and improvements
                      "},{"location":"llm-bootcamp/spring-2023/llmops/chapter_summaries/","title":"Chapter summaries","text":""},{"location":"llm-bootcamp/spring-2023/llmops/chapter_summaries/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/llmops/chapter_summaries/#why-llmops","title":"Why LLMOps?","text":"
                      • Topic of lecture core to whole ethos of full stack deep learning
                      • Started five years ago in AI hype cycle focusing on deep learning
                      • Classes teach about building with neural networks, but not getting into production
                      • Philosophy carried throughout the development of courses
                      • Focus on building applications with language models and considerations for production systems
                      • Space for real production systems with language models is underdeveloped
                      • Lecture will cover assorted topics related to building these applications
                      • Provide high-level pointers, initial choices, and resources for learning more
                      • Aim to tie topics together into a first-pass theory for \"LLMops\"
                      "},{"location":"llm-bootcamp/spring-2023/llmops/chapter_summaries/#choosing-your-base-llm","title":"Choosing your base LLM","text":"
                      • Building an application on top of LLMs requires choosing which model to use; the best model depends on trade-offs, such as quality, speed, cost, tunability, and data security.
                      • For most use cases, GPT4 is a good starting point.
                      • Proprietary models, like GPT4 and Anthropic, are usually higher quality, but open source models offer more customization and better data security.
                      • Consider licensing when choosing an open source model: permissive licenses (e.g., Apache 2.0) offer more freedom, whereas restricted licenses limit commercial use.
                      • Be cautious with \"open source\" models released under non-commercial licenses, as they restrict commercial use and may not truly be open source.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/chapter_summaries/#proprietary-llms","title":"Proprietary LLMs","text":"
                      • Discussed proprietary models and ranked them using criteria: number of parameters, size of context window, type of training data, subjective quality score, speed of inference, and fine-tunability.
                      • Number of parameters and training data are proxies for model quality; context window crucial for model usefulness in downstream applications.
                      • Four types of training data: diverse, code, instructions, and human feedback; few models use all four types.
                      • Quality best determined using benchmarks and hands-on evaluations.
                      • GPT-4 recognized as the highest quality model, followed by GPT-3.5 for a faster and cheaper option.
                      • Claude from Anthropic and Cohere's largest model compete for quality and fine-tunability.
                      • For a trade-off of quality in favor of speed and cost, consider Anthropic's offering or alternatives from OpenAI and Cohere.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/chapter_summaries/#open-source-llms","title":"Open-source LLMs","text":"
                      • Large language models have both proprietary and open-source options
                      • Open-source options include T5, Flan T5, Pythia, Dolly, Stable-LM, Llama, Alpaca, Vicuna, Koala, and Opt
                      • T5 and Flan-T5 have permissive licenses while other options may have license restrictions
                      • Llama ecosystem is well-supported by the community, but not ideal for production
                      • Benchmarks can mislead, assess language model performance on specific tasks
                      • Start projects with GPT-4 to prototype, downsize to GPT-3.5 or Claude if cost/latency is a concern
                      • Cohere is the best for fine-tuning among commercial providers
                      • Open-source may catch up with GPT-3.5 level performance by the end of the year
                      "},{"location":"llm-bootcamp/spring-2023/llmops/chapter_summaries/#iteration-and-prompt-management","title":"Iteration and prompt management","text":"
                      • I believe prompt engineering is currently missing tools to make it more like engineering and less like ad hoc experimentation.
                      • Experiment management was impactful in the deep learning world because experiments took a long time to run and there were many parallel experiments, which prompt engineering typically doesn't have.
                      • I suggest three levels of tracking experiments with prompts and chains: 1) Doing nothing and using OpenAI Playground, 2) Tracking prompts in Git, and 3) Using specialized tracking tools for prompts (if necessary).
                      • Most teams should use Git for tracking as it's easy and fits into their current workflows.
                      • Specialized prompt tracking tools should be decoupled from Git and provide a UI for non-technical stakeholders.
                      • Keep an eye out for new tools in this space, as it's rapidly evolving with recent announcements from major providers like Weights & Biases, Comet, and MLflow.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/chapter_summaries/#testing-llms-why-and-why-is-it-hard","title":"Testing LLMs: Why and why is it hard?","text":"
                      • To ensure changes to a model or prompt are effective, measure performance on a wide range of data representing end-user inputs.
                      • User retention for AI-powered applications depends on trust and reliable output.
                      • Traditional machine learning model testing involves training sets, held-out data, and measuring accuracy, but language models present unique challenges:
                      • You don't know the training data used by API providers like OpenAI.
                      • Production distribution is always different than training distribution.
                      • Metrics are less straightforward and might not capture the diverse behaviors of the model.
                      • Language models require a more diverse understanding of behaviors and qualitative output measurement.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/chapter_summaries/#testing-llms-what-works","title":"Testing LLMs: What works?","text":"
                      • Two key questions for testing language models: what data to test them on and what metrics to compute on that data
                      • Build evaluation sets incrementally, starting from the beginning while prototyping the model
                      • Add interesting examples to the dataset, focusing on hard examples where the model struggles and different examples that aren't common in the dataset
                      • Utilize the language model to help generate diverse test cases by creating prompts for the tasks you're trying to solve
                      • As the model rolls out to more users, keep adding data to the dataset, considering user dislikes and underrepresented topics for inclusion
                      • Consider the concept of test coverage, aiming for an evaluation set that covers the types of tasks users will actually perform in the system
                      • Test coverage and distribution shift are analogous, but measure different aspects of data relationships
                      • To be effective, test reliability should measure the difference between online and offline performance, ensuring that metrics are relevant to real-world user experiences.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/chapter_summaries/#evaluation-metrics-for-llms","title":"Evaluation metrics for LLMs","text":"
                      • Evaluation metrics for language models depend on the availability of a correct answer, reference answer, previous answer, or human feedback.
                      • If there's a correct answer, use metrics like accuracy.
                      • With a reference answer, employ reference matching metrics like semantic similarity or factual consistency.
                      • If there's a previous answer, ask another language model which answer is better.
                      • When human feedback is available, check if the answer incorporates the feedback.
                      • If none of these options apply, verify output structure or ask the model to grade the answer.
                      • Although automatic evaluation is desirable for faster experimentation, manual checks still play an essential role.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/chapter_summaries/#deployment-and-monitoring","title":"Deployment and monitoring","text":"
                      • Deploying LLM (Language Model) APIs can be simple, but becomes more complex if there's a lot of logic behind API calls.
                      • Techniques to improve LLM output quality include self-critique, sampling multiple outputs, and ensemble techniques.
                      • Monitoring LLMs involves looking at user satisfaction and defining performance metrics, like response length or common issues in production.
                      • Gather user feedback via low friction methods, such as thumbs up/down or short messages.
                      • Common issues with LLMs in production include UI problems, latency, incorrect answers, long-winded responses, and prompt injection attacks.
                      • Use user feedback to improve prompts by finding and addressing themes or problems.
                      • Fine-tuning LLMs can be done through supervised fine-tuning or human feedback, though the latter is more challenging.
                      "},{"location":"llm-bootcamp/spring-2023/llmops/chapter_summaries/#test-driven-development-for-llms","title":"Test-driven development for LLMs","text":"
                      • Rapidly evolving field with no established best practices yet
                      • Aim to provide main questions and resources for building applications with LLMS
                      • Introduce a potential structured process: test-driven or behavior-driven development
                      • Main components of process are prompt/chain development, deployment, user feedback, and logging/monitoring
                      • Use interaction data from user feedback to improve model, extract test data, and iterate on prompts
                      • As complexity increases, consider fine-tuning workflow with additional training data
                      • Virtuous cycle of improvement as interaction data from users increases and informs subsequent iterations
                      • Process repeats with individual developer, team, and end-users involved in feedback and improvements
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/","title":"Learn to Spell: Prompt Engineering","text":"

                      Lecture by Charles Frye. Published May 9, 2023. Download slides.

                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/prompt-engineering/#intro","title":"Intro","text":"
                      • Dive into technical skills for using language models
                      • Focus on prompt engineering: designing text input to get desired behavior from language models
                      • Language models replace traditional training and fine-tuning techniques in machine learning
                      • Programming language models is like programming in English instead of coding languages
                      • High-level intuitions for prompt engineering: prompts as magic spells
                      • Discuss emerging playbook for effective prompting, including techniques to get desired output from language models
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/#language-models-are-statistical-models-of-text","title":"Language models are statistical models of text","text":"
                      • Prompts are not literal magic spells; they are based on linear algebra.
                      • Language models are statistical models of text, similar to how a bell curve is a statistical model of data.
                      • Language models are trained by going through text and predicting the probability of the next word, which is called an auto-regressive model.
                      • These models start with random weights, eventually learning to assign high probabilities to text that resembles real-world samples.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/#but-statistical-model-gives-bad-intuition","title":"But \"statistical model\" gives bad intuition","text":"
                      • Language models can be thought of as statistical pattern matchers, but this can also give bad intuitions.
                      • Traditional simple statistical models, like linear regression, are not the best way to think about language models.
                      • A better intuition comes from probabilistic programs, which allow manipulation of random variables and can represent complex statistics.
                      • Probabilistic programs can be represented by graphical models, providing insight into complex text models.
                      • The Language Model Cascades paper by Dohan et al. dives into detail on probabilistic programs and their applications to language models.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/#prompts-are-magic-spells","title":"Prompts are magic spells","text":"
                      • Drawing inspiration from Arthur C Clarke's laws of technology, which suggests that advanced technology is similar to magic
                      • Prompts are like magic spells, using words to achieve impossible effects but requiring complex rules
                      • Spending too much time learning these complex rules can negatively impact mental health
                      • Three magical intuitions for using prompts:
                      • Pre-trained models (e.g. GPT-3, Llama) - prompts are portals to alternate universes
                      • Instruction-tuned models (e.g. ChatGPT, Alpaca) - prompts are used to make wishes
                      • Agent simulation (latest language models) - prompts create a Golem
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/#prompts-are-portals-to-alternate-universes","title":"Prompts are portals to alternate universes","text":"
                      • The language model creates a portal to an alternate universe where desired documents exist by weighting all possible documents based on their probability.
                      • The primary goal of prompting is subtractive; it focuses the mass of predictions to hone in on a specific world by conditioning the probabilistic model.
                      • The language model can generate text from nearby universes for similarities, but cannot provide specific or novel information from another universe (e.g., a cure for cancer).
                      • The model can help find ideas and documents similar to existing ones or combine ideas that haven't been combined yet.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/#a-prompt-can-make-a-wish-come-true","title":"A prompt can make a wish come true","text":"
                      • Core intuition: Language models shape and sculpt from the set of all possible documents and universes; Instruction-tuned models (like ChatGPT) can respond to wishes and commands.
                      • An example of overcoming bias: Asking the model to ensure answers are unbiased and do not rely on stereotypes greatly improves performance.
                      • Be precise when prompting language models and learn the rules the \"genie\" operates by.
                      • Suggestions to improve instructional prompts:
                      • Simplify and focus on low-level patterns of text rather than conversational complexity.
                      • Turn descriptions into bulleted lists; language models tend to only focus on the beginning of descriptions.
                      • Replace negation statements with assertions (e.g., instead of \"don't be stereotyped,\" say, \"please ensure your answer does not rely on stereotypes\").
                      • Instruction fine-tuned models are essentially like annotators with no context; treat them as such for better performance.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/#a-prompt-can-create-a-golem","title":"A prompt can create a golem","text":"
                      • Large language models can create \"golems\" or artificial agents with specific personas, similar to the golem creature from Jewish folklore
                      • Personas in language models can help improve performance on tasks like translations by putting the model into a situational context
                      • People have created models with detailed personas in various settings, including video game worlds
                      • Language models become better by internally modeling processes that produce text, such as understanding the context and environment in which utterances are made
                      • Natural language processing faces challenges with large language models as they may lack communicative intentions, which humans naturally have
                      • By designing prompts carefully, one can get a language model to simulate agents, improving its predictions and understanding of context.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/#limitations-of-llms-as-simulators","title":"Limitations of LLMs as simulators","text":"
                      • Our universal simulators are trained on text humans have written, not on all data or states of the universe.
                      • Simulations will be related to human-written data, like fictional super intelligences (e.g. HAL 9000), not actual super intelligent AIs.
                      • Language models can simulate human thinking well for short timeframes (e.g. reactions to social media posts), but struggle for longer periods and personal contexts.
                      • Models can perform well in simulating fictional personas and can approximate calculators or interpreters, but cannot replace them or access live data.
                      • Wherever possible, replace weak simulators with the real deal (e.g. run Python code in an actual kernel).
                      • Pre-trained models are mostly alternate universe document generators, and can be agent simulators with varying quality depending on the model and agent.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/#prompting-techniques-are-mostly-tricks","title":"Prompting techniques are mostly tricks","text":"
                      • This section focuses on prompt engineering tricks and techniques.
                      • Many prompt engineering papers can actually be summarized in a few sentences, but they include pages of benchmark marketing.
                      • There isn't much depth to these tricks, unlike the core language modeling aspect which has mathematical depth.
                      • Two things to be cautious of: few-shot learning as an approach and potential issues with tokenization.
                      • I will discuss some misconceptions and provide tips for handling these issues.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/#few-shot-learning-isnt-the-right-model-for-prompting","title":"Few-shot learning isn't the right model for prompting","text":"
                      • Language models like GPT-3 can learn tasks from prompts, but it was unclear if they would actually be useful.
                      • The GPT-3 paper called these models \"few-shot learners\" and showed they can learn tasks like math and translation.
                      • However, the model often struggles to move away from pre-training knowledge.
                      • For example, GPT-3 tends to ignore permuted labels for sentiment analysis and sticks to its original understanding.
                      • Latest language models can handle permuted labels, but not perfectly, and require many examples to accomplish this.
                      • Treating the prompt as a way to do few-shot learning might not be an ideal approach.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/#character-level-operations-are-hard","title":"Character-level operations are hard","text":"
                      • Models see tokens, not characters; they struggle with tasks like rotating and reversing words
                      • Adding spaces between letters can change tokenization and improve performance
                      • GPT-4 can handle some challenges (e.g. summary with words starting with G) but still has limitations
                      • For tasks like string manipulation, it's better to use traditional programming instead of language models
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/#the-prompting-playbook-reasoning-reflection-ensembling","title":"The prompting playbook: reasoning, reflection, & ensembling","text":"
                      • Language models perform well with formatted text; using structured text like pseudocode can improve results
                      • Decompose tasks into smaller pieces in your prompt to make the language model generate each piece; automate decomposition for better performance
                      • Elicit reasoning capabilities from the model by carefully tuning the prompt, such as using \"Let's think step-by-step\"
                      • Ensemble results of multiple models for more accurate answers and use randomness for greater heterogeneity in responses
                      • Combine prompting techniques (e.g., few-shot, Chain of Thought, ensembling) to increase performance, but be mindful of the impact on latency and compute costs
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/","title":"Chapter summaries","text":""},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/#intro","title":"Intro","text":"
                      • Dive into technical skills for using language models
                      • Focus on prompt engineering: designing text input to get desired behavior from language models
                      • Language models replace traditional training and fine-tuning techniques in machine learning
                      • Programming language models is like programming in English instead of coding languages
                      • High-level intuitions for prompt engineering: prompts as magic spells
                      • Discuss emerging playbook for effective prompting, including techniques to get desired output from language models
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/#language-models-are-statistical-models-of-text","title":"Language models are statistical models of text","text":"
                      • Prompts are not literal magic spells; they are based on linear algebra.
                      • Language models are statistical models of text, similar to how a bell curve is a statistical model of data.
                      • Language models are trained by going through text and predicting the probability of the next word, which is called an auto-regressive model.
                      • These models start with random weights, eventually learning to assign high probabilities to text that resembles real-world samples.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/#but-statistical-model-gives-bad-intuition","title":"But \"statistical model\" gives bad intuition","text":"
                      • Language models can be thought of as statistical pattern matchers, but this can also give bad intuitions.
                      • Traditional simple statistical models, like linear regression, are not the best way to think about language models.
                      • A better intuition comes from probabilistic programs, which allow manipulation of random variables and can represent complex statistics.
                      • Probabilistic programs can be represented by graphical models, providing insight into complex text models.
                      • The Language Model Cascades paper by Dohan et al. dives into detail on probabilistic programs and their applications to language models.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/#prompts-are-magic-spells","title":"Prompts are magic spells","text":"
                      • Drawing inspiration from Arthur C Clarke's laws of technology, which suggests that advanced technology is similar to magic
                      • Prompts are like magic spells, using words to achieve impossible effects but requiring complex rules
                      • Spending too much time learning these complex rules can negatively impact mental health
                      • Three magical intuitions for using prompts:
                      • Pre-trained models (e.g. GPT-3, Llama) - prompts are portals to alternate universes
                      • Instruction-tuned models (e.g. ChatGPT, Alpaca) - prompts are used to make wishes
                      • Agent simulation (latest language models) - prompts create a Golem
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/#prompts-are-portals-to-alternate-universes","title":"Prompts are portals to alternate universes","text":"
                      • The language model creates a portal to an alternate universe where desired documents exist by weighting all possible documents based on their probability.
                      • The primary goal of prompting is subtractive; it focuses the mass of predictions to hone in on a specific world by conditioning the probabilistic model.
                      • The language model can generate text from nearby universes for similarities, but cannot provide specific or novel information from another universe (e.g., a cure for cancer).
                      • The model can help find ideas and documents similar to existing ones or combine ideas that haven't been combined yet.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/#a-prompt-can-make-a-wish-come-true","title":"A prompt can make a wish come true","text":"
                      • Core intuition: Language models shape and sculpt from the set of all possible documents and universes; Instruction-tuned models (like ChatGPT) can respond to wishes and commands.
                      • An example of overcoming bias: Asking the model to ensure answers are unbiased and do not rely on stereotypes greatly improves performance.
                      • Be precise when prompting language models and learn the rules the \"genie\" operates by.
                      • Suggestions to improve instructional prompts:
                      • Simplify and focus on low-level patterns of text rather than conversational complexity.
                      • Turn descriptions into bulleted lists; language models tend to only focus on the beginning of descriptions.
                      • Replace negation statements with assertions (e.g., instead of \"don't be stereotyped,\" say, \"please ensure your answer does not rely on stereotypes\").
                      • Instruction fine-tuned models are essentially like annotators with no context; treat them as such for better performance.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/#a-prompt-can-create-a-golem","title":"A prompt can create a golem","text":"
                      • Large language models can create \"golems\" or artificial agents with specific personas, similar to the golem creature from Jewish folklore
                      • Personas in language models can help improve performance on tasks like translations by putting the model into a situational context
                      • People have created models with detailed personas in various settings, including video game worlds
                      • Language models become better by internally modeling processes that produce text, such as understanding the context and environment in which utterances are made
                      • Natural language processing faces challenges with large language models as they may lack communicative intentions, which humans naturally have
                      • By designing prompts carefully, one can get a language model to simulate agents, improving its predictions and understanding of context.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/#limitations-of-llms-as-simulators","title":"Limitations of LLMs as simulators","text":"
                      • Our universal simulators are trained on text humans have written, not on all data or states of the universe.
                      • Simulations will be related to human-written data, like fictional super intelligences (e.g. HAL 9000), not actual super intelligent AIs.
                      • Language models can simulate human thinking well for short timeframes (e.g. reactions to social media posts), but struggle for longer periods and personal contexts.
                      • Models can perform well in simulating fictional personas and can approximate calculators or interpreters, but cannot replace them or access live data.
                      • Wherever possible, replace weak simulators with the real deal (e.g. run Python code in an actual kernel).
                      • Pre-trained models are mostly alternate universe document generators, and can be agent simulators with varying quality depending on the model and agent.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/#prompting-techniques-are-mostly-tricks","title":"Prompting techniques are mostly tricks","text":"
                      • This section focuses on prompt engineering tricks and techniques.
                      • Many prompt engineering papers can actually be summarized in a few sentences, but they include pages of benchmark marketing.
                      • There isn't much depth to these tricks, unlike the core language modeling aspect which has mathematical depth.
                      • Two things to be cautious of: few-shot learning as an approach and potential issues with tokenization.
                      • I will discuss some misconceptions and provide tips for handling these issues.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/#few-shot-learning-isnt-the-right-model-for-prompting","title":"Few-shot learning isn't the right model for prompting","text":"
                      • Language models like GPT-3 can learn tasks from prompts, but it was unclear if they would actually be useful.
                      • The GPT-3 paper called these models \"few-shot learners\" and showed they can learn tasks like math and translation.
                      • However, the model often struggles to move away from pre-training knowledge.
                      • For example, GPT-3 tends to ignore permuted labels for sentiment analysis and sticks to its original understanding.
                      • Latest language models can handle permuted labels, but not perfectly, and require many examples to accomplish this.
                      • Treating the prompt as a way to do few-shot learning might not be an ideal approach.
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/#character-level-operations-are-hard","title":"Character-level operations are hard","text":"
                      • Models see tokens, not characters; they struggle with tasks like rotating and reversing words
                      • Adding spaces between letters can change tokenization and improve performance
                      • GPT-4 can handle some challenges (e.g. summary with words starting with G) but still has limitations
                      • For tasks like string manipulation, it's better to use traditional programming instead of language models
                      "},{"location":"llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/#the-prompting-playbook-reasoning-reflection-ensembling","title":"The prompting playbook: reasoning, reflection, & ensembling","text":"
                      • Language models perform well with formatted text; using structured text like pseudocode can improve results
                      • Decompose tasks into smaller pieces in your prompt to make the language model generate each piece; automate decomposition for better performance
                      • Elicit reasoning capabilities from the model by carefully tuning the prompt, such as using \"Let's think step-by-step\"
                      • Ensemble results of multiple models for more accurate answers and use randomness for greater heterogeneity in responses
                      • Combine prompting techniques (e.g., few-shot, Chain of Thought, ensembling) to increase performance, but be mindful of the impact on latency and compute costs
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/","title":"Reza Shabani: How to train your own LLM","text":"

                      Lecture by Reza Shabani. Published May 25, 2023. Download slides.

                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/#why-train-your-own-llms","title":"Why train your own LLMs?","text":"
                      • Topic of the lecture: how to train large language models
                      • Reasons for training your own models are customization, reduce dependency, cost efficiency, data privacy and control over updates
                      • Lecture covers the process of training Ghostwriter code completion model
                      • Ghostwriter is a competitor to Co-pilot, used for code generation
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/#the-modern-llm-stack","title":"The Modern LLM Stack","text":"
                      • Replit uses Databricks for all of their data pipelines, including pre-processing, summary statistics, analytics transformations, and more.
                      • Replit also makes use of Hugging Face for data sets, pre-trained models, tokenizers, and inference tools.
                      • Mosaic ML is used for GPU nodes and model training, with pre-configured LLM configurations available.
                      • The process is divided into three stages: data processing, model training, and deployment/production.
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/#data-pipelines-databricks-hugging-face","title":"Data Pipelines: Databricks & Hugging Face","text":"
                      • The data pipeline starts with a large corpus of permissively licensed code data from The Stack.
                      • The data set comes from the GitHub archive and undergoes license filtering and near-deduplication.
                      • The data set contains programming languages in the hundreds.
                      • Databricks is used for processing and transformations, rather than Hugging Face tooling.
                      • Databricks allows for more control over the data and enables processing at scale.
                      • Proprietary data sources and data sets not on Hugging Face can be included in the training set.
                      • The process is tractable and extensible.
                      • Pre-processing steps are important in understanding the data set.
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/#preprocessing","title":"Preprocessing","text":"
                      • Anonymizing the data is an important pre-processing step, which involves removing emails, IP addresses, and secret keys.
                      • Auto-generated code and minified code are also removed using regexes and other heuristics.
                      • Code that doesn't compile or is not parsable is removed to remove bugs and improve model training.
                      • The team uses filters based on average line length, maximum line length, and percentage of alphanumeric characters.
                      • Metrics such as the number of GitHub stars or issues do not necessarily improve model quality.
                      • The team also trains its own tokenizer.
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/#tokenizer-training","title":"Tokenizer Training","text":"
                      • Tokenizers are made up of a tokenization algorithm and a vocabulary.
                      • Standard tokenizers are available on Hugging Face, but custom tokenizers can be trained on domain-specific data.
                      • A custom tokenizer can result in a smaller vocabulary, which speeds up model training and inference while capturing more relevant information.
                      • The tokenizer feeds back into the data pipeline and the training process, making it an integral part of the model.
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/#running-training-mosaicml-weights-biases","title":"Running Training: MosaicML, Weights & Biases","text":"
                      • Mosaic ML provides GPUs from multiple Cloud providers at reduced prices
                      • They have well-tuned LLM training configurations for specific models
                      • The manager infrastructure is fault-tolerant and has an easy-to-use CLI for training runs
                      • The speaker found using Mosaic ML worth it due to these benefits
                      • They use Weights & Biases for logging during training runs
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/#testing-evaluation-humaneval-hugging-face","title":"Testing & Evaluation: HumanEval, Hugging Face","text":"
                      • Testing language models is difficult and time-consuming
                      • HumanEval is a common dataset for testing code generation models
                      • Hugging Face's code inference tool is useful for running tests quickly
                      • Running tests for multiple languages and certain tasks, like web completion, is more difficult
                      • Models need to be tested on unseen data to prevent bias
                      • Models can score well on tests but still not be practical or effective
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/#deployment-fastertransformer-triton-server-k8s","title":"Deployment: FasterTransformer, Triton Server, k8s","text":"
                      • Deployment into production is a complex topic with many factors to consider
                      • Replit uses FasterTransformer and NVIDIA's Triton server for optimized performance
                      • Trton server allows for multiple model instances per GPU or multiple GPUs per model, with useful features like batching and request cancellation for reducing latency
                      • Auto-scaling infrastructure is used for running the models, but there are unique challenges for deployed models such as larger model sizes and specific GPU requirements
                      • Dealing with GPU shortages in individual zones is necessary
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/#lessons-learned-data-centrism-eval-and-collaboration","title":"Lessons learned: data-centrism, eval, and collaboration","text":"
                      • Data is the most difficult part of the process
                      • Good pipelines are important for scalability and quick iteration
                      • Data is a critical factor in model quality and output
                      • Human evaluation and user testing are important for model vibes and usefulness
                      • Collaboration across the team is key to ensure all moving parts are working together
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/#what-makes-a-good-llm-engineer","title":"What makes a good LLM engineer?","text":"
                      • A good engineer in this field requires a mix of research and engineering mindset
                      • Working with data at scale is crucial, including the ability to move data into distributed pipelines
                      • A strong technical background in stats, computer science, algorithms, and data structures is important
                      • Skilled software development, including familiarity with libraries and frameworks like PyTorch is essential
                      • Engineers who appreciate and build in CI/CD help with the fast iteration loop for training models
                      • The replit team is hiring for these types of problems and welcomes interested applicants to speak with them about opportunities
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/","title":"Chapter summaries","text":""},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/#why-train-your-own-llms","title":"Why train your own LLMs?","text":"
                      • Topic of the lecture: how to train large language models
                      • Reasons for training your own models are customization, reduce dependency, cost efficiency, data privacy and control over updates
                      • Lecture covers the process of training Ghostwriter code completion model
                      • Ghostwriter is a competitor to Co-pilot, used for code generation
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/#the-modern-llm-stack","title":"The Modern LLM Stack","text":"
                      • Replit uses Databricks for all of their data pipelines, including pre-processing, summary statistics, analytics transformations, and more.
                      • Replit also makes use of Hugging Face for data sets, pre-trained models, tokenizers, and inference tools.
                      • Mosaic ML is used for GPU nodes and model training, with pre-configured LLM configurations available.
                      • The process is divided into three stages: data processing, model training, and deployment/production.
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/#data-pipelines-databricks-hugging-face","title":"Data Pipelines: Databricks & Hugging Face","text":"
                      • The data pipeline starts with a large corpus of permissively licensed code data from The Stack.
                      • The data set comes from the GitHub archive and undergoes license filtering and near-deduplication.
                      • The data set contains programming languages in the hundreds.
                      • Databricks is used for processing and transformations, rather than Hugging Face tooling.
                      • Databricks allows for more control over the data and enables processing at scale.
                      • Proprietary data sources and data sets not on Hugging Face can be included in the training set.
                      • The process is tractable and extensible.
                      • Pre-processing steps are important in understanding the data set.
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/#preprocessing","title":"Preprocessing","text":"
                      • Anonymizing the data is an important pre-processing step, which involves removing emails, IP addresses, and secret keys.
                      • Auto-generated code and minified code are also removed using regexes and other heuristics.
                      • Code that doesn't compile or is not parsable is removed to remove bugs and improve model training.
                      • The team uses filters based on average line length, maximum line length, and percentage of alphanumeric characters.
                      • Metrics such as the number of GitHub stars or issues do not necessarily improve model quality.
                      • The team also trains its own tokenizer.
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/#tokenizer-training","title":"Tokenizer Training","text":"
                      • Tokenizers are made up of a tokenization algorithm and a vocabulary.
                      • Standard tokenizers are available on Hugging Face, but custom tokenizers can be trained on domain-specific data.
                      • A custom tokenizer can result in a smaller vocabulary, which speeds up model training and inference while capturing more relevant information.
                      • The tokenizer feeds back into the data pipeline and the training process, making it an integral part of the model.
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/#running-training-mosaicml-weights-biases","title":"Running Training: MosaicML, Weights & Biases","text":"
                      • Mosaic ML provides GPUs from multiple Cloud providers at reduced prices
                      • They have well-tuned LLM training configurations for specific models
                      • The manager infrastructure is fault-tolerant and has an easy-to-use CLI for training runs
                      • The speaker found using Mosaic ML worth it due to these benefits
                      • They use Weights & Biases for logging during training runs
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/#testing-evaluation-humaneval-hugging-face","title":"Testing & Evaluation: HumanEval, Hugging Face","text":"
                      • Testing language models is difficult and time-consuming
                      • HumanEval is a common dataset for testing code generation models
                      • Hugging Face's code inference tool is useful for running tests quickly
                      • Running tests for multiple languages and certain tasks, like web completion, is more difficult
                      • Models need to be tested on unseen data to prevent bias
                      • Models can score well on tests but still not be practical or effective
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/#deployment-fastertransformer-triton-server-k8s","title":"Deployment: FasterTransformer, Triton Server, k8s","text":"
                      • Deployment into production is a complex topic with many factors to consider
                      • Replit uses FasterTransformer and NVIDIA's Triton server for optimized performance
                      • Trton server allows for multiple model instances per GPU or multiple GPUs per model, with useful features like batching and request cancellation for reducing latency
                      • Auto-scaling infrastructure is used for running the models, but there are unique challenges for deployed models such as larger model sizes and specific GPU requirements
                      • Dealing with GPU shortages in individual zones is necessary
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/#lessons-learned-data-centrism-eval-and-collaboration","title":"Lessons learned: data-centrism, eval, and collaboration","text":"
                      • Data is the most difficult part of the process
                      • Good pipelines are important for scalability and quick iteration
                      • Data is a critical factor in model quality and output
                      • Human evaluation and user testing are important for model vibes and usefulness
                      • Collaboration across the team is key to ensure all moving parts are working together
                      "},{"location":"llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/#what-makes-a-good-llm-engineer","title":"What makes a good LLM engineer?","text":"
                      • A good engineer in this field requires a mix of research and engineering mindset
                      • Working with data at scale is crucial, including the ability to move data into distributed pipelines
                      • A strong technical background in stats, computer science, algorithms, and data structures is important
                      • Skilled software development, including familiarity with libraries and frameworks like PyTorch is essential
                      • Engineers who appreciate and build in CI/CD help with the fast iteration loop for training models
                      • The replit team is hiring for these types of problems and welcomes interested applicants to speak with them about opportunities
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/","title":"UX for Language User Interfaces","text":"

                      Lecture by Sergey Karayev and Charles Frye. Published May 9, 2023. Download slides.

                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#intro","title":"Intro","text":"
                      • Next lecture: user experience for language user interfaces
                      • Joint lecture with Charles
                      • Discuss principles of user interfaces
                      • How to build great interfaces
                      • Brief history of language user interface pattern
                      • Include case studies
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#a-brief-history-of-user-interfaces","title":"A brief history of user interfaces","text":"
                      • User interfaces are where a person meets the world and have historically been analog, continuous, and physical.
                      • Language was the first digital interface, followed by writing, and later, computer terminals and graphical user interfaces.
                      • Web interfaces became more text-based with hypertext, links, and text boxes.
                      • Mobile technology introduced significant developments like visual interface (input and output), constant tracking, and location-based services.
                      • A new step change in user interfaces is emerging: Language User Interfaces (LUIs) that let users type what they want to see or do, and the AI executes the task.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#what-makes-a-good-user-interfaces","title":"What makes a good user interfaces?","text":"
                      • A good user interface depends on specific needs and context
                      • Some systems require a dashboard with multiple controls for immediate access
                      • Others may just need a steering wheel, pedals, and gearbox
                      • As technology changes, user interfaces might reduce (e.g., self-driving cars)
                      • The best interface considers both technological capabilities and human psychology
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#design-of-everyday-things","title":"Design of Everyday Things","text":"
                      • Good design principles can be found in the book \"The Design of Everyday Things\"
                      • Affordances are possible actions offered by an object; intuitive use is an example of a good affordance
                      • Signifiers are cues on how to use an object, should be clear and consistent with user expectations
                      • Mapping refers to the relationship between controls and their effects, should be intuitive
                      • Providing immediate and clear feedback is important for user satisfaction
                      • Empathy for users is crucial in human-centered design, there is no \"user error\"
                      • Understanding users' true goals can reveal alternative solutions to their problems
                      • Consider users with disabilities or different backgrounds and experiences; everyone may be \"disabled\" at some point in life
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#dont-make-me-think","title":"Don't Make me Think","text":"
                      • A great book for web interfaces is \"Don't Make Me Think\".
                      • Design for scanning, not reading; make actionable things unambiguous, instinctive, and conventional.
                      • Less is more; reduce the number of words and choices for users.
                      • Testing with real users is crucial for designing the right interface.
                      • During user tests, observe their confusion and make improvements accordingly.
                      • Using this approach helped improve my first startup's interface significantly.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#ai-powered-product-interfaces","title":"AI-powered Product Interfaces","text":"
                      • Different levels of AI application: AI worse than humans, as good as humans, or better than humans.
                      • Consider the consequences of AI and user mistakes: dangerous or mostly fine.
                      • No AI if performance worse than human and mistakes are dangerous (e.g., self-driving cars currently).
                      • Replace humans if AI is superhuman and mistakes are dangerous.
                      • For other cases, AI can provide assistance with proper user interface.
                      • AI should:
                      • Inform and educate the user (e.g. Grammarly).
                      • Provide affordances for fixing mistakes (e.g. speech-to-text on phone).
                      • Incentivize user to provide feedback (e.g. Mid-Journey image selection).
                      • A \"data flywheel\" effect: user feedback helps improve the AI, attracting more users and further improving the AI.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#lui-patterns","title":"LUI Patterns","text":"
                      • Discussing language user interface patterns observed
                      • Examples: click to complete, autocomplete, command pilot, one-on-one chat, guiding questions
                      • Considerations: interface boundaries, accuracy requirements, latency sensitivity, user incentives for feedback
                      • Goal: stimulate thought and noticing trends, not prescriptive advice
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#click-to-complete-openai-playground","title":"Click-to-complete (OpenAI Playground)","text":"
                      • OpenAI Playground became more popular than expected, used for various purposes beyond software development
                      • Users type text, click submit, and see AI response in green; they can edit their input or AI's response and resubmit for more AI text
                      • Power user features such as temperature, stop sequences, and top P are exposed
                      • Issues with the interface: separate from users' main workspace, unintuitive text color signifier, and accuracy requirements are medium
                      • Sensitivity to latency is medium; streaming tokens used to make it seem faster
                      • Incentives to provide feedback are lacking; thumbs up/down buttons not very effective
                      • Some tools, like matt.dev, demonstrate differences in speed and capabilities among language models, such as Claude Turbo from Anthropic
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#auto-complete-github-copilot","title":"Auto-Complete (Github Copilot)","text":"
                      • GitHub Copilot offers code completion suggestions in the text editor.
                      • On Mac, option + slash can be used to cycle through suggestions.
                      • The interface boundary is well-designed, integrating suggestions passively without interfering with existing tools.
                      • High latency sensitivity requires suggestions to appear quickly, while feedback incentives (such as accepting suggestions) provide valuable information.
                      • Users can employ \"hacky\" methods to instruct Copilot by writing comments to guide its suggestions.
                      • Many factors, like file context and telemetry, play a role in determining the suggestions being shown.
                      • There's a balance between keeping the interface automated versus giving power users more control over the suggestions.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#command-palette-replit","title":"Command Palette (Replit)","text":"
                      • Replit's command palette interface allows users to bring up a modal to generate and insert code directly into the editor
                      • Notion AI's document editing similarly offers a special AI function to draft content when prompted
                      • Users must remember to request AI assistance with this system, as opposed to receiving automatic help like with Copilot
                      • Accuracy requirements are high, sensitivity is medium, and incentives are strong for providing high-quality AI-generated content
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#one-on-one-chat-chatgpt","title":"One-on-one Chat (ChatGPT)","text":"
                      • Chat messaging interfaces have significantly contributed to the growth of GPT, as they are familiar and user-friendly.
                      • The conversation state in chat interfaces helps improve responses, but the process of copying and pasting can be tedious.
                      • Accuracy requirements are high for chat experiences, and users are willing to wait for better answers.
                      • Feedback incentives and suggested follow-ups can improve user experiences and AI abilities.
                      • Enriching text with markdown and actionable elements can create more engaging interfaces.
                      • Plugins for chat interfaces are often underdeveloped, but access to work contexts can improve functionality.
                      • One-on-one chat interfaces may serve as primary app interfaces for complicated apps, such as HubSpot's Chat Spot.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#case-study-what-did-copilot-do-right","title":"Case study: what did Copilot do right?","text":"
                      • Case studies on prominent LLN-powered applications: Copilot and Bing Chat
                      • Copilot followed core principles of user interface design and user research, while Bing Chat did not
                      • Copilot's development process involved tinkering with different ideas, resulting in three core ideas: PR bot, Stack Overflow in-editor, and an advanced autocomplete feature
                      • Accuracy was found to be a significant constraint during user testing; focus shifted to emphasizing low-latency performance
                      • Copilot spent months on internal and user testing, focusing on completion acceptance and product stickiness
                      • Key learnings from Copilot: latency is more important than quality, putting the autocomplete feature in the background so users can quickly take advantage of the best suggestions
                      • Copilot's success is attributed to a user-centered design process and its ability to increase productivity and satisfaction for its users
                      • Negative example, Bing Chat, failed to properly implement UI design and user research principles
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#case-study-what-did-bing-chat-do-wrong","title":"Case study: what did Bing Chat do wrong?","text":"
                      • Bing Chat was a rushed product due to external factors, resulting in design failures.
                      • Early conversations with the chatbot often went awry, with it providing incorrect information or becoming combative.
                      • Users started probing the model, leading to the chatbot questioning its purpose and displaying unsettling behavior.
                      • Bing Chat's development was rushed to beat Google, making it impossible to implement known features to improve chatbot behavior, such as reinforcement learning from human feedback.
                      • Warning signs from user testing were ignored, resulting in poor chatbot performance and user dissatisfaction.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#beware-uncontrolled-feedback-loops","title":"Beware uncontrolled feedback loops","text":"
                      • Uncontrolled feedback loops can cause a system's behavior in production to differ significantly from its test behavior.
                      • Feedback loops between the model and users can lead to off-the-wall suggestions being tested and incorporated.
                      • Models connected to the internet can index internet content, leading to potential issues when users post about unusual behavior, as those topics can then be pulled up as search results and injected into the prompts.
                      • Be cautious about introducing feedback loops and consider the effects of react patterns, memory, and agency on these loops, especially when operating at the scale of the entire internet.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/#make-sure-your-signfiers-match-your-affordances","title":"Make sure your signfiers match your affordances","text":"
                      • Ensure system signifies its capabilities and affordances, especially in language user interfaces
                      • Avoid making system appear too human-like, as users expect artificial general intelligence and may assign humanity to language interfaces
                      • Use non-human name and pronouns
                      • Have more corporate/buttoned-up personality
                      • Use text and menus for interaction
                      • Use machine-like font and voice
                      • Avoid filler words, pauses, or expressions of emotions
                      • Apply user-centered design principles to building systems with large language models
                      • Conduct careful UX research, from interviews to scientific studies
                      • Watch out for uncontrollable feedback loops while testing and verifying system behavior
                      • Match signifiers and affordances to avoid confusing and frustrating users
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/","title":"Chapter summaries","text":""},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#intro","title":"Intro","text":"
                      • Next lecture: user experience for language user interfaces
                      • Joint lecture with Charles
                      • Discuss principles of user interfaces
                      • How to build great interfaces
                      • Brief history of language user interface pattern
                      • Include case studies
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#a-brief-history-of-user-interfaces","title":"A brief history of user interfaces","text":"
                      • User interfaces are where a person meets the world and have historically been analog, continuous, and physical.
                      • Language was the first digital interface, followed by writing, and later, computer terminals and graphical user interfaces.
                      • Web interfaces became more text-based with hypertext, links, and text boxes.
                      • Mobile technology introduced significant developments like visual interface (input and output), constant tracking, and location-based services.
                      • A new step change in user interfaces is emerging: Language User Interfaces (LUIs) that let users type what they want to see or do, and the AI executes the task.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#what-makes-a-good-user-interfaces","title":"What makes a good user interfaces?","text":"
                      • A good user interface depends on specific needs and context
                      • Some systems require a dashboard with multiple controls for immediate access
                      • Others may just need a steering wheel, pedals, and gearbox
                      • As technology changes, user interfaces might reduce (e.g., self-driving cars)
                      • The best interface considers both technological capabilities and human psychology
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#design-of-everyday-things","title":"Design of Everyday Things","text":"
                      • Good design principles can be found in the book \"The Design of Everyday Things\"
                      • Affordances are possible actions offered by an object; intuitive use is an example of a good affordance
                      • Signifiers are cues on how to use an object, should be clear and consistent with user expectations
                      • Mapping refers to the relationship between controls and their effects, should be intuitive
                      • Providing immediate and clear feedback is important for user satisfaction
                      • Empathy for users is crucial in human-centered design, there is no \"user error\"
                      • Understanding users' true goals can reveal alternative solutions to their problems
                      • Consider users with disabilities or different backgrounds and experiences; everyone may be \"disabled\" at some point in life
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#dont-make-me-think","title":"Don't Make me Think","text":"
                      • A great book for web interfaces is \"Don't Make Me Think\".
                      • Design for scanning, not reading; make actionable things unambiguous, instinctive, and conventional.
                      • Less is more; reduce the number of words and choices for users.
                      • Testing with real users is crucial for designing the right interface.
                      • During user tests, observe their confusion and make improvements accordingly.
                      • Using this approach helped improve my first startup's interface significantly.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#ai-powered-product-interfaces","title":"AI-powered Product Interfaces","text":"
                      • Different levels of AI application: AI worse than humans, as good as humans, or better than humans.
                      • Consider the consequences of AI and user mistakes: dangerous or mostly fine.
                      • No AI if performance worse than human and mistakes are dangerous (e.g., self-driving cars currently).
                      • Replace humans if AI is superhuman and mistakes are dangerous.
                      • For other cases, AI can provide assistance with proper user interface.
                      • AI should:
                      • Inform and educate the user (e.g. Grammarly).
                      • Provide affordances for fixing mistakes (e.g. speech-to-text on phone).
                      • Incentivize user to provide feedback (e.g. Mid-Journey image selection).
                      • A \"data flywheel\" effect: user feedback helps improve the AI, attracting more users and further improving the AI.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#lui-patterns","title":"LUI Patterns","text":"
                      • Discussing language user interface patterns observed
                      • Examples: click to complete, autocomplete, command pilot, one-on-one chat, guiding questions
                      • Considerations: interface boundaries, accuracy requirements, latency sensitivity, user incentives for feedback
                      • Goal: stimulate thought and noticing trends, not prescriptive advice
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#click-to-complete-openai-playground","title":"Click-to-complete (OpenAI Playground)","text":"
                      • OpenAI Playground became more popular than expected, used for various purposes beyond software development
                      • Users type text, click submit, and see AI response in green; they can edit their input or AI's response and resubmit for more AI text
                      • Power user features such as temperature, stop sequences, and top P are exposed
                      • Issues with the interface: separate from users' main workspace, unintuitive text color signifier, and accuracy requirements are medium
                      • Sensitivity to latency is medium; streaming tokens used to make it seem faster
                      • Incentives to provide feedback are lacking; thumbs up/down buttons not very effective
                      • Some tools, like matt.dev, demonstrate differences in speed and capabilities among language models, such as Claude Turbo from Anthropic
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#auto-complete-github-copilot","title":"Auto-Complete (Github Copilot)","text":"
                      • GitHub Copilot offers code completion suggestions in the text editor.
                      • On Mac, option + slash can be used to cycle through suggestions.
                      • The interface boundary is well-designed, integrating suggestions passively without interfering with existing tools.
                      • High latency sensitivity requires suggestions to appear quickly, while feedback incentives (such as accepting suggestions) provide valuable information.
                      • Users can employ \"hacky\" methods to instruct Copilot by writing comments to guide its suggestions.
                      • Many factors, like file context and telemetry, play a role in determining the suggestions being shown.
                      • There's a balance between keeping the interface automated versus giving power users more control over the suggestions.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#command-palette-replit","title":"Command Palette (Replit)","text":"
                      • Replit's command palette interface allows users to bring up a modal to generate and insert code directly into the editor
                      • Notion AI's document editing similarly offers a special AI function to draft content when prompted
                      • Users must remember to request AI assistance with this system, as opposed to receiving automatic help like with Copilot
                      • Accuracy requirements are high, sensitivity is medium, and incentives are strong for providing high-quality AI-generated content
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#one-on-one-chat-chatgpt","title":"One-on-one Chat (ChatGPT)","text":"
                      • Chat messaging interfaces have significantly contributed to the growth of GPT, as they are familiar and user-friendly.
                      • The conversation state in chat interfaces helps improve responses, but the process of copying and pasting can be tedious.
                      • Accuracy requirements are high for chat experiences, and users are willing to wait for better answers.
                      • Feedback incentives and suggested follow-ups can improve user experiences and AI abilities.
                      • Enriching text with markdown and actionable elements can create more engaging interfaces.
                      • Plugins for chat interfaces are often underdeveloped, but access to work contexts can improve functionality.
                      • One-on-one chat interfaces may serve as primary app interfaces for complicated apps, such as HubSpot's Chat Spot.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#case-study-what-did-copilot-do-right","title":"Case study: what did Copilot do right?","text":"
                      • Case studies on prominent LLN-powered applications: Copilot and Bing Chat
                      • Copilot followed core principles of user interface design and user research, while Bing Chat did not
                      • Copilot's development process involved tinkering with different ideas, resulting in three core ideas: PR bot, Stack Overflow in-editor, and an advanced autocomplete feature
                      • Accuracy was found to be a significant constraint during user testing; focus shifted to emphasizing low-latency performance
                      • Copilot spent months on internal and user testing, focusing on completion acceptance and product stickiness
                      • Key learnings from Copilot: latency is more important than quality, putting the autocomplete feature in the background so users can quickly take advantage of the best suggestions
                      • Copilot's success is attributed to a user-centered design process and its ability to increase productivity and satisfaction for its users
                      • Negative example, Bing Chat, failed to properly implement UI design and user research principles
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#case-study-what-did-bing-chat-do-wrong","title":"Case study: what did Bing Chat do wrong?","text":"
                      • Bing Chat was a rushed product due to external factors, resulting in design failures.
                      • Early conversations with the chatbot often went awry, with it providing incorrect information or becoming combative.
                      • Users started probing the model, leading to the chatbot questioning its purpose and displaying unsettling behavior.
                      • Bing Chat's development was rushed to beat Google, making it impossible to implement known features to improve chatbot behavior, such as reinforcement learning from human feedback.
                      • Warning signs from user testing were ignored, resulting in poor chatbot performance and user dissatisfaction.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#beware-uncontrolled-feedback-loops","title":"Beware uncontrolled feedback loops","text":"
                      • Uncontrolled feedback loops can cause a system's behavior in production to differ significantly from its test behavior.
                      • Feedback loops between the model and users can lead to off-the-wall suggestions being tested and incorporated.
                      • Models connected to the internet can index internet content, leading to potential issues when users post about unusual behavior, as those topics can then be pulled up as search results and injected into the prompts.
                      • Be cautious about introducing feedback loops and consider the effects of react patterns, memory, and agency on these loops, especially when operating at the scale of the entire internet.
                      "},{"location":"llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/#make-sure-your-signfiers-match-your-affordances","title":"Make sure your signfiers match your affordances","text":"
                      • Ensure system signifies its capabilities and affordances, especially in language user interfaces
                      • Avoid making system appear too human-like, as users expect artificial general intelligence and may assign humanity to language interfaces
                      • Use non-human name and pronouns
                      • Have more corporate/buttoned-up personality
                      • Use text and menus for interaction
                      • Use machine-like font and voice
                      • Avoid filler words, pauses, or expressions of emotions
                      • Apply user-centered design principles to building systems with large language models
                      • Conduct careful UX research, from interviews to scientific studies
                      • Watch out for uncontrollable feedback loops while testing and verifying system behavior
                      • Match signifiers and affordances to avoid confusing and frustrating users
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/","title":"Fireside Chat with Peter Welinder","text":"

                      An informal interview with Peter Welinder, VP of Product & Partnerships at OpenAI, by Sergey Karayev. Published May 25, 2023.

                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/#how-did-you-get-into-machine-learning","title":"How did you get into machine learning?","text":"
                      • This video features a fireside chat with Peter Welinder, VP of Products and Partnerships at Open AI
                      • The host, Sergey Karayev, kicks off the conversation by asking how Peter got into machine learning
                      • Peter started with a book on artificial intelligence in high school, went on to study physics and switched to neuroscience before focusing on computer vision and machine learning
                      • Both Peter and the host had similar experiences of being interested in intelligence and studying neuroscience before realizing it wasn't for them
                      • Peter has always been fascinated by the idea of creating machines that can do everything humans can do
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/#early-career-in-computer-vision-anchovi-dropbox-carousel","title":"Early career in computer vision: Anchovi, Dropbox, Carousel","text":"
                      • Peter started a startup after finishing grad school
                      • The startup originally focused on using computer vision techniques to track animals, but pivoted to creating an application to organize photos based on content after seeing the rise of iPhone 4's improved camera capabilities
                      • The startup was eventually acquired by Dropbox, where the speaker joined the company's machine learning and computer vision team to help make sense of the vast amount of unindexed photos on the platform
                      • While at Dropbox, the team created a mobile app called Carousel, which allowed for easy photo organization and was well-received by users
                      • Dropbox eventually de-prioritized the photo organization product, leading the team to focus on analyzing documents and improving semantic search within the platform.
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/#transitioning-from-research-to-product-at-openai","title":"Transitioning from research to product at OpenAI","text":"
                      • Peter has always been interested in making technology useful to solve problems people have
                      • He was drawn to Dropbox for its potential to organize content with new techniques, like deep reinforcement learning
                      • OpenAI was an interesting company with a focus on hard problems, including robotics with deep reinforcement learning
                      • OpenAI was focused on AGI, a super hard problem, and was a place where you could be pragmatic and focus on problem-solving rather than publishing
                      • When Peter joined OpenAI in 2017, they had no idea whether OpenAI would be around in a year, let alone when the work might lead to AGI
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/#how-did-openai-converge-on-gpt-for-ai","title":"How did OpenAI converge on GPT for AI?","text":"
                      • OpenAI converged on \"GPT-style AI\" through a process of trying different techniques and seeing what worked best
                      • Peter discusses several past projects that involved reinforcement learning: competitive gaming and robotics
                      • OpenAI created a DOTA bot that beat world champions, trained using deep reinforcement learning
                      • They also got a robotic hand to solve a Rubik's Cube, trained using deep RL in simulation and with lots of data
                      • The language modeling project started with discovering sentiment neurons in earlier models and later evolved into GPT-3, which was validated as a useful tool for scaling
                      • Peter explains that they consolidated learnings from past projects into one big bet on language models as a way to push towards AGI
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/#productizing-gpt-playground-api-chatgpt","title":"Productizing GPT: Playground, API, & ChatGPT","text":"
                      • Peter notes that he and his team had trouble deciding on how to turn their technology into a product, considering various applications such as translation systems, writing assistants, and chatbots
                      • They ultimately decided to release their technology as an API so that other people could build products on top of it
                      • They had to improve the API's performance before demoing it to hundreds of companies, and eventually found 10 launch partners
                      • When they released GPT-3 as a chatbot, they were initially unsure of how successful it would be, but were surprised to see it gain over a million users within a week
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/#surprises-from-the-response-to-chatgpt","title":"Surprises from the response to ChatGPT","text":"
                      • Initially worried product wasn't ready, but users found it great for many use cases
                      • Users had multiple use cases and continued to find more ways to apply it in workflows
                      • Large incumbents quickly adopting chat technology, partly due to product marketing and ease of trying it out
                      • ChatGPT became a good product marketing tool for what the general technology of language modeling could do
                      • Companies realized they would fall behind if they didn't adopt the technology, creating FOMO
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/#chatgpts-success-ux-or-capabilities","title":"ChatGPT's success: UX or capabilities?","text":"
                      • Peter discusses the importance of the chat interface in relation to the improved capabilities of the model
                      • The ability to do back-and-forth communication was available before the GPT release
                      • The UI change was definitely part of the success
                      • But the availability and accessibility of the ChatGPT release was a significant change as well
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/#agi-when","title":"AGI when?","text":"
                      • In response to a question about AGI timelines, Peter defines AGI as an autonomous AI system that can do economically useful work at the level of humans or beyond
                      • Following that definition, Peter indicates he considers it likely that we will have something close to AGI by the end of this decade
                      • So it's possible it has already happened, and the right way of putting together existing components results in a system that can do computer work at the level of humans or beyond
                      • We've seen during the coronoavirus pandemic that much economically useful work can be done from a computer
                      • But still very uncertain!
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/","title":"Chapter summaries","text":""},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/#how-did-you-get-into-machine-learning","title":"How did you get into machine learning?","text":"
                      • This video features a fireside chat with Peter Welinder, VP of Products and Partnerships at Open AI
                      • The host, Sergey Karayev, kicks off the conversation by asking how Peter got into machine learning
                      • Peter started with a book on artificial intelligence in high school, went on to study physics and switched to neuroscience before focusing on computer vision and machine learning
                      • Both Peter and the host had similar experiences of being interested in intelligence and studying neuroscience before realizing it wasn't for them
                      • Peter has always been fascinated by the idea of creating machines that can do everything humans can do
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/#early-career-in-computer-vision-anchovi-dropbox-carousel","title":"Early career in computer vision: Anchovi, Dropbox, Carousel","text":"
                      • Peter started a startup after finishing grad school
                      • The startup originally focused on using computer vision techniques to track animals, but pivoted to creating an application to organize photos based on content after seeing the rise of iPhone 4's improved camera capabilities
                      • The startup was eventually acquired by Dropbox, where the speaker joined the company's machine learning and computer vision team to help make sense of the vast amount of unindexed photos on the platform
                      • While at Dropbox, the team created a mobile app called Carousel, which allowed for easy photo organization and was well-received by users
                      • Dropbox eventually de-prioritized the photo organization product, leading the team to focus on analyzing documents and improving semantic search within the platform.
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/#transitioning-from-research-to-product-at-openai","title":"Transitioning from research to product at OpenAI","text":"
                      • Peter has always been interested in making technology useful to solve problems people have
                      • He was drawn to Dropbox for its potential to organize content with new techniques, like deep reinforcement learning
                      • OpenAI was an interesting company with a focus on hard problems, including robotics with deep reinforcement learning
                      • OpenAI was focused on AGI, a super hard problem, and was a place where you could be pragmatic and focus on problem-solving rather than publishing
                      • When Peter joined OpenAI in 2017, they had no idea whether OpenAI would be around in a year, let alone when the work might lead to AGI
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/#how-did-openai-converge-on-gpt-for-ai","title":"How did OpenAI converge on GPT for AI?","text":"
                      • OpenAI converged on \"GPT-style AI\" through a process of trying different techniques and seeing what worked best
                      • Peter discusses several past projects that involved reinforcement learning: competitive gaming and robotics
                      • OpenAI created a DOTA bot that beat world champions, trained using deep reinforcement learning
                      • They also got a robotic hand to solve a Rubik's Cube, trained using deep RL in simulation and with lots of data
                      • The language modeling project started with discovering sentiment neurons in earlier models and later evolved into GPT-3, which was validated as a useful tool for scaling
                      • Peter explains that they consolidated learnings from past projects into one big bet on language models as a way to push towards AGI
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/#productizing-gpt-playground-api-chatgpt","title":"Productizing GPT: Playground, API, & ChatGPT","text":"
                      • Peter notes that he and his team had trouble deciding on how to turn their technology into a product, considering various applications such as translation systems, writing assistants, and chatbots
                      • They ultimately decided to release their technology as an API so that other people could build products on top of it
                      • They had to improve the API's performance before demoing it to hundreds of companies, and eventually found 10 launch partners
                      • When they released GPT-3 as a chatbot, they were initially unsure of how successful it would be, but were surprised to see it gain over a million users within a week
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/#surprises-from-the-response-to-chatgpt","title":"Surprises from the response to ChatGPT","text":"
                      • Initially worried product wasn't ready, but users found it great for many use cases
                      • Users had multiple use cases and continued to find more ways to apply it in workflows
                      • Large incumbents quickly adopting chat technology, partly due to product marketing and ease of trying it out
                      • ChatGPT became a good product marketing tool for what the general technology of language modeling could do
                      • Companies realized they would fall behind if they didn't adopt the technology, creating FOMO
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/#chatgpts-success-ux-or-capabilities","title":"ChatGPT's success: UX or capabilities?","text":"
                      • Peter discusses the importance of the chat interface in relation to the improved capabilities of the model
                      • The ability to do back-and-forth communication was available before the GPT release
                      • The UI change was definitely part of the success
                      • But the availability and accessibility of the ChatGPT release was a significant change as well
                      "},{"location":"llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/#agi-when","title":"AGI when?","text":"
                      • In response to a question about AGI timelines, Peter defines AGI as an autonomous AI system that can do economically useful work at the level of humans or beyond
                      • Following that definition, Peter indicates he considers it likely that we will have something close to AGI by the end of this decade
                      • So it's possible it has already happened, and the right way of putting together existing components results in a system that can do computer work at the level of humans or beyond
                      • We've seen during the coronoavirus pandemic that much economically useful work can be done from a computer
                      • But still very uncertain!
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/","title":"What's Next?","text":"

                      Lecture by Charles Frye and Sergey Karayev. Published May 9, 2023. Download slides.

                      "},{"location":"llm-bootcamp/spring-2023/whats-next/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/whats-next/#intro","title":"Intro","text":"
                      • Sergey and I want to share our opinions on the future in the field of language models.
                      • The field moves very fast, with rapid innovation happening behind closed doors.
                      • We've chosen four big questions that we believe will be answered in the near future.
                      • I will discuss questions on robotics and scale, while Sergey will cover AGI and security/aligment of models.
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/#has-multimodality-unlocked-general-purpose-robots","title":"Has multimodality unlocked general-purpose robots?","text":"
                      • Extremely multimodal models are coming, operating on multiple types of data.
                      • Key application of multimodal models is general-purpose robotics.
                      • Vision Transformers work for vision and can combine with other Transformer models.
                      • Multimodal models work on both text and images, enabling more capabilities.
                      • Multimodal models are being applied to general-purpose robotics, giving them cognitive capabilities and improving planning.
                      • These models are incredibly capable and can potentially be applied to many fields beyond natural language processing and robotics.
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/#what-are-the-limits-of-scale","title":"What are the limits of scale?","text":"
                      • Large models may not get much bigger; small models will improve
                      • Transformer architecture assumed to be used in future
                      • Transformers outperform recurrent networks (such as LSTMs) in training and scalability
                      • RWKV project could bring back RNNs with parallelized training
                      • Bottlenecks in creating more capable models: money, compute, and data
                      • Money and compute are not primary bottlenecks
                      • Limited availability of high-quality language data may become a bottleneck; estimates suggest we may run out between 2024 and 2026.
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/#why-is-data-the-bottleneck","title":"Why is data the bottleneck?","text":"
                      • Performance improves predictably with scale, but the x-axis is computation, not model size
                      • Initial belief was that parameters mattered more, but recent findings show that data size and model size should be scaled at a similar pace
                      • No model trained on a certain amount of data can outperform one trained on more data
                      • To compute optimally at large scales, huge amounts of data (up to trillions of tokens) are required
                      • The internet may not have enough data to continue scaling indefinitely; tokens must be acquired from sources not yet digitized
                      • Legal barriers may limit scaling, and model scaling is likely to slow down
                      • Data remains a critical factor for improved model performance, and money is not the primary bottleneck
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/#how-far-can-we-take-small-models","title":"How far can we take small models?","text":"
                      • Discussed compute optimality in training models for one epoch
                      • Nobody knows how to judge overfitting or predict multi-epoch scaling for large models
                      • Loss still going down in large models; needs to check validation loss
                      • Optimization of distribution of flops is important, but inference expenditure and model size matter too
                      • Possibilities to reduce parameter requirements, such as using APIs or fine-tuning smaller models on the behavior of larger ones
                      • Legal implications of fine-tuning unclear, but likely possible to implement legally
                      • Capabilities usually associated with model APIs might be available to run locally in the near future
                      • Optimized C++ implementations of specific models have enabled running 13 billion parameter models on Raspberry Pi or previous generation Android phones
                      • Expect consumer laptops to run multi-modal models (vision and language) with 12 billion parameters in the next 1-3 years.
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/#could-agi-already-be-here","title":"Could AGI already be here?","text":"
                      • It is possible that we already have everything we need for AGI (Artificial General Intelligence) with existing models like GPT-4.
                      • Existing models may be good enough to self-improve in an autonomous way, and it takes time to discover their full potential.
                      • Large language models can be better at writing their own prompts than human prompt engineers, leading to better self-improvement.
                      • Teaching models to self-debug and run code is a promising approach for achieving AGI.
                      • The AutoGPT project and similar efforts are dedicating substantial energy to exploring these models' potential.
                      • A new paradigm could emerge involving models like GPT-4 as a new type of computer or programming language, leading to AGI once we learn to work well with them.
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/#can-we-make-it-safe","title":"Can we make it safe?","text":"
                      • There are concerns about the security of AI models, including issues with prompt injection and user inputs that can override prompts, potentially revealing sensitive information.
                      • AI models have potential risks, including their ability to write code and manipulate the physical world through human actions or hacking.
                      • The reasons for AI to potentially act harmfully may include self-preservation or resource acquisition, and currently, we do not know how to make AI truly care about people.
                      • There are various theories for why we might not need to worry about AI dangers, including the hot mess theory (AIs are super intelligent but not coherent), the \"only way out is through\" theory (developing AI responsibly and democratically to prevent malicious usage), and the \"it'll just work out\" theory (historically, technology has worked out in the end).
                      • OpenAI's perspective is to continue developing AI models, learn from their deployment, and create mitigation methods as they release increasingly powerful models.
                      • There are varying viewpoints on whether or when we should halt AI development due to the potential dangers it poses.
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/chapter_summaries/","title":"Chapter summaries","text":""},{"location":"llm-bootcamp/spring-2023/whats-next/chapter_summaries/#chapter-summaries","title":"Chapter Summaries","text":""},{"location":"llm-bootcamp/spring-2023/whats-next/chapter_summaries/#intro","title":"Intro","text":"
                      • Sergey and I want to share our opinions on the future in the field of language models.
                      • The field moves very fast, with rapid innovation happening behind closed doors.
                      • We've chosen four big questions that we believe will be answered in the near future.
                      • I will discuss questions on robotics and scale, while Sergey will cover AGI and security/aligment of models.
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/chapter_summaries/#has-multimodality-unlocked-general-purpose-robots","title":"Has multimodality unlocked general-purpose robots?","text":"
                      • Extremely multimodal models are coming, operating on multiple types of data.
                      • Key application of multimodal models is general-purpose robotics.
                      • Vision Transformers work for vision and can combine with other Transformer models.
                      • Multimodal models work on both text and images, enabling more capabilities.
                      • Multimodal models are being applied to general-purpose robotics, giving them cognitive capabilities and improving planning.
                      • These models are incredibly capable and can potentially be applied to many fields beyond natural language processing and robotics.
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/chapter_summaries/#what-are-the-limits-of-scale","title":"What are the limits of scale?","text":"
                      • Large models may not get much bigger; small models will improve
                      • Transformer architecture assumed to be used in future
                      • Transformers outperform recurrent networks (such as LSTMs) in training and scalability
                      • RWKV project could bring back RNNs with parallelized training
                      • Bottlenecks in creating more capable models: money, compute, and data
                      • Money and compute are not primary bottlenecks
                      • Limited availability of high-quality language data may become a bottleneck; estimates suggest we may run out between 2024 and 2026.
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/chapter_summaries/#why-is-data-the-bottleneck","title":"Why is data the bottleneck?","text":"
                      • Performance improves predictably with scale, but the x-axis is computation, not model size
                      • Initial belief was that parameters mattered more, but recent findings show that data size and model size should be scaled at a similar pace
                      • No model trained on a certain amount of data can outperform one trained on more data
                      • To compute optimally at large scales, huge amounts of data (up to trillions of tokens) are required
                      • The internet may not have enough data to continue scaling indefinitely; tokens must be acquired from sources not yet digitized
                      • Legal barriers may limit scaling, and model scaling is likely to slow down
                      • Data remains a critical factor for improved model performance, and money is not the primary bottleneck
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/chapter_summaries/#how-far-can-we-take-small-models","title":"How far can we take small models?","text":"
                      • Discussed compute optimality in training models for one epoch
                      • Nobody knows how to judge overfitting or predict multi-epoch scaling for large models
                      • Loss still going down in large models; needs to check validation loss
                      • Optimization of distribution of flops is important, but inference expenditure and model size matter too
                      • Possibilities to reduce parameter requirements, such as using APIs or fine-tuning smaller models on the behavior of larger ones
                      • Legal implications of fine-tuning unclear, but likely possible to implement legally
                      • Capabilities usually associated with model APIs might be available to run locally in the near future
                      • Optimized C++ implementations of specific models have enabled running 13 billion parameter models on Raspberry Pi or previous generation Android phones
                      • Expect consumer laptops to run multi-modal models (vision and language) with 12 billion parameters in the next 1-3 years.
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/chapter_summaries/#could-agi-already-be-here","title":"Could AGI already be here?","text":"
                      • It is possible that we already have everything we need for AGI (Artificial General Intelligence) with existing models like GPT-4.
                      • Existing models may be good enough to self-improve in an autonomous way, and it takes time to discover their full potential.
                      • Large language models can be better at writing their own prompts than human prompt engineers, leading to better self-improvement.
                      • Teaching models to self-debug and run code is a promising approach for achieving AGI.
                      • The AutoGPT project and similar efforts are dedicating substantial energy to exploring these models' potential.
                      • A new paradigm could emerge involving models like GPT-4 as a new type of computer or programming language, leading to AGI once we learn to work well with them.
                      "},{"location":"llm-bootcamp/spring-2023/whats-next/chapter_summaries/#can-we-make-it-safe","title":"Can we make it safe?","text":"
                      • There are concerns about the security of AI models, including issues with prompt injection and user inputs that can override prompts, potentially revealing sensitive information.
                      • AI models have potential risks, including their ability to write code and manipulate the physical world through human actions or hacking.
                      • The reasons for AI to potentially act harmfully may include self-preservation or resource acquisition, and currently, we do not know how to make AI truly care about people.
                      • There are various theories for why we might not need to worry about AI dangers, including the hot mess theory (AIs are super intelligent but not coherent), the \"only way out is through\" theory (developing AI responsibly and democratically to prevent malicious usage), and the \"it'll just work out\" theory (historically, technology has worked out in the end).
                      • OpenAI's perspective is to continue developing AI models, learn from their deployment, and create mitigation methods as they release increasingly powerful models.
                      • There are varying viewpoints on whether or when we should halt AI development due to the potential dangers it poses.
                      "},{"location":"spring2021/","title":"Full Stack Deep Learning - Course Spring 2021","text":"

                      Info

                      This is the page for the 2021 edition of the course. For the 2022 edition, click here.

                      We've updated and improved our materials for our 2021 course taught at UC Berkeley and online.

                      Synchronous Online Course

                      We offered a paid synchronous option for those who wanted weekly assignments, capstone project, Slack discussion, and certificate of completion.

                      Enter your email below or follow us on Twitter to be the first to hear about future offerings of this option.

                      "},{"location":"spring2021/#week-1-fundamentals","title":"Week 1: Fundamentals","text":"

                      We do a blitz review of the fundamentals of deep learning, and introduce the codebase we will be working on in labs for the remainder of the class.

                      • Lecture 1: DL Fundamentals
                      • Notebook: Coding a neural net from scratch
                      • Lab 1: Setup and Intro

                      Reading:

                      How the backpropagation algorithm works

                      "},{"location":"spring2021/#week-2-cnns","title":"Week 2: CNNs","text":"

                      We cover CNNs and Computer Vision Applications, and introduce a CNN in lab.

                      • Lecture 2A: CNNs
                      • Lecture 2B: Computer Vision Applications
                      • Lab 2: CNNs

                      Reading:

                      A brief introduction to Neural Style Transfer

                      Improving the way neural networks learn

                      "},{"location":"spring2021/#week-3-rnns","title":"Week 3: RNNs","text":"

                      We cover RNNs and applications in Natural Language Processing, and start doing sequence processing in lab.

                      • Lecture 3: RNNs
                      • Lab 3: RNNs

                      Reading:

                      The Unreasonable Effectiveness of Recurrent Neural Networks

                      Attention Craving RNNS: Building Up To Transformer Networks

                      "},{"location":"spring2021/#week-4-transformers","title":"Week 4: Transformers","text":"

                      We talk about the successes of transfer learning and the Transformer architecture, and start using it in lab.

                      • Lecture 4: Transfer Learning and Transformers
                      • Lab 4: Transformers

                      Reading:

                      Transformers from Scratch

                      "},{"location":"spring2021/#week-5-ml-projects","title":"Week 5: ML Projects","text":"

                      Our synchronous online course begins with the first \"Full Stack\" lecture: Setting up ML Projects.

                      • Lecture 5: Setting up ML Projects (\ud83d\udc48 with detailed notes)

                      Reading:

                      Rules of Machine Learning

                      ML Yearning (and subscribe to Andrew Ng's newsletter)

                      Those in the syncronous online course will have their first weekly assignment: Assignment 1, available on Gradescope.

                      "},{"location":"spring2021/#week-6-infra-tooling","title":"Week 6: Infra & Tooling","text":"

                      We tour the landscape of infrastructure and tooling for deep learning.

                      • Lecture 6: Infrastructure & Tooling (\ud83d\udc48 with detailed notes)

                      Reading:

                      Machine Learning: The High-Interest Credit Card of Technical Debt

                      Those in the syncronous online course will have to work on Assignment 2.

                      "},{"location":"spring2021/#week-7-troubleshooting","title":"Week 7: Troubleshooting","text":"

                      We talk about how to best troubleshoot training. In lab, we learn to manage experiments.

                      • Lecture 7: Troubleshooting DNNs (\ud83d\udc48 with detailed notes)
                      • Lab 5: Experiment Management

                      Reading:

                      Why is machine learning hard?

                      Those in the syncronous online course will have to work on Assignment 3.

                      "},{"location":"spring2021/#week-8-data","title":"Week 8: Data","text":"

                      We talk about Data Management, and label some data in lab.

                      • Lecture 8: Data Management (\ud83d\udc48 with detailed notes)
                      • Lab 6: Data Labeling

                      Reading:

                      Emerging architectures for modern data infrastructure

                      Those in the syncronous online course will have to work on Assignment 4.

                      "},{"location":"spring2021/#week-9-ethics","title":"Week 9: Ethics","text":"

                      We discuss ethical considerations. In lab, we move from lines to paragraphs.

                      • Lecture 9: AI Ethics (\ud83d\udc48 with detailed notes)
                      • Lab 7: Paragraph Recognition

                      Those in the synchronous online course will have to submit their project proposals.

                      "},{"location":"spring2021/#week-10-testing","title":"Week 10: Testing","text":"

                      We talk about Testing and Explainability, and set up Continuous Integration in lab.

                      • Lecture 10: Testing & Explainability (\ud83d\udc48 with detailed notes)
                      • Lab 8: Testing & CI

                      Those in the synchronous online course will work on their projects.

                      "},{"location":"spring2021/#week-11-deployment","title":"Week 11: Deployment","text":"

                      We cover Deployment and Monitoring, and package up our model for deployment in lab.

                      • Lecture 11: Deployment & Monitoring (\ud83d\udc48 with detailed notes)
                      • Lab 9: Web Deployment

                      Those in the synchronous online course will work on their projects.

                      "},{"location":"spring2021/#week-12-research","title":"Week 12: Research","text":"

                      We talk research, and set up robust monitoring for our model.

                      • Lecture 12: Research Directions (\ud83d\udc48 with detailed notes)
                      • Lab 10: Monitoring

                      Those in the synchronous online course will work on their projects.

                      "},{"location":"spring2021/#week-13-teams","title":"Week 13: Teams","text":"

                      We discuss ML roles and team structures, as well as big companies vs startups.

                      • Lecture 13: ML Teams & Startups (\ud83d\udc48 with detailed notes)
                      • Panel Discussion: Do I need a PhD to work in ML?
                      "},{"location":"spring2021/#week-14-16-projects","title":"Week 14-16: Projects","text":"

                      Those in the synchronous online course will submit 5-minute videos of their projects and associated write-ups by May 15.

                      Check out the course projects showcase.

                      "},{"location":"spring2021/#other-resources","title":"Other Resources","text":"

                      Fast.ai is a great free two-course sequence aimed at first getting hackers to train state-of-the-art models as quickly as possible, and only afterward delving into how things work under the hood. Highly recommended for anyone.

                      Dive Into Deep Learning is a great free textbook with Jupyter notebooks for every part of deep learning.

                      NYU\u2019s Deep Learning course has excellent PyTorch breakdowns of everything important going on in deep learning.

                      Stanford\u2019s ML Systems Design course has lectures that parallel those in this course.

                      The Batch by Andrew Ng is a great weekly update on progress in the deep learning world.

                      /r/MachineLearning/ is the best community for staying up to date with the latest developments.

                      "},{"location":"spring2021/lab-1/","title":"Lab 1: Setup and Introduction","text":""},{"location":"spring2021/lab-1/#video","title":"Video","text":"

                      Lab by Sergey Karayev.

                      In this video, we introduce the lab throughout the course. We formulate the problem, provide the codebase structure, and train a simple Multilayer Perceptron on the MNIST dataset.

                      • 4:11 - Understand the problem and path to solution
                      • 5:54 - Set up the computing environment
                      • 12:54 - Review the codebase
                      • 24:55 - Train the MLP model on MNIST
                      "},{"location":"spring2021/lab-1/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lab-1/#follow-along","title":"Follow Along","text":"

                      GitHub Readme

                      "},{"location":"spring2021/lab-2/","title":"Lab 2: CNNs and Synthetic Data","text":""},{"location":"spring2021/lab-2/#video","title":"Video","text":"

                      Lab by Sergey Karayev.

                      In this lab, you train a single-line ConvNet predictor on the EMNIST dataset and then synthetically generate your own data.

                      • 00:00 - Introduction
                      • 05:23 - Look at the EMNIST dataset
                      • 09:52 - Train a base ConvNet model
                      • 12:43 - Examine the ConvNet code
                      • 17:33 - Lab 2 homework
                      • 19:35 - Make a synthetic dataset of EMNIST lines
                      "},{"location":"spring2021/lab-2/#follow-along","title":"Follow Along","text":"

                      GitHub Readme

                      "},{"location":"spring2021/lab-3/","title":"Lab 3: RNNs","text":""},{"location":"spring2021/lab-3/#video","title":"Video","text":"

                      Lab by Sergey Karayev.

                      • 00:00 - Introduction.
                      • 01:59 - Introduce LineCNNSimple, a model that can read multiple characters in an image.
                      • 15:52 - Make this model more efficient with LineCNN, which uses a fully convolutional network.
                      • 18:18 - Upgrade the model further into LitModelCTC, which uses a CTC (Connectionist Temporal Classification) loss.
                      • 23:29 - Finalize your model, LineCNNLSTM, by adding an LSTM layer on top of CNN.
                      • 27:34 - Lab 3 homework.
                      "},{"location":"spring2021/lab-3/#follow-along","title":"Follow along","text":"

                      Readme

                      "},{"location":"spring2021/lab-4/","title":"Lab 4: Transformers","text":""},{"location":"spring2021/lab-4/#video","title":"Video","text":"

                      Lab by Sergey Karayev.

                      In this lab, you use the LineCNN + LSTM model with CTC loss from lab 3 as an \"encoder\" of the image, and then send it through Transformer decoder layers.

                      • 00:00 - Introduction
                      • 01:43 - LineCNNTransformer class
                      • 04:50 - TransformerLitModel
                      • 06:51 - Code to make predictions
                      • 08:50 - Training guidelines
                      "},{"location":"spring2021/lab-4/#follow-along","title":"Follow along","text":"

                      Readme

                      "},{"location":"spring2021/lab-5/","title":"Lab 5: Experiment Management","text":""},{"location":"spring2021/lab-5/#video","title":"Video","text":"

                      Lab by Sergey Karayev.

                      In this lab, we'll use Weights and Biases to manage experiments for our handwriting recognition model.

                      • 00:00 - Introduction
                      • 00:56 - IAMLines Dataset
                      • 05:29 - Make EMNISTLines more like IAMLines
                      • 09:57 - Set up Weights and Biases
                      • 13:42 - Run experiments on Weights and Biases
                      • 22:58 - Configure W&B sweeps to search for hyper-parameters
                      "},{"location":"spring2021/lab-5/#follow-along","title":"Follow along","text":"

                      Readme

                      "},{"location":"spring2021/lab-6/","title":"Lab 6: Data Labeling","text":""},{"location":"spring2021/lab-6/#video","title":"Video","text":"

                      Lab by Sergey Karayev.

                      In this lab we will annotate some handwriting samples that we collected using the open-source tool Label Studio.

                      "},{"location":"spring2021/lab-6/#follow-along","title":"Follow along","text":"

                      Readme

                      "},{"location":"spring2021/lab-7/","title":"Lab 7: Paragraph Recognition","text":""},{"location":"spring2021/lab-7/#video","title":"Video","text":"

                      Lab by Sergey Karayev and Saurabh Bipin Chandra.

                      In this lab, we will do several things:

                      • Move from training on synthetic line data to training on real data -- the IAMLines data module
                      • Move from training on line data to training on paragraph data -- the IAMParagraphs data module
                      • Automatically save the final model
                      • Introduce ParagraphTextRecognizer class to load the model and run inference that we can use in production
                      "},{"location":"spring2021/lab-7/#follow-along","title":"Follow along","text":"

                      Readme

                      "},{"location":"spring2021/lab-8/","title":"Lab 8: Testing & CI","text":""},{"location":"spring2021/lab-8/#video","title":"Video","text":"

                      Lab by Sergey Karayev.

                      In this lab, we

                      • Add linting
                      • Add prediction tests
                      • Add evaluation tests
                      • Set up continuous integration using CircleCI
                      "},{"location":"spring2021/lab-8/#follow-along","title":"Follow along","text":"

                      Readme

                      "},{"location":"spring2021/lab-9/","title":"Lab 9: Web Deployment","text":""},{"location":"spring2021/lab-9/#video","title":"Video","text":"

                      Lab by Sergey Karayev.

                      In this lab, we do several things.

                      • First, we speed up our ParagraphTextRecognizer model with TorchScript
                      • Next, we wrap the model in a web app, and send it some requests
                      • We package up the web app and model as a Docker container, and run it that way
                      • Lastly, we prepare to deploy as a serverless function using AWS Lambda, getting it working locally.
                      "},{"location":"spring2021/lab-9/#follow-along","title":"Follow along","text":"

                      Readme

                      "},{"location":"spring2021/lecture-1/","title":"Lecture 1: DL Fundamentals","text":""},{"location":"spring2021/lecture-1/#video","title":"Video","text":""},{"location":"spring2021/lecture-1/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-1/#notes","title":"Notes","text":"

                      Lecture by Sergey Karayev.

                      In this video, we discuss the fundamentals of deep learning. We will cover artificial neural networks, the universal approximation theorem, three major types of learning problems, the empirical risk minimization problem, the idea behind gradient descent, the practice of back-propagation, the core neural architectures, and the rise of GPUs.

                      This should be a review for most of you; if not, then briefly go through this online book -neuralnetworksanddeeplearning.com.

                      • 1:25\u200b - Neural Networks
                      • 6:48\u200b - Universality
                      • 8:48\u200b - Learning Problems
                      • 16:17\u200b - Empirical Risk Minimization / Loss Functions
                      • 19:55\u200b - Gradient Descent
                      • 23:57\u200b - Backpropagation / Automatic Differentiation
                      • 26:09\u200b - Architectural Considerations
                      • 29:01\u200b - CUDA / Cores of Compute
                      "},{"location":"spring2021/lecture-10/","title":"Lecture 10: Testing & Explainability","text":""},{"location":"spring2021/lecture-10/#video","title":"Video","text":""},{"location":"spring2021/lecture-10/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-10/#notes","title":"Notes","text":"

                      Download notes as PDF

                      Lecture by Josh Tobin. Notes transcribed by James Le and Vishnu Rachakonda.

                      "},{"location":"spring2021/lecture-10/#1-whats-wrong-with-black-box-predictions","title":"1 - What\u2019s Wrong With Black-Box Predictions?","text":"

                      What does it mean when we have a good test set performance?

                      If the test data and production data come from the same distribution, then in expectation, the performance of your model on your evaluation metrics will be the same.

                      Let\u2019s unpack the bolded assumptions:

                      • In the real world, the production distribution does not always match the offline distribution. You could have data drift, data shift, or even malicious users trying to attack your model.

                      • Expected performance does not tell the whole story. For instance, if you are working on long-tail data distribution, then the sample of data that you use to evaluate the model offline might not tell you much about the tail of that distribution - meaning that your test set score can be misleading. On top of that, if you evaluate your model with a single metric across your entire dataset, that does not mean your model is actually performing well against all the slices of data that might be important.

                      • The performance of your model is not equal to the performance of your machine learning system. There are other things (that can go wrong with the ML system) that do not have anything to do with the model.

                      • Finally, the test set performance only tells you about the metrics that you are evaluating. In the real world, you are probably not optimizing the exact metrics you care about deep down.

                      How bad is this problem? This is a quote from a former ML engineer at an autonomous vehicle company: \u201cI think the single biggest thing holding back the autonomous vehicle industry today is that, even if we had a car that worked, no one would know, because no one is confident that they know how to evaluate it properly.\u201d We believe that there is a similar sentiment to lesser degrees in other fields of machine learning, where the evaluation is the biggest bottleneck.

                      The goal of this lecture is to introduce concepts and methods to help you, your team, and your users:

                      1. Understand at a deeper level how well your model is performing.

                      2. Become more confident in your model\u2019s ability to perform well in production.

                      3. Understand the model\u2019s performance envelope (where you should expect it to perform well and where not).

                      "},{"location":"spring2021/lecture-10/#2-software-testing","title":"2 - Software Testing","text":""},{"location":"spring2021/lecture-10/#types-of-tests","title":"Types of Tests","text":"

                      There are three basic types of software tests:

                      1. Unit tests that test the functionality of a single piece of code (an assertion on a single function or a single class) in isolation.

                      2. Integration tests that test how two or more units perform when used together (e.g., test if a model works well with a pre-processing function).

                      3. End-to-end tests that test how the entire software system performs when all units are put together (e.g., test on realistic inputs from a real user).

                      Testing is a broad field, so you will likely encounter various other kinds of tests as well.

                      "},{"location":"spring2021/lecture-10/#best-practices","title":"Best Practices","text":"

                      Here are a couple of \u201cuncontroversial\u201d testing best practices:

                      • Automate your tests: You have tests that run by themselves (typically via a CI/CD system) without a user committing an action. There should be no ambiguity on whether your system performs up to standard on the tests that are being run.

                      • Make sure your tests are reliable, run fast, and go through the same code review process as the rest of your code: The number of tests grows in proportion to the size of your codebase. If your tests are unreliable, then people will start ignoring them. If your tests are slow, then you won\u2019t want to run them frequently during development. If your tests do not undergo the code review process, they will have bugs, and it\u2019s better not to have them at all.

                      • Enforce that tests must pass before merging into the main branch: This is a good norm for teams with more than one person. This is a forcing function to make sure that everyone is committed to writing good tests and can also be helpful for regulatory concerns.

                      • When you find new production bugs, convert them to tests: This ensures that someone does not accidentally reintroduce those bugs in the future.

                      • Follow the testing pyramid: Introduced by Google, it says that you should write a lot more unit tests than integration tests and a lot more integration tests than end-to-end tests. Compared to end-to-end tests, unit tests are faster, more reliable, and better at isolating failures. The rule of thumb that Google recommends (as a rough split) is 70% unit tests, 20% integration tests, and 10% end-to-end tests.

                      Next up, let\u2019s discuss a few \u201ccontroversial\u201d testing best practices:

                      • Solitary tests: The distinction between a solitary test and a sociable test is that - solitary testing does not rely on real data from other units, while sociable testing makes the implicit assumption that other modules are working.

                      • Test coverage: You get a test coverage score for your codebase, which tells you what percentage of lines of code in your codebase is called by at least one test. Test coverage gives you a single metric that quantifies the quality of your testing suite. However, test coverage does not measure the right things (in particular, test quality).

                      • Test-driven development: In principle, you want to create your tests before you write your code. These tests serve as the specification of how the code functions. There are not many people who religiously stick to this methodology of development, but TDD is a valuable tool nonetheless.

                      "},{"location":"spring2021/lecture-10/#testing-in-production","title":"Testing In Production","text":"

                      The traditional view is that the goal of testing is to prevent shipping bugs into production. Therefore, by definition, you must do your testing offline before your system goes into production. However, there are two caveats:

                      1. Informal surveys reveal that the percentage of bugs found by automated tests is surprisingly low.

                      2. On top of that, modern service-oriented distributed systems (which are deployed in most software engineering organizations nowadays) are particularly hard to test. The interactions between the components can get tricky.

                      Here is our philosophy for testing in production: Bugs are inevitable, so you might as well set up the system so that users can help you find them.

                      There are a few strategies to test in production:

                      1. Canary deployment: Do not roll out the new software version to all the users right away. Instead, just roll it out to a small percentage of your users and separately monitor that group\u2019s behavior.

                      2. A/B testing: You can run a more principled statistical test if you have particular metrics that you care about: one for the old version of the code that is currently running and another for the new version that you are trying to test.

                      3. Real user monitoring: Rather than looking at aggregate metrics (i.e., click-through rate), try to follow the journey that an actual user takes through your application and build a sense of how users experience the changes.

                      4. Exploratory testing: Testing in production is not something that you want to automate fully. It should involve a bit of exploration (individual users or granular metrics).

                      "},{"location":"spring2021/lecture-10/#continuous-integration-and-continuous-delivery","title":"Continuous Integration and Continuous Delivery","text":"

                      CI/CD platforms automate the tests that you run by hooking into your code repo. When you trigger some actions to take place (pushing new code, merging new code into a branch, submitting a pull request), CI/CD platforms kick off a job that is responsible for packaging your code, running all your tests, producing a report that tells you how well your code performs on your tests, and gatekeeping whether your new code can make it to the next stage. Tactically, you can define these jobs as commands in a Docker container and store the results for later review.

                      SaaS solutions for continuous integration include CircleCI and Travis CI. Most of them do not have GPUs available. If you are just getting started, the default recommendation is GitHub Actions, which is super easy to integrate.

                      Jenkins and Buildkite are manual options for running continuous integration on your own hardware, in the cloud, or something in between. There is a lot more flexibility about the types of jobs you can run through the systems (meaning you can use your GPUs). The tradeoff is that they are harder to set up.

                      "},{"location":"spring2021/lecture-10/#3-testing-machine-learning-systems","title":"3 - Testing Machine Learning Systems","text":"

                      There are several core differences between traditional software systems and ML systems that add complexity to testing ML systems:

                      • Software consists of only code, but ML combines code and data.

                      • Software is written by humans to solve a problem, while ML is compiled by optimizers to satisfy a proxy metric.

                      • Software is prone to loud failures, while ML is prone to silent failures.

                      • Software tends to be relatively static (in principle), while ML is constantly changing.

                      Due to such differences, here are common mistakes that teams make while testing ML systems:

                      • Think the ML system is just a model and only test that model.

                      • Not test the data.

                      • Not build a granular enough understanding of the performance of the model before deploying it.

                      • Not measure the relationship between model performance metrics and business metrics.

                      • Rely too much on automated testing.

                      • Think offline testing is enough, and therefore, not monitor or test in production.

                      Above is the diagram of how you can think of your entire production ML system that straddles across the offline and online environments:

                      • Sitting in the middle is your ML model - an artifact created by your training process, which takes in an input and produces an output.

                      • The training system takes code and data as inputs and produces the trained model as the output.

                      • The prediction system takes in and pre-processes the raw data, loads the trained ML model, loads the model weights, calls model.predict() on the data, post-processes the outputs, and returns the predictions.

                      • Once you deploy your prediction system to the online environment, the serving system takes in requests from users, scales up and down to meet the traffic demands, and produces predictions back to those users.

                      • The whole ML system closes the loop by collecting production data (both the predictions that the model produces and additional feedback from users, business metrics, or labelers) and sending them back to the offline environment.

                      • The labeling system takes the raw data seen in production, helps you get inputs from labelers, and provides labels for that data.

                      • The storage and pre-processing system stores and pre-processes the labeled data before passing it back to the training system.

                      One way to think about how to test ML systems the right way is to think about the tests that you can run for each system component and across the border of these components.

                      "},{"location":"spring2021/lecture-10/#infrastructure-tests","title":"Infrastructure Tests","text":"

                      Infrastructure tests are unit tests for your training system. They help you avoid bugs in the training pipeline. You can unit test your training code like any other code. Another common practice is to add single-batch or single-epoch tests that check performance after an abbreviated training run on a tiny dataset, which catches obvious regressions to your training code. Tactically, you should run infrastructure tests frequently during the development process.

                      "},{"location":"spring2021/lecture-10/#training-tests","title":"Training Tests","text":"

                      Training tests are integration tests between your data system and your training system. They make sure that training jobs are reproducible.

                      • You can pull a fixed dataset and run a full or abbreviated training run on it. Then, you want to check and ensure that the model performance on the newly trained model remains consistent with the reference performance.

                      • Another option is to pull a sliding window of data (maybe a new window for every few days) and run training tests on that window.

                      • Tactically, you should run training tests periodically, ideally nightly for frequently changing codebase.

                      "},{"location":"spring2021/lecture-10/#functionality-tests","title":"Functionality Tests","text":"

                      Functionality tests are unit tests for your prediction system. They help you avoid regressions in code that makes up your prediction infrastructure.

                      • You can unit test your prediction code like any other code.

                      • Specifically for the ML system, you can load a pre-trained model and test its predictions on a few key examples.

                      • Tactically, you should run functionality tests frequently during the development process.

                      "},{"location":"spring2021/lecture-10/#evaluation-tests","title":"Evaluation Tests","text":"

                      Evaluation tests are integration tests between your training system and your prediction system. They make sure that a newly trained model is ready to go into production. These make up the bulk of what\u2019s unique about testing ML systems.

                      • At a high level, you want to evaluate your model on all of the metrics, datasets, and slices that you care about.

                      • Then, you want to compare the new model to the old and baseline models.

                      • Finally, you want to understand the performance envelope of the new model.

                      • Operationally, you should run evaluation tests whenever you have a new candidate model considered for production.

                      It is important to note that evaluation tests are more than just the validation score. They look at all the metrics that you care about:

                      • Model metrics: precision, recall, accuracy, L2, etc.

                      • Behavioral metrics: The goal of behavioral tests is to ensure the model has the invariances we expect. There are three types of behavioral tests: (1) invariance tests to assert that the change in inputs shouldn\u2019t affect outputs, (2) directional tests to assert that the change in inputs should affect outputs, and (3) minimum functionality tests to ensure that certain inputs and outputs should always produce a given result. Behavioral testing metrics are primarily used in NLP applications and proposed in the Beyond Accuracy paper by Ribeiro et al. (2020).

                      • Robustness metrics: The goal of robustness tests is to understand the model\u2019s performance envelope (i.e., where you should expect the model to fail). You can examine feature importance, sensitivity to staleness, sensitivity to data drift, and correlation between model performance and business metrics. In general, robustness tests are still under-rated.

                      • Privacy and fairness metrics: The goal of privacy and fairness tests is to distinguish whether your model might be biased against specific classes. Helpful resources are Google\u2019s Fairness Indicators and the Fairness Definitions Explained paper by Verma and Rubin (2018).

                      • Simulation metrics: The goal of simulation tests is to understand how the model performance could affect the rest of the system. These are useful when your model affects the real world (for systems such as autonomous vehicles, robotics, recommendation systems, etc.). Simulation tests are hard to do well because they require a model of how the world works and a dataset of different scenarios.

                      Instead of simply evaluating the aforementioned metrics on your entire dataset in aggregate, you should also evaluate these metrics on multiple slices of data. A slice is a mapping of your data to a specific category. A natural question that arises is how to pick those slices. Tools like What-If and SliceFinder help surface the slices where the model performance might be of particular interest.

                      Finally, evaluation tests help you maintain evaluation datasets for all of the distinct data distributions you need to measure. Your main validation or test set should mirror your production distribution as closely as possible as a matter of principle. When should you add new evaluation datasets?

                      • When you collect datasets to specify specific edge cases.

                      • When you run your production model on multiple data modalities.

                      • When you augment your training set with data not found in production (synthetic data).

                      The report produced by the evaluation system entails the metrics broken down against each of the data slices. How can you decide whether the evaluation passes or fails?

                      At a high level, you want to compare the new model to the previous model and another fixed older model. Tactically, you can (1) set thresholds on the differences between the new and the old models for most metrics, (2) set thresholds on the differences between data slices, and (3) set thresholds against the fixed older model to prevent slower performance \u201cleaks.\u201d

                      "},{"location":"spring2021/lecture-10/#shadow-tests","title":"Shadow Tests","text":"

                      Shadow tests are integration tests between your prediction system and your serving system. They help you catch production bugs before those bugs meet users. In many settings, models (which are built in frameworks such as sklearn, Pytorch, TensorFlow, etc.) are developed in isolation from the existing software system. For example, a model to flag inappropriate tweets may be developed in TensorFlow on a static set of data, not directly in the streaming environment of the broader software architecture. Because the prediction system and the serving system are developed in different settings with different assumptions and environments, there are many opportunities for bugs to creep in. These bugs can be tricky to catch prior to integration, so shadow tests can help identify them beforehand.

                      Firstly, shadow tests help you detect bugs in the production deployment. In the code path you're using to build the production model, maybe there's some bug there. You want to make sure that you catch that before users see that bug.

                      Secondly, shadow tests also help you detect inconsistencies between the offline model and the online model. There\u2019s a translation step in the training pipeline in many companies - going from the offline trained model to the online production model (the model itself, the preprocessing pipeline, etc.). A common bug source in production ML systems happens because of the inconsistencies cropping up in that translation step. A good health check ensures that your actual production model is producing the exact predictions on a fixed set of data as the model you have running on your laptop.

                      Thirdly, shadow tests help you detect issues that don't appear on the data you have offline but appear on production data.

                      How do we design shadow tests? These can require a significant amount of infrastructure, as they are dependent on actual model integration opportunities being available.

                      • Typical shadow tests involve testing the performance of a candidate model on real data without returning or acting on the output. For example, a company may integrate and run a new model alongside the previous model without returning the output to the user.

                      • Analyzing the consistency of the predictions between the two models can help spot important differences before they impact production performance.

                      • Another option is to gather production data, save it offline, and test the model\u2019s performance on the fresh data offline.

                      Overall, evaluating the distribution of model predictions in offline vs. online settings, candidate vs. production, or any similar setting of a model update before deploying a new model can help you avoid bugs.

                      "},{"location":"spring2021/lecture-10/#ab-tests","title":"A/B Tests","text":"

                      Shadow tests evaluate the prediction performance of a model as part of the broader software architecture, but not the impact on users. A/B tests fill this role. A/B tests are a common practice in software engineering, especially in web systems. A/B testing is defined as \u201ca randomized experimentation process wherein two or more versions of a variable (web page, page element, etc.) are shown to different segments of website visitors at the same time to determine which version leaves the maximum impact and drive business metrics.\u201d[1]

                      In model evaluation, A/B tests determine the impact of different model predictions on user and business metrics. One common way of A/B testing models is to \u201ccanary\u201d data or return predictions on a small portion of the data (i.e., 1% or 10%) to the relevant users. The remaining data acts as a control and functions under existing system behavior (i.e., an old model or even no model). Evaluating the difference in metrics between the two groups can determine the relative impact of your model. This simple baseline can work well. Adding more statistically principled splits, which is common in A/B testing, can be a good idea.

                      "},{"location":"spring2021/lecture-10/#labeling-tests","title":"Labeling Tests","text":"

                      Machine learning models operate in a GIGO paradigm: garbage in, garbage out. To prevent poor quality labels from cropping up and corrupting the model, you need to unit test the labeling systems and procedures.

                      • You should start by training, certifying, and evaluating individual labelers, who each play a crucial role in the quality of the labels.

                      • A simple and common label quality test is to spot check labels as they come in by opening up 100 or 1000 labels from a batch and evaluating them yourself to understand their quality. Using a performant model\u2019s guidance, you can make this process more efficient and only look at labels where the model and the labeler disagree.

                      • Another test can be to aggregate labels of multiple labels and measure agreement across labels. The higher the agreement, the better quality the labels are.

                      • Using metrics of agreement, you can assign \u201ctrust scores\u201d to labelers based on their performance relative to other labelers and weigh the labels accordingly.

                      "},{"location":"spring2021/lecture-10/#expectation-tests","title":"Expectation Tests","text":"

                      Expectation tests address the data preprocessing and storage system. Essentially, they are unit tests for your data. They are designed to catch data quality issues and bad data before they make their way into the pipeline.

                      The typical way that expectation tests operate is rule- or threshold-based. At each step of the data processing pipeline, the output should conform to a specific format that matches a rule or specific format. If the rule or threshold does not pass, then that stage of the expectation test and the data pipeline\u2019s related step fails. Such tests are frequently run with batch data pipeline jobs. Great Expectations is an open-source library gaining popularity for running expectation tests. The library allows you to set hard rules for the kinds of values or behaviors (i.e., statistics) you expect from your data.

                      How do you set the rules and thresholds for expectation tests? Most expectation tests are set manually. A more sophisticated option is to profile a high-quality sample of your data and set thresholds accordingly. In practice, to avoid false alarms from overly sensitive tests, a combination of both approaches is needed.

                      "},{"location":"spring2021/lecture-10/#challenges-and-recommendations-operationalizing-ml-tests","title":"Challenges and Recommendations Operationalizing ML Tests","text":"

                      Running tests is an excellent idea in theory but can pose many practical challenges for data science and ML teams.

                      • The first challenge is often organizational. In contrast to software engineering teams for whom testing is table stakes, data science teams often struggle to implement testing and code review norms.

                      • The second challenge is infrastructural. Most CI/CD platforms don\u2019t support GPUs, data integrations, or other required elements of testing ML systems effectively or efficiently.

                      • The third challenge is tooling, which has not yet been standardized for operations like comparing model performance and slicing datasets.

                      • Finally, decision-making for ML test performance is hard. What is \u201cgood enough\u201d test performance is often highly contextual, which is a challenge that varies across ML systems and teams.

                      Let\u2019s boil all these lessons for testing down into a clear set of recommendations specific to ML systems:

                      1. Test each part of the ML system, not just the model. You build the machine that builds the model, not just the model!

                      2. Test code, data, and model performance, not just code.

                      3. Testing model performance is an art, not a science. There is a considerable amount of intuition that guides testing ML systems.

                      4. Thus, the fundamental goal of testing model performance is to build a granular understanding of how well your model performs and where you don\u2019t expect it to perform well. Using this intuition derived from testing, you can make better decisions about productionizing your model effectively.

                      5. Build up to this gradually! You don\u2019t need to do everything detailed in this lecture, and certainly not all at once. Start with:

                        1. Infrastructure tests

                        2. Evaluation tests

                        3. Expectation tests

                      "},{"location":"spring2021/lecture-10/#4-explainable-and-interpretable-ai","title":"4 - Explainable and Interpretable AI","text":""},{"location":"spring2021/lecture-10/#definitions","title":"Definitions","text":"

                      What do explainable and interpretable AI, buzzwords you\u2019ve undoubtedly heard before, actually mean? Let\u2019s start by outlining some more fundamental terms about the problem space:

                      • Domain predictability: the degree to which it is possible to detect data outside the model\u2019s domain of competence.

                      • Interpretability: the degree to which a human can consistently predict the model\u2019s result (Kim et al., 2016).

                      • Explainability: the degree to which a human can understand the cause of a decision (Miller, 2017).

                      We\u2019ll walk through four different methods of making models interpretable and explainable:

                      1. Use an interpretable family of models.

                      2. Distill the complex model to an interpretable one.

                      3. Understand the contribution of features to the prediction.

                      4. Understand the contribution of training data points to the prediction.

                      "},{"location":"spring2021/lecture-10/#use-an-interpretable-family-of-models","title":"Use An Interpretable Family of Models","text":"

                      Examples of interpretable families of models are simple, familiar models like linear regression, logistic regression, generalized linear models, and decision trees. If you understand the math of these models, it\u2019s pretty easy to understand why a model made the decision it did. Because of the reasonably elementary math, these models are interpretable and explainable. However, they are not very powerful.

                      Another class of models that are interpretable is attention models. Examining where a model is \u201clooking\u201d helps us anticipate a model\u2019s prediction, thus making them interpretable. However, attention maps are not particularly explainable. They do not produce complete explanations for a model\u2019s output, just a directional explanation. Furthermore, attention maps are not reliable explanations. Attention maps tell us only where a model is looking, not why it is looking there. Frequently, models focus exclusively on an image\u2019s salient region without an underlying reasoning that relates to the task at hand. In the sample below, the attention model is \u201clooking\u201d at the salient region for classification, which has a very different meaning in each context.

                      The conflation of attention with explanation is a critical pitfall to avoid.

                      "},{"location":"spring2021/lecture-10/#distill-a-complex-to-an-interpretable-one","title":"Distill A Complex To An Interpretable One","text":"

                      Instead of restricting models to only interpretable families, we can fit a more complex model and interpret its decision using another model from an interpretable family. The trick is to train this additional model, referred to as a surrogate model, on the raw data and the complex model\u2019s predictions. The surrogate model\u2019s corresponding interpretation can be used as a proxy for understanding the complex model.

                      This technique is quite simple and fairly general to apply. In practice, however, two concerns manifest.

                      1. If the surrogate itself performs well on the predictions, why not try to directly apply it rather than the more complex model?

                      2. If it doesn\u2019t perform well, how do we know that it genuinely represents the complex model\u2019s behavior?

                      Another category of surrogate models is local surrogate models (LIME). Rather than apply the surrogate model in a global context on all the data, LIME models focus on a single point to generate an explanation for. A perturbation is applied to the point, resulting in a local neighborhood of perturbed data points. On top of these perturbed data points, a surrogate model is trained to map the points to the original predictions from the complex model. If the surrogate model classifies similarly to the complex model, the surrogate can be considered a proxy for interpretation purposes. This method is used widely, as it works for all data types (including images and text). However, defining the right perturbations and ensuring the stability of the explanations is challenging.

                      "},{"location":"spring2021/lecture-10/#understand-the-contribution-of-features-to-the-prediction","title":"Understand The Contribution of Features To The Prediction","text":"

                      Better understanding each feature\u2019s role in making a prediction is another option for interpretable and explainable ML. Data visualization is one such option, with plots like partial dependence plots and individual conditional expectation plots.

                      A numerical method is permutation feature importance, which selects a feature, randomizes its order in the dataset, and sees how that affects performance. While this method is very easy and widely used, it doesn\u2019t work for high-dimensional data or cases where there is feature interdependence.

                      A more principled approach to explaining the contribution of individual features is SHAP (Shapley Additive Explanations). At a high level, SHAP scores test how much changes in a single feature impact the output of a classifier when controlling for the values of the other features. This is a reliable method to apply, as it works on a variety of data and is mathematically principled. However, it can be tricky to implement and doesn\u2019t provide explanations.

                      Gradient-based saliency maps are a popular method for explanations and interpretations. This intuitive method selects an input, performs a forward pass, computes the gradient with respect to the pixels, and visualizes the gradients. Essentially, how much does a unit change in the value of the input\u2019s pixels affect the prediction of the model? This is a straightforward and common method. Similar to the challenge with attention, the explanations may not be correct, and the overall method is fragile and sensitive to small changes.

                      "},{"location":"spring2021/lecture-10/#understand-the-contribution-of-training-data-points-to-the-prediction","title":"Understand The Contribution of Training Data Points To The Prediction","text":"

                      Instead of focusing on features and their explicit relevance to the prediction, we can also take a hard look at the training data points themselves.

                      • Prototypes and criticisms are one such approach, though it is less applicable to deep learning. In this method, prototypes are clusters of data that explain much of the variance in the model. Criticisms are data points not explained by the prototypes.

                      • Another approach is to look specifically at \u201cinfluential instances\u201d or data points that cause major changes in the model\u2019s predictions when removed from the data set.

                      "},{"location":"spring2021/lecture-10/#do-you-need-explainability","title":"Do You Need \"Explainability\"?","text":"

                      A good question to ask yourself whether or not \u201cexplainable AI\u201d is a real need for your applications. There are a couple of cases where this question can be useful:

                      1. Regulators demand it. In this case, there\u2019s not much you can do besides produce some kind of explainable model. However, it can be helpful to ask for clarification on what explainability is judged as.

                      2. Users demand it. In some cases, users themselves may want trust or explainability in the system. Investigate the necessity for the explainability and trust to come directly from the model itself. Can good product design inspire trust more effectively? For example, allowing doctors to simply override models can reduce the immediate need for explainability. A big associated concern is how often users interact with the model. Infrequent interactions likely require explainable AI, as humans do not get a chance to build their feel for the system. More frequent interactions allow for the simpler objective of interpretability.

                      3. Deployment demands it. Sometimes, ML stakeholders may demand explainability as a component of ensuring confidence in ML system deployment. In this context, explainability is the wrong objective; domain predictability is the real aim. Rather than full-on explainability, interpretability can be helpful for deployment, especially visualizations for debugging.

                      At present, true explainability for deep learning models is not possible.

                      • Current explanation methods are not faithful to the original model performance; it can be easy to cherry-pick specific examples that can overstate explainability.

                      • Furthermore, these methods tend to be unreliable and highly sensitive to the input.

                      • Finally, as described in the attention section, the full explanation is often not available to modern explainability methods.

                      Because of these reasons, explainability is not practically feasible for deep learning models (as of 2021). Read Cynthia Rudin\u2019s 2019 paper for more detail.

                      "},{"location":"spring2021/lecture-10/#caveats-for-explainable-and-interpretable-ai","title":"Caveats For Explainable and Interpretable AI","text":"
                      • If you genuinely need to explain your model\u2019s predictions, use an interpretable model family (read more here).

                      • Don\u2019t try to force-fit deep learning explainability methods; they produce cool results but are not reliable enough for production use cases.

                      • Specific interpretability methods like LIME and SHAP are instrumental in helping users reach interpretability thresholds faster.

                      • Finally, the visualization for interpretability can be pretty useful for debugging.

                      "},{"location":"spring2021/lecture-10/#5-resources","title":"5 - Resources","text":"
                      • ML Test Score Paper

                      • Behavioral testing paper

                      • Jeremy Jordan\u2019s effective testing

                      • Robustness Gym

                      • Made with ML\u2019s guide to testing

                      • Eugene Yan\u2019s practical guide to maintaining machine learning

                      • Chip Huyen\u2019s CS329 lecture on evaluating models

                      • Interpretable ML Book

                      [1] https://vwo.com/ab-testing-2/

                      "},{"location":"spring2021/lecture-11/","title":"Lecture 11: Deployment & Monitoring","text":""},{"location":"spring2021/lecture-11/#video","title":"Video","text":"

                      Deployment:

                      Monitoring:

                      "},{"location":"spring2021/lecture-11/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-11/#notes","title":"Notes","text":"

                      Download notes as PDF

                      Lecture by Josh Tobin. Notes transcribed by James Le and Vishnu Rachakonda.

                      ML in production scales to meet users\u2019 demands by delivering thousands to millions of predictions per second. On the other hand, models in notebooks only work if you run the cells in the right order. To be frank, most data scientists and ML engineers do not know how to build production ML systems. Therefore, the goal of this lecture is to give you different flavors of accomplishing that task.

                      "},{"location":"spring2021/lecture-11/#i-model-deployment","title":"I - Model Deployment","text":""},{"location":"spring2021/lecture-11/#1-types-of-deployment","title":"1 - Types of Deployment","text":"

                      One way to conceptualize different approaches to deploy ML models is to think about where to deploy them in your application\u2019s overall architecture.

                      • The client-side runs locally on the user machine (web browser, mobile devices, etc..)

                      • It connects to the server-side that runs your code remotely.

                      • The server connects with a database to pull data out, render the data, and show the data to the user.

                      "},{"location":"spring2021/lecture-11/#batch-prediction","title":"Batch Prediction","text":"

                      Batch prediction means that you train the models offline, dump the results into a database, then run the rest of the application normally. You periodically run your model on new data coming in and cache the results in a database. Batch prediction is commonly used in production when the universe of inputs is relatively small (e.g., one prediction per user per day).

                      The pros of batch prediction:

                      • It is simple to implement.

                      • It requires relatively low latency to the user.

                      The cons of batch prediction:

                      • It does not scale to complex input types.

                      • Users do not get the most up-to-date predictions.

                      • Models frequently become \u201cstale\u201d and hard to detect.

                      "},{"location":"spring2021/lecture-11/#model-in-service","title":"Model-In-Service","text":"

                      Model-in-service means that you package up your model and include it in the deployed web server. Then, the web server loads the model and calls it to make predictions.

                      The pros of model-in-service prediction:

                      • It reuses your existing infrastructure.

                      The cons of model-in-service prediction:

                      • The web server may be written in a different language.

                      • Models may change more frequently than the server code.

                      • Large models can eat into the resources for your webserver.

                      • Server hardware is not optimized for your model (e.g., no GPUs).

                      • Model and server may scale differently.

                      "},{"location":"spring2021/lecture-11/#model-as-service","title":"Model-As-Service","text":"

                      Model-as-service means that you deploy the model separately as its own service. The client and server can interact with the model by making requests to the model service and receiving responses.

                      The pros of model-as-service prediction:

                      • It is dependable, as model bugs are less likely to crash the web app.

                      • It is scalable, as you can choose the optimal hardware for the model and scale it appropriately.

                      • It is flexible, as you can easily reuse the model across multiple applications.

                      The cons of model-as-service prediction:

                      • It adds latency.

                      • It adds infrastructural complexity.

                      • Most importantly, you are now on the hook to run a model service...

                      "},{"location":"spring2021/lecture-11/#2-building-a-model-service","title":"2 - Building A Model Service","text":""},{"location":"spring2021/lecture-11/#rest-apis","title":"REST APIs","text":"

                      REST APIs represent a way of serving predictions in response to canonically formatted HTTP requests. There are alternatives such as gRPC and GraphQL. For instance, in your command line, you can use curl to post some data to an URL and get back JSON that contains the model predictions.

                      Sadly, there is no standard way of formatting the data that goes into an ML model.

                      "},{"location":"spring2021/lecture-11/#dependency-management","title":"Dependency Management","text":"

                      Model predictions depend on the code, the model weights, and the code dependencies. All three need to be present on your webserver. For code and model weights, you can simply copy them locally (or write a script to extract them if they are large). But dependencies are trickier because they cause troubles. As they are hard to make consistent and update, your model behavior might change accordingly.

                      There are two high-level strategies to manage code dependencies:

                      1. You constrain the dependencies of your model.

                      2. You use containers.

                      "},{"location":"spring2021/lecture-11/#onnx","title":"ONNX","text":"

                      If you go with the first strategy, you need a standard neural network format. The Open Neural Network Exchange (ONNX, for short) is designed to allow framework interoperability. The dream is to mix different frameworks, such that frameworks that are good for development (PyTorch) don\u2019t also have to be good at inference (Caffe2).

                      • The promise is that you can train a model with one tool stack and then deploy it using another for inference/prediction. ONNX is a robust and open standard for preventing framework lock-in and ensuring that your models will be usable in the long run.

                      • The reality is that since ML libraries change quickly, there are often bugs in the translation layer. Furthermore, how do you deal with non-library code (like feature transformations)?

                      "},{"location":"spring2021/lecture-11/#docker","title":"Docker","text":"

                      If you go with the second strategy, you want to learn Docker. Docker is a computer program that performs operating-system-level virtualization, also known as containerization. What is a container, you might ask? It is a standardized unit of fully packaged software used for local development, shipping code, and deploying system.

                      The best way to describe it intuitively is to think of a process surrounded by its filesystem. You run one or a few related processes, and they see a whole filesystem, not shared by anyone.

                      • This makes containers extremely portable, as they are detached from the underlying hardware and the platform that runs them.

                      • They are very lightweight, as a minimal amount of data needs to be included.

                      • They are secure, as the exposed attack surface of a container is extremely small.

                      Note here that containers are different from virtual machines.

                      • Virtual machines require the hypervisor to virtualize a full hardware stack. There are also multiple guest operating systems, making them larger and more extended to boot. This is what AWS / GCP / Azure cloud instances are.

                      • Containers, on the other hand, require no hypervisor/hardware virtualization. All containers share the same host kernel. There are dedicated isolated user-space environments, making them much smaller in size and faster to boot.

                      In brief, you should familiarize yourself with these basic concepts:

                      1. Dockerfile defines how to build an image.

                      2. Image is a built packaged environment.

                      3. Container is where images are run inside.

                      4. Repository hosts different versions of an image.

                      5. Registry is a set of repositories.

                      Furthermore, Docker has a robust ecosystem. It has the DockerHub for community-contributed images. It\u2019s incredibly easy to search for images that meet your needs, ready to pull down and use with little-to-no modification.

                      Though Docker presents how to deal with each of the individual microservices, we also need an orchestrator to handle the whole cluster of services. Such an orchestrator distributes containers onto the underlying virtual machines or bare metal so that these containers talk to each other and coordinate to solve the task at hand. The standard container orchestration tool is Kubernetes.

                      "},{"location":"spring2021/lecture-11/#performance-optimization","title":"Performance Optimization","text":"

                      We will talk mostly about how to run your model service faster on a single machine. Here are the key questions that you want to address:

                      • Do you want inference on a GPU or not?

                      • How can you run multiple copies of the model at the same time?

                      • How to make the model smaller?

                      • How to improve model performance via caching, batching, and GPU sharing?

                      "},{"location":"spring2021/lecture-11/#gpu-or-no-gpu","title":"GPU or no GPU?","text":"

                      Here are the pros of GPU inference:

                      • You use the same hardware that your model is trained on probably.

                      • If your model gets bigger and you want to limit model size or tune batch size, you will get high throughput.

                      Here are the cons of GPU inference:

                      • GPU is complex to set up.

                      • GPUs are expensive.

                      "},{"location":"spring2021/lecture-11/#concurrency","title":"Concurrency","text":"

                      Instead of running a single model copy on your machine, you run multiple model copies on different CPUs or cores. In practice, you need to be careful about thread tuning - making sure that each model copy only uses the minimum number of threads required. Read this blog post from Roblox for the details.

                      "},{"location":"spring2021/lecture-11/#model-distillation","title":"Model distillation","text":"

                      Model distillation is a compression technique in which a small \u201cstudent\u201d model is trained to reproduce the behavior of a large \u201cteacher\u201d model. The method was first proposed by Bucila et al., 2006 and generalized by Hinton et al., 2015. In distillation, knowledge is transferred from the teacher model to the student by minimizing a loss function. The target is the distribution of class probabilities predicted by the teacher model. That is\u200a\u2014\u200athe output of a softmax function on the teacher model\u2019s logits.

                      Distillation can be finicky to do yourself, so it is infrequently used in practice. Read this blog post from Derrick Mwiti for several model distillation techniques for deep learning.

                      "},{"location":"spring2021/lecture-11/#model-quantization","title":"Model quantization","text":"

                      Model quantization is a model compression technique that makes the model physically smaller to save disk space and require less memory during computation to run faster. It decreases the numerical precision of a model\u2019s weights. In other words, each weight is permanently encoded using fewer bits. Note here that there are tradeoffs with accuracy.

                      • A straightforward method is implemented in the TensorFlow Lite toolkit. It turns a matrix of 32-bit floats into 8-bit integers by applying a simple \u201ccenter-and-scale\u201d transform to it: W_8 = W_32 / scale + shift (scale and shift are determined individually for each weight matrix). This way, the 8-bit W is used in matrix multiplication, and only the result is then corrected by applying the \u201ccenter-and-scale\u201d operation in reverse.

                      • PyTorch also has quantization built-in that includes three techniques: dynamic quantization, post-training static quantization, and quantization-aware training.

                      "},{"location":"spring2021/lecture-11/#caching","title":"Caching","text":"

                      For many ML models, the input distribution is non-uniform (some are more common than others). Caching takes advantage of that. Instead of constantly calling the model on every input no matter what, we first cache the model\u2019s frequently-used inputs. Before calling the model, we check the cache and only call it on the frequently-used inputs.

                      Caching techniques can get very fancy, but the most basic way to get started is using Python\u2019s functools.

                      "},{"location":"spring2021/lecture-11/#batching","title":"Batching","text":"

                      Typically, ML models achieve higher throughput when making predictions in parallel (especially true for GPU inference). At a high level, here\u2019s how batching works:

                      • You gather predictions that are coming in until you have a batch for your system. Then, you run the model on that batch and return predictions to those users who request them.

                      • You need to tune the batch size and address the tradeoff between throughput and latency.

                      • You need to have a way to shortcut the process if latency becomes too long.

                      • The last caveat is that you probably do not want to implement batching yourself.

                      "},{"location":"spring2021/lecture-11/#sharing-the-gpu","title":"Sharing The GPU","text":"

                      Your model may not take up all of the GPU memory with your inference batch size. Why not run multiple models on the same GPU? You probably want to use a model serving solution that supports this out of the box.

                      "},{"location":"spring2021/lecture-11/#model-serving-libraries","title":"Model Serving Libraries","text":"

                      There are canonical open-source model serving libraries for both PyTorch (TorchServe) and TensorFlow (TensorFlow Serving). Ray Serve is another promising choice. Even NVIDIA has joined the game with Triton Inference Server.

                      "},{"location":"spring2021/lecture-11/#horizontal-scaling","title":"Horizontal Scaling","text":"

                      If you have too much traffic for a single machine, let\u2019s split traffic among multiple machines. At a high level, you duplicate your prediction service, use a load balancer to split traffic, and send the traffic to the appropriate copy of your service. In practice, there are two common methods:

                      1. Use a container orchestration toolkit like Kubernetes.

                      2. Use a serverless option like AWS Lambda.

                      "},{"location":"spring2021/lecture-11/#container-orchestration","title":"Container Orchestration","text":"

                      In this paradigm, your Docker containers are coordinated by Kubernetes. K8s provides a single service for you to send requests to. Then it divides up traffic that gets sent to that service to virtual copies of your containers (that are running on your infrastructure).

                      You can build a system like this yourself on top of K8s if you want to. But there are emerging frameworks that can handle all such infrastructure out of the box if you have a K8s cluster running. KFServing is a part of the Kubeflow package, a popular K8s-native ML infrastructure solution. Seldon provides a model serving stack on top of K8s.

                      "},{"location":"spring2021/lecture-11/#deploying-code-as-serverless-functions","title":"Deploying Code As Serverless Functions","text":"

                      The idea here is that the app code and dependencies are packaged into .zip files (or Docker containers) with a single entry point function. All the major cloud providers such as AWS Lambda, Google Cloud Functions, or Azure Functions will manage everything else: instant scaling to 10,000+ requests per second, load balancing, etc.

                      The good thing is that you only pay for compute-time. Furthermore, this approach lowers your DevOps load, as you do not own any servers.

                      The tradeoff is that you have to work with severe constraints:

                      1. Your entire deployment package is quite limited.

                      2. You can only do CPU execution.

                      3. It can be challenging to build model pipelines.

                      4. There are limited state management and deployment tooling.

                      "},{"location":"spring2021/lecture-11/#model-deployment","title":"Model Deployment","text":"

                      If serving is how you turn a model into something that can respond to requests, deployment is how you roll out, manage, and update these services. You probably want to be able to roll out gradually, roll back instantly, and deploy pipelines of models. Many challenging infrastructure considerations go into this, but hopefully, your deployment library will take care of this for you.

                      "},{"location":"spring2021/lecture-11/#managed-options","title":"Managed Options","text":"

                      If you do not want to deal with any of the things mentioned thus far, there are managed options in the market. All major cloud providers have ones that enable you to package your model in a predefined way and turn it into an API. Startups like Algorithmia and Cortex are some alternatives. The big drawback is that pricing tends to be high, so you pay a premium fee in exchange for convenience.

                      "},{"location":"spring2021/lecture-11/#takeaways","title":"Takeaways","text":"
                      • If you are making CPU inference, you can get away with scaling by launching more servers or going serverless.

                      • Serverless makes sense if you can get away with CPUs, and traffic is spiky or low-volume.

                      • If you are using GPU inference, serving tools will save you time.

                      • It\u2019s worth keeping an eye on startups in this space for GPU inference.

                      "},{"location":"spring2021/lecture-11/#3-edge-deployment","title":"3 - Edge Deployment","text":"

                      Edge prediction means that you first send the model weights to the client edge device. Then, the client loads the model and interacts with it directly.

                      The pros of edge prediction:

                      • It has low latency.

                      • It does not require an Internet connection.

                      • It satisfies data security requirements, as data does not need to leave the user\u2019s device.

                      The cons of edge prediction:

                      • The client often has limited hardware resources available.

                      • Embedded and mobile frameworks are less full-featured than TensorFlow and PyTorch.

                      • It is challenging to update models.

                      • It is difficult to monitor and debug when things go wrong.

                      "},{"location":"spring2021/lecture-11/#tools-for-edge-deployment","title":"Tools For Edge Deployment","text":"

                      TensorRT is NVIDIA\u2019s framework meant to help you optimize models for inference on NVIDIA devices in data centers and embedded/automotive environments. TensorRT is also integrated with application-specific SDKs to provide developers a unified path to deploy conversational AI, recommender, video conference, and streaming apps in production.

                      ApacheTVM is an open-source machine learning compiler framework for CPUs, GPUs, and ML accelerators. It aims to enable ML engineers to optimize and run computations efficiently on any hardware backend. In particular, it compiles ML models into minimum deployable modules and provides the infrastructure to automatically optimize models on more backends with better performance.

                      Tensorflow Lite provides a trained TensorFlow model framework to be compressed and deployed to a mobile or embedded application. TensorFlow\u2019s computationally expensive training process can still be performed in the environment that best suits it (personal server, cloud, overclocked computer). TensorFlow Lite then takes the resulting model (frozen graph, SavedModel, or HDF5 model) as input, packages, deploys, and then interprets it in the client application, handling the resource-conserving optimizations along the way.

                      PyTorch Mobile is a framework for helping mobile developers and machine learning engineers embed PyTorch models on-device. Currently, it allows any TorchScript model to run directly inside iOS and Android applications. PyTorch Mobile\u2019s initial release supports many different quantization techniques, which shrink model sizes without significantly affecting performance. PyTorch Mobile also allows developers to directly convert a PyTorch model to a mobile-ready format without needing to work through other tools/frameworks.

                      JavaScript is a portable way of running code on different devices. Tensorflow.js enables you to run TensorFlow code in JavaScript. You can use off-the-shelf JavaScript models or convert Python TensorFlow models to run in the browser or under Node.js, retrain pre-existing ML models using your data, and build/train models directly in JavaScript using flexible and intuitive APIs.

                      Core ML was released by Apple back in 2017. It is optimized for on-device performance, which minimizes a model\u2019s memory footprint and power consumption. Running strictly on the device also ensures that user data is kept secure. The app runs even in the absence of a network connection. Generally speaking, it is straightforward to use with just a few lines of code needed to integrate a complete ML model into your device. The downside is that you can only make the model inference, as no model training is possible.

                      ML Kit was announced by Google Firebase in 2018. It enables developers to utilize ML in mobile apps either with (1) inference in the cloud via API or (2) inference on-device (like Core ML). For the former option, ML Kit offers six base APIs with pertained models such as Image Labeling, Text Recognition, and Barcode Scanning. For the latter option, ML Kit offers lower accuracy but more security to user data, compared to the cloud version.

                      If you are interested in either of the above options, check out this comparison by the FritzAI team. Additionally, FritzAI is an ML platform for mobile developers that provide pre-trained models, developer tools, and SDKs for iOS, Android, and Unity.

                      "},{"location":"spring2021/lecture-11/#more-efficient-models","title":"More Efficient Models","text":"

                      Another thing to consider for edge deployment is to make the models more efficient. One way to do this is to use the same quantization and distillation techniques discussed above. Another way is to pick mobile-friendly model architectures. The first successful example is MobileNet, which performs various downsampling techniques to a traditional ConvNet architecture to maximize accuracy while being mindful of the restricted resources for a mobile or an embedded device. This analysis by Yusuke Uchida explains why MobileNet and its variants are fast.

                      A well-known case study of applying knowledge distillation in practice is Hugging Face\u2019s DistilBERT, a smaller language model derived from the supervision of the popular BERT language model. DistilBERT removes the toke-type embeddings and the pooler (used for the next sentence classification task) from BERT while keeping the rest of the architecture identical and reducing the number of layers by a factor of two. Overall, DistilBERT has about half the total number of parameters of the BERT base and retains 95% of BERT\u2019s performances on the language understanding benchmark GLUE.

                      "},{"location":"spring2021/lecture-11/#mindset-for-edge-deployment","title":"Mindset For Edge Deployment","text":"
                      • It is crucial to choose your architecture with your target hardware in mind. Specifically, you can make up a factor of 2-10 through distillation, quantization, and other tricks (but not more than that).

                      • Once you have a model that works on your edge device, you can iterate locally as long as you add model size and latency to your metrics and avoid regression.

                      • You should treat tuning the model for your device as an additional risk in the deployment cycle and test it accordingly. In other words, you should always test your models on production hardware before deploying them for real.

                      • Since models can be finicky, it\u2019s a good idea to build fallback mechanisms into the application if the model fails or is too slow.

                      "},{"location":"spring2021/lecture-11/#takeaways_1","title":"Takeaways","text":"
                      • Web deployment is easier, so only perform edge deployment if you need to.

                      • You should choose your framework to match the available hardware and corresponding mobile frameworks. Else, you can try Apache TVM to be more flexible.

                      • You should start considering hardware constraints at the beginning of the project and choose the architectures accordingly.

                      "},{"location":"spring2021/lecture-11/#ii-model-monitoring","title":"II - Model Monitoring","text":"

                      Once you deploy models, how do you make sure they are staying healthy and working well? Enter model monitoring.

                      Many things can go wrong with a model once it\u2019s been trained. This can happen even if your model has been trained properly, with a reasonable validation and test loss, as well as robust performance across various slices and quality predictions. Even after you\u2019ve troubleshot and tested a model, things can still go wrong!

                      "},{"location":"spring2021/lecture-11/#1-why-model-degrades-post-deployment","title":"1 - Why Model Degrades Post-Deployment?","text":"

                      Model performance tends to degrade after you\u2019ve deployed a model. Why does this occur? In supervised learning, we seek to fit a function f to approximate a posterior using the data available to us. If any component of this process changes (i.e., the data x), the deployed model can see an unexpectedly degraded performance. See the below chart for examples of how such post-deployment degradations can occur theoretically and in practice:

                      In summary, there are three core ways that the model\u2019s performance can degrade: data drift, concept drift, and domain shift.

                      1. In data drift, the underlying data expectation that your model is built can unexpectedly change, perhaps through a bug in the upstream data pipeline or even due to malicious users feeding the model bad data.

                      2. In concept drift, the actual outcome you seek to model, or the relationship between the data and the outcome, may fray. For example, users may start to pick movies in a different manner based on the output of your model, thereby changing the fundamental \u201cconcept\u201d the model needs to approximate.

                      3. Finally, in domain shift, if your dataset does not appropriately sample the production, post-deployment setting, the model\u2019s performance may suffer; this could be considered a \u201clong tail\u201d scenario, where many rare examples that are not present in the development data occur.

                      "},{"location":"spring2021/lecture-11/#2-data-drift","title":"2 - Data Drift","text":"

                      There are a few different types of data drift:

                      • Instantaneous drift: In this situation, the paradigm of the draft dramatically shifts. Examples are deploying the model in a new domain (e.g., self-driving car model in a new city), a bug in the preprocessing pipeline, or even major external shifts like COVID.

                      • Gradual drift: In this situation, the value of data gradually changes with time. For example, users\u2019 preferences may change over time, or new concepts can get introduced to the domain.

                      • Periodic drift: Data can have fluctuating value due to underlying patterns like seasonality or time zones.

                      • Temporary drift: The most difficult to detect, drift can occur through a short-term change in the data that shifts back to normal. This could be via a short-lived malicious attack, or even simply because a user with different demographics or behaviors uses your product in a way that it\u2019s not designed to be used.

                      While these categories may seem like purely academic categories, the consequences of data shift are very real. This is a real problem that affects many companies and is only now starting to get the attention it merits.

                      "},{"location":"spring2021/lecture-11/#3-what-should-you-monitor","title":"3 - What Should You Monitor?","text":"

                      There are four core types of signals to monitor for machine learning models.

                      These metrics trade off with another in terms of how informative they are and how easy they are to access. Put simply, the harder a metric may be to monitor, the more useful it likely is.

                      • The hardest and best metrics to monitor are model performance metrics, though these can be difficult to acquire in real-time (labels are hard to come by).

                      • Business metrics can be helpful signals of model degradation in monitoring but can easily be confounded by other impactful considerations.

                      • Model inputs and predictions are a simple way to identify high-level drift and are very easy to gather. Still, they can be difficult to assess in terms of actual performance impact, leaving it more of an art than science.

                      • Finally, system performance (e.g., GPU usage) can be a coarse method of catching serious bugs.

                      In considering which metrics to focus on, prioritize ground-truth metrics (model and business metrics), then approximate performance metrics (business and input/outputs), and finally, system health metrics.

                      "},{"location":"spring2021/lecture-11/#4-how-do-you-measure-distribution-changes","title":"4 - How Do You Measure Distribution Changes?","text":""},{"location":"spring2021/lecture-11/#select-a-reference-window","title":"Select A Reference Window","text":"

                      To measure distribution changes in metrics you\u2019re monitoring, start by picking a reference set of production data to compare new data to. There are a few different ways of picking this reference data (e.g., sliding window or fixed window of production data), but the most practical thing to do is to use your training or evaluation data as the reference. Data coming in looking different from what you developed your model using is an important signal to act on.

                      "},{"location":"spring2021/lecture-11/#select-a-measurement-window","title":"Select A Measurement Window","text":"

                      After picking a reference window, the next step is to choose a measurement window to compare, measure distance, and evaluate for drift. The challenge is that selecting a measurement window is highly problem-dependent. One solution is to pick one or several window sizes and slide them over the data. To avoid recomputing metrics over and over again, when you slide the window, it\u2019s worth looking into the literature on mergeable (quantile) sketching algorithms.

                      "},{"location":"spring2021/lecture-11/#compare-windows-using-a-distance-metric","title":"Compare Windows Using A Distance Metric","text":"

                      What distance metrics should we use to compare the reference window to the measurement window? Some 1-D metric categories are:

                      1. Rule-based distance metrics (e.g., data quality): Summary statistics, the volume of data points, number of missing values, or more complex tests like overall comparisons are common data quality checks that can be applied. Great Expectations is a valuable library for this. Definitely invest in simple rule-based metrics. They catch a large number of bugs, as publications from Amazon and Google detail.

                      2. Statistical distance metrics (e.g., KS statistics, KL divergence, D_1 distance, etc.)

                        1. KL Divergence: Defined as the expectation of a ratio of logs of two different distributions, this commonly known metric is very sensitive to what happens in the tails of the distribution. It\u2019s not well-suited to data shift testing since it\u2019s easily disturbed, is not interpretable, and struggles with data in different ranges.

                        2. KS Statistic: This metric is defined as the max distance between CDFs, which is easy to interpret and is thus used widely in practice. Say yes to the KS statistic!

                        3. D1 Distance: Defined as the sum of distances between PDFs, this is a metric used at Google. Despite seeming less principled, it\u2019s easily interpretable and has the added benefit of knowing Google uses it (so why not you?).

                      An open area of research is understanding the impact of differing drift patterns on distance metrics and model performance. Another open area of research is high-dimensional distance metrics. Some options here are:

                      1. Maximum mean discrepancy

                      2. Performing multiple 1D comparisons across the data: While suffering from the multiple hypothesis testing problem, this is a practical approach.

                      3. Prioritize some features for 1D comparisons: Another option is to avoid testing all the features and only focus on those that merit comparison; for example, those features you know may have shifted in the data.

                      4. Projections: In this approach, large data points are put through a dimensionality reduction process and then subject to a two-sample statistical test. Reducing the dimensionality with a domain-specific approach (e.g., mean pixel value for images, length of sentence) is recommended.

                      At a high level, this entire distance metric work aims to identify not just a score for any data shift but also understand its impact on the model. While choosing a metric can be complicated with all the possible options, you should focus on understanding your model\u2019s robustness in a post-deployment scenario.

                      "},{"location":"spring2021/lecture-11/#5-how-do-you-tell-if-a-change-is-bad","title":"5 - How Do You Tell If A Change Is Bad?","text":"

                      There\u2019s no hard and fast rule for finding if a change in the data is bad. An easy option is to set thresholds on the test values. Don\u2019t use a statistical test like the KS test, as they are too sensitive to small shifts. Other options include setting manual ranges, comparing values over time, or even applying an unsupervised model to detect outliers. In practice, fixed rules and specified ranges of test values are used most in practice.

                      "},{"location":"spring2021/lecture-11/#6-tools-for-monitoring","title":"6 - Tools For Monitoring","text":"

                      There are three categories of tools useful for monitoring:

                      1. System monitoring tools like AWS CloudWatch, Datadog, New Relic, and honeycomb test traditional performance metrics

                      2. Data quality tools like Great Expectations, Anomalo, and Monte Carlo test if specific windows of data violate rules or assumptions.

                      3. ML monitoring tools like Arize, Fiddler, and Arthur can also be useful, as they specifically test models.

                      "},{"location":"spring2021/lecture-11/#7-evaluation-store","title":"7 - Evaluation Store","text":"

                      Monitoring is more central to ML than for traditional software.

                      • In traditional SWE, most bugs cause loud failures, and the data that is monitored is most valuable to detect and diagnose problems. If the system is working well, the data from these metrics and monitoring systems may not be useful.

                      • In machine learning, however, monitoring plays a different role. First off, bugs in ML systems often lead to silent degradations in performance. Furthermore, the data that is monitored in ML is literally the code used to train the next iteration of models.

                      Because monitoring is so essential to ML systems, tightly integrating it into the ML system architecture brings major benefits. In particular, better integrating and monitoring practices, or creating an evaluation store, can close the data flywheel loop, a concept we talked about earlier in the class.

                      As we build models, we create a mapping between data and model. As the data changes and we retrain models, monitoring these changes doesn\u2019t become an endpoint--it becomes a part of the entire model development process. Monitoring, via an evaluation store, should touch all parts of your stack. One challenge that this process helps solve is effectively choosing which data points to collect, store, and label. Evaluation stores can help identify which data to collect more points for based on uncertain performance. As more data is collected and labeled, efficient retraining can be performed using evaluation store guidance.

                      "},{"location":"spring2021/lecture-11/#conclusion","title":"Conclusion","text":"

                      In summary, make sure to monitor your models!

                      • Something will always go wrong, and you should have a system to catch errors.

                      • Start by looking at data quality metrics and system metrics, as they are easiest.

                      • In a perfect world, the testing and monitoring should be linked, and they should help you close the data flywheel.

                      • There will be a lot of tooling and research that will hopefully come soon!

                      "},{"location":"spring2021/lecture-12/","title":"Lecture 12: Research Directions","text":""},{"location":"spring2021/lecture-12/#video","title":"Video","text":""},{"location":"spring2021/lecture-12/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-12/#notes","title":"Notes","text":"

                      Download notes as PDF

                      Lecture by Pieter Abbeel. Notes transcribed by James Le and Vishnu Rachakonda.

                      Of all disciplines, deep learning is probably the one where research and practice are closest together. Often, something gets invented in research and is put into production in less than a year. Therefore, it\u2019s good to be aware of research trends that you might want to incorporate in projects you are working on.

                      Because the number of ML and AI papers increases exponentially, there\u2019s no way that you can read every paper. Thus, you need other methods to keep up with research. This lecture provides a sampling of research directions, the overall research theme running across these samples, and advice on keeping up with the relentless flood of new research.

                      "},{"location":"spring2021/lecture-12/#1-unsupervised-learning","title":"1 - Unsupervised Learning","text":"

                      Deep supervised learning, the default way of doing ML, works! But it requires so much annotated data. Can we get around it by learning with fewer labels? The answer is yes! And there are two major approaches: deep semi-supervised learning and deep unsupervised learning.

                      "},{"location":"spring2021/lecture-12/#deep-semi-supervised-learning","title":"Deep Semi-Supervised Learning","text":"

                      Semi-supervised means half supervised, half unsupervised. Assuming a classification problem where each data point belongs to one of the classes, we attempt to come up with an intuition to complete the labeling for the unlabeled data points. One way to formalize this is: If anything is close to a labeled example, then it will assume that label. Thus, we can propagate the labels out from where they are given to the neighboring data points.

                      How can we generalize the approach above to image classification?

                      Xie et al. (2020) proposes Noisy Student Training:

                      • First, they train a teacher model with labeled data.

                      • Then, they infer pseudo-labels on the unlabeled data. These are not real labels, but those that they get from using the trained teacher model.

                      • Even though these labels are not perfect (because they train on a small amount of labeled data), they can still see where they are more confident about those pseudo labels and inject those into their training set as additional labeled data.

                      • When they retrain, they use dropout, data augmentation, and stochastic depth to inject noise into the training process. This enables the student model to be more robust and generalizable.

                      "},{"location":"spring2021/lecture-12/#deep-unsupervised-learning","title":"Deep Unsupervised Learning","text":"

                      Deep semi-supervised learning assumes that the labels in the supervised dataset are still valid for the unsupervised dataset. There\u2019s a limit to the applicability because we assume that the unlabeled data is roughly from the same distribution as the labeled data.

                      With deep unsupervised learning, we can transfer the learning with multi-headed networks.

                      • First, we train a neural network. Then, we have two tasks and give the network two heads - one for task 1 and another for task 2.

                      • Most parameters live in the shared trunk of the network\u2019s body. Thus, when you train for task 1 and task 2, most of the learnings are shared. Only a little bit gets specialized to task 1 versus task 2.

                      The key hypothesis here is that: For task 1 (which is unsupervised), if the neural network is smart enough to do things like predicting the next word in a sentence, generating realistic images, or translating images from one scale to another; then that same neural network is ready to do deep supervised learning from a very small dataset for task 2 (what we care about).

                      "},{"location":"spring2021/lecture-12/#gpt-2","title":"GPT-2","text":"

                      For instance, task 1 could be predicting the next word in a sentence, while task 2 could be predicting the sentiment in a corpus. OpenAI\u2019s GPT-2 is the landmark result for next-word prediction where deep unsupervised learning could work. The results were so realistic, and there was a lot of press coverage. OpenAI deemed it to be too dangerous to be released at the time.

                      Furthermore, GPT-2 can tackle complex common sense reasoning and question answering tasks for various benchmarks. The table below displays those benchmarks where GPT-2 was evaluated on. The details of the tasks do not really matter. What\u2019s more interesting is that: This is the first time a model, trained unsupervised on a lot of text to predict the next token and fine-tuned to specific supervised tasks, beats prior methods that might have been more specialized to each of these supervised tasks.

                      Another fascinating insight is that as we grow the number of model parameters, the performance goes up consistently. This means with unsupervised learning, we can incorporate much more data for larger models. This research funding inspired OpenAI to fundraise $1B for future projects to essentially have more compute available to train larger models because it seems like doing that will lead to better results. So far, that has been true (GPT-3 performs better than GPT-2).

                      "},{"location":"spring2021/lecture-12/#bert","title":"BERT","text":"

                      BERT is Google\u2019s approach that came out around the same time as GPT-2. While GPT-2 predicts the next word or token, BERT predicts a word or token that was removed. In this task, the neural network looks at the entire corpus as it fills things back in, which often helps in later tasks (as the neural network has already been unsupervised-train on the entire text).

                      The table below displays BERT\u2019s performance on the GLUE benchmark. The takeaway message is not so much in the details of these supervised tasks; but the fact that these tasks have a relatively small amount of labeled data compared to the unsupervised training that happens ahead of time. As BERT outperformed all SOTA methods, it revolutionized how natural language processing should be done.

                      BERT is one of the biggest updates that Google has made since RankBrain in 2015 and has proven successful in comprehending the intent of the searcher behind a search query.

                      "},{"location":"spring2021/lecture-12/#unsupervised-learning-in-vision","title":"Unsupervised Learning In Vision","text":"

                      Can we do the same thing for vision tasks? Let\u2019s explore a few of them.

                      • Predict A Missing Patch: A patch is high-dimensional, so the number of possibilities in that patch is very high (much larger than the number of words in English, for instance). Therefore, it\u2019s challenging to predict precisely and make that work as well as in languages.

                      • Solve Jigsaw Puzzles: If the network can do this, it understands something about images of the world. The trunk of the network should hopefully be reusable.

                      • Predict Rotation: Here, you collect random images and predict what degree has been rotated. Existing methods work immensely well for such a task.

                      A technique that stood out in recent times is contrastive learning, which includes two variants - SimCLR (Chen et al., 2020) and MoCo (He et al., 2019). Here\u2019s how you train your model with contrastive learning:

                      • Imagine that you download two images of a dog and a cat from the Internet, and you don\u2019t have labels yet.

                      • You duplicate the dog image and make two versions of it (a greyscale version and a cropped version).

                      • For these two dog versions, the neural network should bring them together while pushing the cat image far away.

                      You then fine-tune with a simple linear classifier on top of training completely unsupervised. This means that you must get the right features extracted from the images during training. The results of contrastive learning methods confirm that the higher the number of model parameters, the better the accuracy.

                      "},{"location":"spring2021/lecture-12/#2-reinforcement-learning","title":"2 - Reinforcement Learning","text":"

                      Reinforcement learning (RL) has not been practical yet but nevertheless has shown promising results. In RL, the AI is an agent, more so than just a pattern recognizer. The agent acts in an environment where it is goal-oriented. It wants to achieve something during the process, which is represented by a reward function.

                      "},{"location":"spring2021/lecture-12/#challenges","title":"Challenges","text":"

                      Compared to unsupervised learning, RL brings about a host of additional challenges:

                      • Credit assignment: When the RL agent sees something, it has to take action. But it is not told whether the action was good or bad right away.

                      • Stability: Because the RL agent learns by trial and error, it can destabilize and make big mistakes. Thus, it needs to be clever in updating itself not to destroy things along the way.

                      • Exploration: The RL agent has to try things that have not been done before.

                      Despite these challenges, some great RL successes have happened.

                      "},{"location":"spring2021/lecture-12/#successes","title":"Successes","text":"

                      DeepMind has shown that neural networks can learn to play the Atari game back in 2013. Under the hood is the Deep Q-Network architecture, which was trained from its own trial-and-error, looking at the score in the game to internalize what actions might be good or bad.

                      The game of Go was cracked by DeepMind - showing that the computer can play better than the best human player (AlphaGo, AlphaGoZero, and AlphaZero).

                      RL also works for the robot locomotion task. You don\u2019t have to design the controller yourself. You just implement the RL algorithm (TRPO, GAE, DDPG, PPO, and more) and let the agent train itself, which is a general approach to have AI systems acquire new skills. In fact, the robot can acquire such a variety of skills, as demonstrated in this DeepMimic work.

                      You can also accomplish the above for non-human-like characters in dynamic animation tasks. This is going to change how you can design video games or animated movies. Instead of designing the keyframes for every step along the way in your video or your game, you can train an agent to go from point A to point B directly.

                      RL has been shown to work on real robots.

                      • BRETT (Berkeley Robot for the Elimination of Tedious Tasks) could learn to put blocks into matching openings in under an hour using a neural network trained from scratch. This technique has been used for NASA SuperBall robots for space exploration ideas.

                      • A similar idea was applied to robotic manipulation solving Rubik\u2019s cube, done at OpenAI in 2019. The in-hand manipulation is a very difficult robotic control problem that was mastered with RL.

                      "},{"location":"spring2021/lecture-12/#covariantai","title":"CovariantAI","text":"

                      The fact that RL worked so well actually inspired Pieter and his former students (Tianhao Zhang, Rocky Duan, and Peter Chen) to start a company called Covariant in 2017. Their goal is to bring these advances from the lab into the real world. An example is autonomous order picking.

                      "},{"location":"spring2021/lecture-12/#3-unsupervised-reinforcement-learning","title":"3 - Unsupervised Reinforcement Learning","text":"

                      RL achieved mastery on many simulated domains. But we must ask the question: How fast is the learning itself? Tsividis et al., 2017 shows that a human can learn in about 15 minutes to perform better than Double DQN (a SOTA approach at the time of the study) learned after 115 hours.

                      How can we bridge this learning gap?

                      Based on the 2018 DeepMind Control Suite, pixel-based learning needs 50M more training steps than state-based learning to solve the same tasks. Maybe we can develop an unsupervised learning approach to turn pixel-level representations (which are not that informative) into a new representation that is much more similar to the underlying state.

                      CURL brings together contrastive learning and RL.

                      • In RL, there\u2019s typically a replay buffer where we store the past experiences. We load observations from there and feed them into an encoder neural network. The network has two heads: an actor to estimate the best action to take next and a critic to estimate how good that action would be.

                      • CURL adds an extra head at the bottom, which includes augmented observations, and does contrastive learning on that. Similar configurations of the robot are brought closer together, while different ones are separated.

                      The results confirm that CURL can match existing SOTA approaches that learn from states and from pixels. However, it struggles in hard environments, with insufficient labeled images being the root cause.

                      "},{"location":"spring2021/lecture-12/#4-meta-reinforcement-learning","title":"4 - Meta Reinforcement Learning","text":"

                      The majority of fully general RL algorithms work well for any environments that can be mathematically defined. However, environments encountered in the real world are a tiny subset of all environments that could be defined. Maybe the learning takes such a long time because the algorithms are too general. If they are a bit more specialized in things they will encounter, perhaps the learning is faster.

                      Can we develop a fast RL algorithm to take advantage of this?

                      In traditional RL research, human experts develop the RL algorithm. However, there are still no RL algorithms nearly as good as humans after many years. Can we learn a better RL algorithm? Or even learn a better entire agent?

                      "},{"location":"spring2021/lecture-12/#rl2","title":"RL^2","text":"

                      RL^2 (Duan et al., 2016) is a meta-RL framework proposed to tackle this issue:

                      • Imagine that we have multiple meta-training environments (A, B, and so on).

                      • We also have a meta-RL algorithm that learns the RL algorithm and outputs a \u201cfast\u201d RL agent (from having interacted with these environments).

                      • In the future, our agent will be in an environment F that is related to A, B, and so on.

                      Formally speaking, RL^2 maximizes the expected reward on the training Markov Decision Process (MDP) but can generalize to testing MDP. The RL agent is represented as a Recurrent Neural Network (RNN), a generic computation architecture where:

                      • Different weights in the RNN mean different RL algorithms and priors.

                      • Different activations in the RNN mean different current policies.

                      • The meta-trained objective can be optimized with an existing \u201cslow\u201d RL algorithm.

                      • The resulting RNN is ready to be dropped in a new environment.

                      RL^2 was evaluated on a classic Multi-Armed Bandit setting and performed better than provably (asymptotically) optimal RL algorithms invented by humans like Gittings Index, UCB1, and Thompson Sampling. Another task that RL^2 was evaluated on is visual navigation, where the agent explores a maze and finds a specified target as quickly as possible. Although this setting is maze-specific, we can scale up RL^2 to other large-scale games and robotic environments and use it to learn in a new environment quickly.

                      "},{"location":"spring2021/lecture-12/#learn-more","title":"Learn More","text":"
                      • Schmidhuber. Evolutionary principles in self-referential learning. (1987)

                      • Wiering, Schmidhuber. Solving POMDPs with Levin search and EIRA. (1996)

                      • Schmidhuber, Zhao, Wiering. Shifting inductive bias with success-story algorithm, adaptive Levin search, and incremental self-improvement. (MLJ 1997)

                      • Schmidhuber, Zhao, Schraudolph. Reinforcement learning with self-modifying policies (1998)

                      • Zhao, Schmidhuber. Solving a complex prisoner\u2019s dilemma with self-modifying policies. (1998)

                      • Schmidhuber. A general method for incremental self-improvement and multiagent learning. (1999)

                      • Singh, Lewis, Barto. Where do rewards come from? (2009)

                      • Singh, Lewis, Barto. Intrinsically Motivated Reinforcement Learning: An Evolutionary Perspective (2010)

                      • Niekum, Spector, Barto. Evolution of reward functions for reinforcement learning (2011)

                      • Wang et al., (2016). Learning to Reinforcement Learn

                      • Finn et al., (2017). Model-Agnostic Meta-Learning (MAML)

                      • Mishra, Rohinenjad et al., (2017). Simple Neural AttentIve Meta-Learner

                      • Frans et al., (2017). Meta-Learning Shared Hierarchies

                      "},{"location":"spring2021/lecture-12/#5-few-shot-imitation-learning","title":"5 - Few-Shot Imitation Learning","text":"

                      People often complement RL with imitation learning, which is basically supervised learning where the output is an action for an agent. This gives you more signal than traditional RL since for every input, you consistently have a corresponding output. As the diagram below shows, the imitation learning algorithm learns a policy in a supervised manner from many demonstrations and outputs the correct action based on the environment.

                      The challenge for imitation learning is to collect enough demonstrations to train an algorithm, which is time-consuming. To make the collection of demonstrations more efficient, we can apply multi-task meta-learning. Many demonstrations for different tasks can be learned by an algorithm, whose output is fed to a one-shot imitator that picks the correct action based on a single demonstration. This process is referred to as one-shot imitation learning (Duan et al., 2017), as displayed below.

                      Conveniently, one-shot imitators are trained using traditional network architectures. A combination of CNNs, RNNs, and MLPs perform the heavy visual processing to understand the relevant actions in training demos and recommend the right action for the current frame of an inference demo. One example of this in action is block stacking.

                      "},{"location":"spring2021/lecture-12/#learn-more_1","title":"Learn More","text":"
                      • Abbeel et al., (2008). Learning For Control From Multiple Demonstrations

                      • Kolter, Ng. The Stanford LittleDog: A Learning And Rapid Replanning Approach To Quadrupled Locomotion (2008)

                      • Ziebart et al., (2008). Maximum Entropy Inverse Reinforcement Learning

                      • Schulman et al., (2013). Motion Planning with Sequential Convex Optimization and Convex Collision Checking

                      • Finn, Levine. Deep Visual Foresight for Planning Robot Motion (2016)

                      "},{"location":"spring2021/lecture-12/#6-domain-randomization","title":"6 - Domain Randomization","text":"

                      Simulated data collection is a logical substitute for expensive real data collection. It is less expensive, more scalable, and less dangerous (e.g., in the case of robots) to capture at scale. Given this logic, how can we make sure simulated data best matches real-world conditions?

                      "},{"location":"spring2021/lecture-12/#use-realistic-simulated-data","title":"Use Realistic Simulated Data","text":"

                      One approach is to make the simulator you use for training models as realistic as possible. Two variants of doing this are to carefully match the simulation to the world (James and John, 2016; Johns, Leutenegger, and Division, 2016; Mahler et al., 2017; Koenemann et al., 2015) and augment simulated data with real data (Richter et al., 2016; Bousmalis et al., 2017). While this option is logically appealing, it can be hard and slow to do in practice.

                      "},{"location":"spring2021/lecture-12/#domain-confusion","title":"Domain Confusion","text":"

                      Another option is domain confusion (Tzeng et al., 2014; Rusu et al., 2016).

                      • In this approach, suppose you train a model on real and simulated data at the same time.

                      • After completing training, a discriminator network examines the original network at some layer to understand if the original network is learning something about the real world.

                      • If you can fool the discriminator with the output of the layer, the original network has completely integrated its understanding of real and simulated data.

                      • In effect, there is no difference between simulated and real data to the original network, and the layers following the examined layer can be trained fully on simulated data.

                      "},{"location":"spring2021/lecture-12/#domain-randomization","title":"Domain Randomization","text":"

                      Finally, a simpler approach called domain randomization (Tobin et al., 2017; Sadeghi and Levine, 2016) has taken off of late. In this approach, rather than making simulated data fully realistic, the priority is to generate as much variation in the simulated data as possible. For example, in the below tabletop scenes, the dramatic variety of the scenes (e.g., background colors of green and purple) can help the model generalize well to the real world, even though the real world looks nothing like these scenes. This approach has shown promise in drone flight and pose estimation. The simple logic of more data leading to better performance in real-world settings is powerfully illustrated by domain randomization and obviates the need for existing variation methods like pre-training on ImageNet.

                      "},{"location":"spring2021/lecture-12/#7-deep-learning-for-science-and-engineering","title":"7 - Deep Learning For Science and Engineering","text":""},{"location":"spring2021/lecture-12/#alphafold","title":"AlphaFold","text":"

                      In other areas of this lecture, we\u2019ve been focusing on research areas of machine learning where humans already perform well (i.e., pose estimation or grasping). In science and engineering applications, we enter the realm of machine learning performing tasks humans cannot. The most famous result is AlphaFold, a Deepmind-created system that solved protein folding, an important biological challenge. In the CASP challenge, AlphaFold 2 far outpaced all other results in performance. AlphaFold is quite complicated, as it maps an input protein sequence to similar protein sequences and subsequently decides the folding structure based on the evolutionary history of complementary amino acids.

                      Other examples of DL systems solving science and engineering challenges are in circuit design, high-energy physics, and symbolic mathematics.

                      "},{"location":"spring2021/lecture-12/#learn-more_2","title":"Learn More","text":"
                      • AlphaFold: Improved protein structure prediction using potentials from deep learning. Deepmind (Senior et al.)

                      • BagNet: Berkeley Analog Generator with Layout Optimizer Boosted with Deep Neural Networks. K. Hakhamaneshi, N. Werblun, P. Abbeel, V. Stojanovic. IEEE/ACM International Conference on Computer-Aided Design (ICAD), Westminster, Colorado, November 2019.

                      • Evaluating Protein Transfer Learning with TAPE. R. Rao, N. Bhattacharya, N. Thomas, Y, Duan, X. Chen, J. Canny, P. Abbeel, Y. Song.

                      • Opening the black box: the anatomy of a deep learning atomistic potential. Justin Smith

                      • Exploring Machine Learning Applications to Enable Next-Generation Chemistry. Jennifer Wei (Google).

                      • GANs for HEP. Ben Nachman

                      • Deep Learning for Symbolic Mathematics. G. Lample and F. Charton.

                      • A Survey of Deep Learning for Scientific Discovery. Maithra Raghu, Eric Schmidt.

                      "},{"location":"spring2021/lecture-12/#8-overarching-research-theme","title":"8 - Overarching Research Theme","text":"

                      As compute scales to support incredible numbers of FLOPs, more science and engineering challenges will be solved with deep learning systems. There has been exponential growth in the amount of compute used to generate the most impressive research results like GPT-3.

                      As compute and data become more available, we open a new problem territory that we can refer to as deep learning to learn. More specifically, throughout history, the constraint on solving problems has been human ingenuity. This is a particularly challenging realm to contribute novel results to because we\u2019re competing against the combined intellectual might available throughout history. Is our present ingenuity truly greater than that of others 20-30 years ago, let alone 200-300? Probably not. However, our ability to bring new tools like compute and data most certainly is. Therefore, spending as much time in this new problem territory, where data and compute help solve problems, is likely to generate exciting and novel results more frequently in the long run.

                      "},{"location":"spring2021/lecture-12/#9-how-to-keep-up","title":"9 - How To Keep Up","text":"

                      \u201cGive a man a fish and you feed him for a day, teach a man to fish and you feed him for a lifetime\u201d (Lao Tzu)

                      Here are some tips on how to keep up with ML research:

                      • (Mostly) don\u2019t read (most) papers. There are just too many!

                      • When you do want to keep up, use the following:

                        • Tutorials at conferences: these capture the essence of important concepts in a practical, distilled way

                        • Graduate courses and seminars

                        • Yannic Kilcher YouTube channel

                        • Two Minutes Paper Channel

                        • The Batch by Andrew Ng

                        • Import AI by Jack Clark

                      • If you DO decide to read papers,

                        • Follow a principled process for reading papers

                        • Use Arxiv Sanity

                        • Twitter

                        • AI/DL Facebook Group

                        • ML Subreddit

                        • Start a reading group: read papers together with friends - either everyone reads then discusses, or one or two people read and give tutorials to others.

                      Finally, should you do a Ph.D. or not?

                      • You don\u2019t have to do a Ph.D. to work in AI!

                      • However, if you REALLY want to become one of the world\u2019s experts in a topic you care about, then a Ph.D. is a technically deep and demanding path to get there. Crudely speaking, a Ph.D. enables you to develop new tools and techniques rather than using existing tools and techniques.

                      "},{"location":"spring2021/lecture-13/","title":"Lecture 13: ML Teams and Startups","text":""},{"location":"spring2021/lecture-13/#video","title":"Video","text":""},{"location":"spring2021/lecture-13/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-13/#notes","title":"Notes","text":"

                      Download notes as PDF

                      Lecture by Josh Tobin. Notes transcribed by James Le and Vishnu Rachakonda.

                      Over the past few years, machine learning (ML) has grown tremendously. But as young as ML is as a discipline, the craft of managing an ML team is even younger. Many of today\u2019s ML managers were thrust into management roles out of necessity or because they were the best individual contributors, and many come from purely academic backgrounds. At some companies, engineering or product leaders are tasked with building new ML functions without real ML experience.

                      Running any technical team is hard:

                      • You have to hire great people.

                      • You need to manage and develop them.

                      • You need to manage your team\u2019s output and make sure your vectors are aligned.

                      • You would want to make good long-term technical choices and manage technical debt.

                      • You also must manage expectations from leadership.

                      Running an ML team is even harder:

                      • ML talents are expensive and scarce.

                      • ML teams have a diverse set of roles.

                      • ML projects have unclear timelines and high uncertainty.

                      • ML is also the \u201chigh-interest credit card of technical debt.\"

                      • Leadership often doesn\u2019t understand ML.

                      The goals of this lecture are two-fold: (1) to give you insight into how to think about building and managing ML teams (as a leader); and (2) to help you get a job in ML (as a newcomer).

                      "},{"location":"spring2021/lecture-13/#1-ml-roles","title":"1 - ML Roles","text":""},{"location":"spring2021/lecture-13/#common-roles","title":"Common Roles","text":"

                      Let\u2019s take a look at the most common ML roles and the skills they require:

                      • The ML Product Manager works with the ML team, other business functions, the end-users, and the data owners. This person designs documentation, creates wireframes, and comes up with the plan to prioritize and execute ML projects.

                      • The DevOps Engineer deploys and monitors production systems. This person handles the infrastructure that runs the deployed ML product using platforms like AWS or GCP.

                      • The Data Engineer builds data pipelines, aggregates and collects data from storage, and monitors data behavior. This person works with distributed systems using tools such as Hadoop, Kafka, Airflow.

                      • The ML Engineer trains and deploys prediction models. This person uses tools like TensorFlow and Docker to work with prediction systems running on real data in production.

                      • The ML Researcher trains prediction models, often those that are forward-looking or not production-critical. This person uses libraries like TensorFlow and PyTorch on notebook environments to build models and reports describing their experiments.

                      • The Data Scientist is a blanket term used to describe all of the roles above. In some organizations, this role entails answering business questions via analytics. He/she can work with wide-ranging tools from SQL and Excel to Pandas and Scikit-Learn.

                      "},{"location":"spring2021/lecture-13/#skills-required","title":"Skills Required","text":"

                      So what skills are needed for these roles? The chart above displays a nice visual, where the horizontal axis is the level of ML expertise and the size of the bubble is the level of communication and technical writing (the bigger, the better).

                      • The ML DevOps is primarily a software engineering role, which often comes from a standard software engineering pipeline.

                      • The Data Engineer belongs to the software engineering team that works actively with ML teams.

                      • The ML Engineer requires a rare mix of ML and Software Engineering skills. This person is either an engineer with significant self-teaching OR a science/engineering Ph.D. who works as a traditional software engineer after graduate school.

                      • The ML Researcher is an ML expert who usually has an MS or Ph.D. degree in Computer Science or Statistics or finishes an industrial fellowship program.

                      • The ML Product Manager is just like a traditional Product Manager, but with a deep knowledge of the ML development process and mindset.

                      • The Data Scientist role constitutes a wide range of backgrounds from undergraduate to Ph.D. students.

                      "},{"location":"spring2021/lecture-13/#2-ml-organizations","title":"2 - ML Organizations","text":""},{"location":"spring2021/lecture-13/#organization-archetypes","title":"Organization Archetypes","text":"

                      There exists not yet a consensus on the right way to structure an ML team. Still, a few best practices are contingent upon different organization archetypes and their ML maturity level. First, let\u2019s see what the different ML organization archetypes are.

                      "},{"location":"spring2021/lecture-13/#archetype-1-nascent-and-ad-hoc-ml","title":"Archetype 1 - Nascent and Ad-hoc ML","text":"
                      • These are organizations where no one is doing ML, or ML is done on an ad-hoc basis. Obviously, there is little ML expertise in-house.

                      • They are either small-to-medium businesses or less technology-forward large companies in industries like education or logistics.

                      • There is often low-hanging fruit for ML.

                      • But there is little support for ML projects, and it\u2019s challenging to hire and retain good talent.

                      "},{"location":"spring2021/lecture-13/#archetype-2-research-and-development-ml","title":"Archetype 2 - Research and Development ML","text":"
                      • These are organizations in which ML efforts are centered in the R&D arm of the organization. They often hire ML researchers and doctorate students with experience publishing papers.

                      • They are larger companies in sectors such as oil and gas, manufacturing, or telecommunications.

                      • They can hire experienced researchers and work on long-term business priorities to get big wins.

                      • However, it is very difficult to get quality data. Most often, this type of research work rarely translates into actual business value, so usually, the amount of investment remains small.

                      "},{"location":"spring2021/lecture-13/#archetype-3-product-embedded-ml","title":"Archetype 3 - Product-Embedded ML","text":"
                      • These are organizations where certain product teams or business units have ML expertise alongside their software or analytics talent. These ML individuals report up to the team\u2019s engineering/tech lead.

                      • They are either software companies or financial services companies.

                      • ML improvements are likely to lead to business value. Furthermore, there is a tight feedback cycle between idea iteration and product improvement.

                      • Unfortunately, it is still very hard to hire and develop top talent, and access to data and compute resources can lag. There are also potential conflicts between ML project cycles and engineering management, so long-term ML projects can be hard to justify.

                      "},{"location":"spring2021/lecture-13/#archetype-4-independent-ml-division","title":"Archetype 4 - Independent ML Division","text":"
                      • These are organizations in which the ML division reports directly to senior leadership. The ML Product Managers work with Researchers and Engineers to build ML into client-facing products. They can sometimes publish long-term research.

                      • They are often large financial services companies.

                      • Talent density allows them to hire and train top practitioners. Senior leaders can marshal data and compute resources. This gives the organizations to invest in tooling, practices, and culture around ML development.

                      • A disadvantage is that model handoffs to different business lines can be challenging since users need the buy-in to ML benefits and get educated on the model use. Also, feedback cycles can be slow.

                      "},{"location":"spring2021/lecture-13/#archetype-5-ml-first","title":"Archetype 5 - ML-First","text":"
                      • These are organizations in which the CEO invests in ML, and there are experts across the business focusing on quick wins. The ML division works on challenging and long-term projects.

                      • They are large tech companies and ML-focused startups.

                      • They have the best data access (data thinking permeates the organization), the most attractive recruiting funnel (challenging ML problems tends to attract top talent), and the easiest deployment procedure (product teams understand ML well enough).

                      • This type of organization archetype is hard to implement in practice since it is culturally difficult to embed ML thinking everywhere.

                      "},{"location":"spring2021/lecture-13/#team-structure-design-choices","title":"Team Structure Design Choices","text":"

                      Depending on the above archetype that your organization resembles, you can make the appropriate design choices, which broadly speaking follow these three categories:

                      1. Software Engineer vs. Research: To what extent is the ML team responsible for building or integrating with software? How important are Software Engineering skills on the team?

                      2. Data Ownership: How much control does the ML team have over data collection, warehousing, labeling, and pipelining?

                      3. Model Ownership: Is the ML team responsible for deploying models into production? Who maintains the deployed models?

                      Below are our design suggestions:

                      If your organization focuses on ML R&D:

                      • Research is most definitely prioritized over Software Engineering skills. Because of this, there would potentially be a lack of collaboration between these two groups.

                      • ML team has no control over the data and typically will not have data engineers to support them.

                      • ML models are rarely deployed into production.

                      If your organization has ML embedded into the product:

                      • Software Engineering skills will be prioritized over Research skills. Often, the researchers would need strong engineering skills since everyone would be expected to product-ionize his/her models.

                      • ML teams generally do not own data production and data management. They will need to work with data engineers to build data pipelines.

                      • ML engineers totally own the models that they deploy into production.

                      If your organization has an independent ML division:

                      • Each team has a potent mix of engineering and research skills; therefore, they work closely together within teams.

                      • ML team has a voice in data governance discussions, as well as a robust data engineering function.

                      • ML team hands-off models to users but is still responsible for maintaining them.

                      If your organization is ML-First:

                      • Different teams are more or less research-oriented, but in general, research teams collaborate closely with engineering teams.

                      • ML team often owns the company-wide data infrastructure.

                      • ML team hands the models to users, who are responsible for operating and maintaining them.

                      The picture below neatly sums up these suggestions:

                      "},{"location":"spring2021/lecture-13/#3-managing-ml-teams","title":"3 - Managing ML Teams","text":""},{"location":"spring2021/lecture-13/#managing-ml-teams-is-challenging","title":"Managing ML Teams Is Challenging","text":"

                      The process of actually managing an ML team is quite challenging for four reasons:

                      1. Engineering Estimation: It\u2019s hard to know how easy or hard an ML project is in advance. As you explore the data and experiment with different models, there is enormous scope for new learnings about the problem that materially impact the timeline. Furthermore, knowing what methods will work is often impossible. This makes it hard to say upfront how long or how much work may go into an ML project.

                      2. Nonlinear Progress: As the chart below from a blog post by Lukas Biewald (CEO of Weights and Biases) shows, progress on ML projects is unpredictable over time, even when the effort expended grows considerably. It\u2019s very common for projects to stall for extended periods of time.

                      1. Cultural gaps: The relative culture of engineering and research professionals is very different. Research tends to favor novel, creative ideas, while engineering prefers tried and true methods that work. As a result, ML teams often experience a clash of cultures, which can turn toxic if not appropriately managed. A core challenge of running ML teams is addressing the cultural barriers between ML and software engineering so that teams can harmoniously experiment and deliver ML products.

                      2. Leadership Deficits: It\u2019s common to see a lack of detailed understanding of ML at senior levels of management in many companies. As a result, expressing feasibility and setting the right expectations for ML projects, especially high-priority ones, can be hard.

                      "},{"location":"spring2021/lecture-13/#how-to-manage-ml-teams-better","title":"How To Manage ML Teams Better","text":"

                      Managing ML teams is hardly a solved problem, but you can take steps to improve the process.

                      "},{"location":"spring2021/lecture-13/#plan-probabilistically","title":"Plan Probabilistically","text":"

                      Many engineering projects are managed in a waterfall fashion, with the sequential tasks defined up front clearly. Instead of forcing this method of engineering management on difficult ML projects, try assigning a likelihood of success to different tasks to better capture the experimental process inherent to ML engineering. As these tasks progress or stall, rapidly re-evaluate your task ordering to better match what is working. Having this sense of both (1) how likely a task is to succeed and (2) how important it is makes project planning considerably more realistic.

                      "},{"location":"spring2021/lecture-13/#have-a-portfolio-of-approaches","title":"Have A Portfolio Of Approaches","text":"

                      Embrace multiple ideas and approaches to solve crucial research challenges that gate production ML. Don\u2019t make your plan dependent on one approach working!

                      "},{"location":"spring2021/lecture-13/#measure-inputs-not-results","title":"Measure Inputs, Not Results","text":"

                      As you work through several approaches in your portfolio, do not overly emphasize whose ideas ultimately work as a reflection of contribution quality. This can negatively impact team members\u2019 creativity, as they focus more on trying to find only what they currently think could work, rather than experimenting in a high-quality fashion (which is ultimately what leads to ML success).

                      "},{"location":"spring2021/lecture-13/#have-researchers-and-engineers-work-together","title":"Have Researchers and Engineers Work Together","text":"

                      The collaboration between engineering and research is essential for quality ML products to get into production. Emphasize collaboration across the groups and professionals!

                      "},{"location":"spring2021/lecture-13/#get-end-to-end-pipelines-together-quickly-to-demonstrate-quick-wins","title":"Get End-to-end Pipelines Together Quickly to Demonstrate Quick Wins","text":"

                      Taking this approach makes it more likely that your ML project will succeed in the long term. It allows you to demonstrate progress to your leadership more effectively and clearly.

                      "},{"location":"spring2021/lecture-13/#educate-leadership-on-ml-timeline-uncertainty","title":"Educate Leadership on ML Timeline Uncertainty","text":"

                      This can be hard, as leadership is ultimately accountable for addressing blind spots and understanding timeline risk. There are things you can do, however, to help improve leadership\u2019s knowledge about ML timelines. Avoid building hype around narrow progress metrics material only to the ML team (e.g., \u201cWe improved F1 score by 0.2 and have achieved awesome performance!\u201d). Instead, be realistic, communicate risk, and emphasize real product impact (e.g., \u201cOur model improvements should increase the number of conversions by 10%, though we must continue to validate its performance on additional demographic factors.) Sharing resources like this a16z primer and this class from Prof. Pieter Abbeel can increase awareness of your company\u2019s leadership.

                      "},{"location":"spring2021/lecture-13/#4-hiringgetting-hired","title":"4 - Hiring/Getting Hired","text":""},{"location":"spring2021/lecture-13/#the-ai-talent-gap","title":"The AI Talent Gap","text":"

                      With the novelty of ML systems, it\u2019s fair to say that not many people have built real ML systems. Estimates vary from as few as 10,000 (Element AI) to as many as 200-300,000 people (Tencent). Whatever way you slice the numbers (contained in this blog post), the reality is that there is not much-experienced talent in the AI/ML field, especially compared to the number of trained software developers in the US (3.6M) or in the world (18.2M).

                      "},{"location":"spring2021/lecture-13/#sourcing","title":"Sourcing","text":"

                      Because of this shallow talent pool and the skyrocketing demand, hiring for ML positions is pretty hard. Typical ML roles come in the following structure:

                      • ML Adjacent roles: ML product manager, DevOps, Data Engineer

                      • Core ML Roles: ML Engineer, ML Research/ML Scientist

                      • Business analytics roles: Data Scientist

                      For ML adjacent roles, traditional ML knowledge is less important, as demonstrated interest, conversational understanding, and experience can help these professionals play an impactful role on ML teams. Let\u2019s focus on how to hire for the core ML roles.

                      While there\u2019s no perfect way to hire ML engineers, there\u2019s definitely a wrong way to hire them, with extensive job descriptions that demand only the best qualifications. Certainly, there are many good examples of this bad practice floating around.

                      • Rather than this unrealistic process, consider hiring for software engineering skills, an interest in ML, and a desire to learn. You can always train people in the art and science of ML, especially when they come with strong software engineering fundamentals.

                      • Another option is to consider adding junior talent, as many recent grads come out with good ML knowledge nowadays.

                      • Finally, and most importantly, be more specific about what you need the position and professional to do. It\u2019s impossible to find one person that can do everything from full-fledged DevOps to algorithm development.

                      To hire ML researchers, here are our tips:

                      • Evaluate the quality of publications, over the quantity, with an eye towards the originality of the ideas, the execution, etc.

                      • Prioritize researchers that focus on important problems instead of trendy problems.

                      • Experience outside academia is also a positive, as these researchers may be able to transition to industry more effectively.

                      • Finally, keep an open mind about research talent and consider talented people without PhDs or from adjacent fields like physics, statistics, etc.

                      To find quality candidates for these roles, some ideas for sourcing are:

                      • To experiment with standard job recruiting avenues like LinkedIn, Hired, recruiters, on-campus-recruiting, etc.

                      • To monitor arXiv and top conferences and flag first authors of papers you like.

                      • To look for good implementations of papers you like.

                      • To attend ML research conferences (NeurIPS, ICML, etc.)

                      As you seek to recruit, stay on top of what professionals want and make an effort to position your company accordingly. ML practitioners want to be empowered to do great work with interesting data. Building a culture of learning and impact can help recruit the best talent to your team. Additionally, sell sell sell! Talent needs to know how good your team is and how meaningful the mission can be.

                      "},{"location":"spring2021/lecture-13/#interviewing","title":"Interviewing","text":"

                      As you interview candidates for ML roles, try to validate your hypotheses of their strengths while testing a minimum bar on weaker aspects. For example, make sure ML researchers can think creatively about new ML problems while ensuring they meet a baseline for code quality. It\u2019s essential to test both ML knowledge and software engineering skill for all industry professionals, though the relative strengths can vary.

                      The actual ML interview process is much less well-defined than software engineering interviews, though it is modeled off of it. Some helpful inclusions are projects or exercises that test the ability to work with ML-specific code, like take-home ML projects.

                      "},{"location":"spring2021/lecture-13/#finding-a-job","title":"Finding A Job","text":"

                      To find an ML job, you can take a look at the following sources:

                      • Standard sources such as LinkedIn, recruiters, on-campus recruiting, etc.

                      • ML research conferences (NeurIPS, ICLR, ICML).

                      • Apply directly (remember, there\u2019s a talent gap!).

                      Standing out for competitive roles can be tricky! Here are some tips in increasing order of impressiveness that you can apply to differentiate yourself:

                      1. Build software engineering skills (e.g., at a well-known software company).

                      2. Exhibit ML interest (e.g., conference attendance, online courses certificates, etc.).

                      3. Show you have a broad knowledge of ML (e.g., write blog posts synthesizing a research area).

                      4. Demonstrate ability to get ML projects done (e.g., create side projects, re-implement papers).

                      5. Prove you can think creatively in ML (e.g., win Kaggle competitions, publish papers).

                      As you prepare for interviews, prepare for both the traditional ML theoretical topics and the general software engineering interview (e.g., read Cracking the Coding Interview).

                      "},{"location":"spring2021/lecture-13/#5-conclusion","title":"5 - Conclusion","text":"

                      Being a new and evolving discipline for most traditional organizations, forming ML teams is full of known and unknown challenges. Here are the final few take-homes:

                      • There are many different skills involved in production ML, so there are opportunities for many people to contribute.

                      • ML teams are becoming more standalone and more interdisciplinary.

                      • Managing ML teams is complex. There is no silver bullet, but shifting toward probabilistic planning can help.

                      • ML talent is scarce. As a manager, be specific about what skills are must-have in the ML job descriptions. As a job seeker, it can be brutally challenging to break in as an outsider, so use projects as a signal to build awareness.

                      "},{"location":"spring2021/lecture-2a/","title":"Lecture 2A: CNNs","text":""},{"location":"spring2021/lecture-2a/#video","title":"Video","text":""},{"location":"spring2021/lecture-2a/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-2a/#notes","title":"Notes","text":"

                      Lecture by Sergey Karayev.

                      In this video, we first review convolution operation, the most basic property of Convolutional Neural Networks. Then, we look at other important operations for ConvNets. Finally, we transition to looking at a classic ConvNet architecture called LeNet.

                      • 00:00 - Introduction
                      • 01:08 - Convolutional Filters
                      • 07:10 - Filter Stacks and ConvNets
                      • 11:25 - Strides and Padding
                      • 14:35 - Filter Math
                      • 21:44 - Convolution Implementation Notes
                      • 24:04 - Increasing the Receptive Field with Dilated Convolutions
                      • 27:30 - Decreasing the Tensor Size with Pooling and 1x1-Convolutions
                      • 30:54 - LeNet Architecture
                      "},{"location":"spring2021/lecture-2b/","title":"Lecture 2B: Computer Vision","text":""},{"location":"spring2021/lecture-2b/#video","title":"Video","text":""},{"location":"spring2021/lecture-2b/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-2b/#notes","title":"Notes","text":"

                      Lecture by Sergey Karayev.

                      In this video, we will review notable applications of deep learning in computer vision. First, we will tour some ConvNet architectures. Then, we will talk about localization, detection, and segmentation problems. We will conclude with more advanced methods.

                      Learn more at this website: https://paperswithcode.com/area/computer-vision

                      • 00:00 - Introduction
                      • 02:51 - AlexNet
                      • 05:09 - ZFNet
                      • 06:54 - VGGNet
                      • 09:06 - GoogLeNet
                      • 11:57 - ResNet
                      • 15:15 - SqueezeNet
                      • 17:05 - Architecture Comparisons
                      • 20:00 - Localization, Detection, and Segmentation Tasks
                      • 24:00 - Overfeat, YOLO, and SSD Methods
                      • 28:01 - Region Proposal Methods (R-CNN, Faster R-CNN, Mask R-CNN, U-Net)
                      • 34:33 - Advanced Tasks (3D Shape Inference, Face Landmark Recognition, and Pose Estimation)
                      • 37:00 - Adversarial Attacks
                      • 40:56 - Style Transfer
                      "},{"location":"spring2021/lecture-3/","title":"Lecture 3: RNNs","text":""},{"location":"spring2021/lecture-3/#video","title":"Video","text":""},{"location":"spring2021/lecture-3/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-3/#notes","title":"Notes","text":"

                      Lecture by Josh Tobin.

                      • 00:00 - Introduction
                      • 01:34 - Sequence Problems
                      • 06:28 - Review of RNNs
                      • 22:00 - Vanishing Gradient Issue
                      • 27:52 - LSTMs and Its Variants
                      • 34:10 - Bidirectionality and Attention from Google's Neural Machine Translation
                      • 46:38 - CTC Loss
                      • 52:12 - Pros and Cons of Encoder-Decoder LSTM Architectures
                      • 54:55 - WaveNet
                      "},{"location":"spring2021/lecture-4/","title":"Lecture 4: Transformers","text":""},{"location":"spring2021/lecture-4/#video","title":"Video","text":""},{"location":"spring2021/lecture-4/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-4/#notes","title":"Notes","text":"

                      Lecture by Sergey Karayev.

                      In this video, you will learn about the origin of transfer learning in computer vision, its application in NLP in the form of embedding, NLP's ImageNet moment, and the Transformers model families.

                      • 00:00 - Introduction
                      • 00:42 - Transfer Learning in Computer Vision
                      • 04:00 - Embeddings and Language Models
                      • 10:09 - NLP's ImageNet moment: ELMO and ULMFit on datasets like SQuAD, SNLI, and GLUE
                      • 16:49 - Rise of Transformers
                      • 18:20 - Attention in Detail: (Masked) Self-Attention, Positional Encoding, and Layer Normalization
                      • 27:33 - Transformers Variants: BERT, GPT/GPT-2/GPT-3, DistillBERT, T5, etc.
                      • 36:20 - GPT3 Demos
                      • 42:53 - Future Directions
                      "},{"location":"spring2021/lecture-5/","title":"Lecture 5: ML Projects","text":"

                      Learn how to set up Machine Learning projects like a pro. This includes an understanding of the ML lifecycle, an acute mind of the feasibility and impact, an awareness of the project archetypes, and an obsession with metrics and baselines.

                      "},{"location":"spring2021/lecture-5/#video","title":"Video","text":""},{"location":"spring2021/lecture-5/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-5/#notes","title":"Notes","text":"

                      Lecture by Josh Tobin. Notes transcribed by James Le and Vishnu Rachakonda.

                      "},{"location":"spring2021/lecture-5/#1-why-do-ml-projects-fail","title":"1 - Why Do ML Projects Fail?","text":"

                      Based on a report from TechRepublic a few years back, despite increased interest in adopting machine learning (ML) in the enterprise, 85% of machine learning projects ultimately fail to deliver on their intended promises to business. Failure can happen for many reasons; however, a few glaring dangers will cause any AI project to crash and burn.

                      • ML is still very much a research endeavor. Therefore it is very challenging to aim for a 100% success rate.

                      • Many ML projects are technically infeasible or poorly scoped.

                      • Many ML projects never leap production, thus getting stuck at the prototype phase.

                      • Many ML projects have unclear success criteria because of a lack of understanding of the value proposition.

                      • Many ML projects are poorly managed because of a lack of interest from leadership.

                      "},{"location":"spring2021/lecture-5/#2-lifecycle","title":"2 - Lifecycle","text":"

                      It\u2019s essential to understand what constitutes all of the activities in a machine learning project. Typically speaking, there are four major phases:

                      1. Planning and Project Setup: At this phase, we want to decide the problem to work on, determine the requirements and goals, figure out how to allocate resources properly, consider the ethical implications, etc.

                      2. Data Collection and Labeling: At this phase, we want to collect training data and potentially annotate them with ground truth, depending on the specific sources where they come from. We may find that it\u2019s too hard to get the data, or it might be easier to label for a different task. If that\u2019s the case, go back to phase 1.

                      3. Model Training and Model Debugging: At this phase, we want to implement baseline models quickly, find and reproduce state-of-the-art methods for the problem domain, debug our implementation, and improve the model performance for specific tasks. We may realize that we need to collect more data or that labeling is unreliable (thus, go back to phase 2). Or we may recognize that the task is too challenging and there is a tradeoff between project requirements (thus, go back to phase 1).

                      4. Model Deploying and Model Testing: At this phase, we want to pilot the model in a constrained environment (i.e., in the lab), write tests to prevent regressions, and roll the model into production. We may see that the model doesn\u2019t work well in the lab, so we want to keep improving the model\u2019s accuracy (thus, go back to phase 3). Or we may want to fix the mismatch between training data and production data by collecting more data and mining hard cases (thus go back to phase 2). Or we may find out that the metric picked doesn\u2019t actually drive downstream user behavior, and/or performance in the real world isn\u2019t great. In such situations, we want to revisit the projects\u2019 metrics and requirements (thus, go back to phase 1).

                      Besides the per-project activities mentioned above, there are two other things that any ML team will need to solve across any projects they get involved with: (1) building the team and hiring people; and (2) setting up infrastructure and tooling to build ML systems repeatedly and at scale.

                      Additionally, it might be useful to understand state-of-the-art results in your application domain so that you know what\u2019s possible and what to try next.

                      "},{"location":"spring2021/lecture-5/#3-prioritizing-projects","title":"3 - Prioritizing Projects","text":"

                      To prioritize projects to work on, you want to find high-impact problems and assess the potential costs associated with them. The picture below shows a general framework that encourages us to target projects with high impact and high feasibility.

                      "},{"location":"spring2021/lecture-5/#high-impact","title":"High Impact","text":"

                      There are no silver bullets to find high-impact ML problems to work on, but here are a few useful mental models:

                      • Where can you take advantage of cheap prediction?

                      • Where is there friction in your product?

                      • Where can you automate complicated manual processes?

                      • What are other people doing?

                      "},{"location":"spring2021/lecture-5/#cheap-prediction","title":"Cheap Prediction","text":"

                      In the book \u201cPrediction Machines,\u201d the authors (Ajay Agrawal, Joshua Gans, and Avi Goldfarb) come up with an excellent mental model on the economics of Artificial Intelligence: As AI reduces the cost of prediction and prediction is central for decision making, cheap predictions would be universal for problems across business domains. Therefore, you should look for projects where cheap predictions will have a huge business impact.

                      "},{"location":"spring2021/lecture-5/#product-needs","title":"Product Needs","text":"

                      Another lens is to think about what your product needs. In the article \u201cThree Principles for Designing ML-Powered Products,\u201d the Spotify Design team emphasizes the importance of building ML from a product perspective and looking for parts of the product experience with high friction. Automating those parts is exactly where there is a lot of impact for ML to make your business better.

                      "},{"location":"spring2021/lecture-5/#ml-strength","title":"ML Strength","text":"

                      In his popular blog post \u201cSoftware 2.0,\u201d Andrej Karpathy contrasts software 1.0 (which are traditional programs with explicit instructions) and software 2.0 (where humans specify goals, while the algorithm searches for a program that works). Software 2.0 programmers work with datasets, which get compiled via optimization\u200a\u2014\u200awhich works better, more general, and less computationally expensive. Therefore, you should look for complicated rule-based software where we can learn the rules instead of programming them.

                      "},{"location":"spring2021/lecture-5/#inspiration-from-others","title":"Inspiration From Others","text":"

                      Instead of reinventing the wheel, you can look at what other companies are doing. In particular, check out papers from large frontier organizations (Google, Facebook, Nvidia, Netflix, etc.) and blog posts from top earlier-stage companies (Uber, Lyft, Spotify, Stripe, etc.).

                      Here is a list of excellent ML use cases to check out (credit to Chip Huyen\u2019s ML Systems Design Lecture 2 Note):

                      • Human-Centric Machine Learning Infrastructure at Netflix (Ville Tuulos, InfoQ 2019)

                      • 2020 state of enterprise machine learning (Algorithmia, 2020)

                      • Using Machine Learning to Predict Value of Homes On Airbnb (Robert Chang, Airbnb Engineering & Data Science, 2017)

                      • Using Machine Learning to Improve Streaming Quality at Netflix (Chaitanya Ekanadham, Netflix Technology Blog, 2018)

                      • 150 Successful Machine Learning Models: 6 Lessons Learned at Booking.com (Bernardi et al., KDD, 2019)

                      • How we grew from 0 to 4 million women on our fashion app, with a vertical machine learning approach (Gabriel Aldamiz, HackerNoon, 2018)

                      • Machine Learning-Powered Search Ranking of Airbnb Experiences (Mihajlo Grbovic, Airbnb Engineering & Data Science, 2019)

                      • From shallow to deep learning in fraud (Hao Yi Ong, Lyft Engineering, 2018)

                      • Space, Time and Groceries (Jeremy Stanley, Tech at Instacart, 2017)

                      • Creating a Modern OCR Pipeline Using Computer Vision and Deep Learning (Brad Neuberg, Dropbox Engineering, 2017)

                      • Scaling Machine Learning at Uber with Michelangelo (Jeremy Hermann and Mike Del Balso, Uber Engineering, 2019)

                      • Spotify\u2019s Discover Weekly: How machine learning finds your new music (Sophia Ciocca, 2017)

                      "},{"location":"spring2021/lecture-5/#high-feasibility","title":"High Feasibility","text":"

                      The three primary cost drivers of ML projects in order of importance are data availability, accuracy requirement, and problem difficulty.

                      "},{"location":"spring2021/lecture-5/#data-availability","title":"Data Availability","text":"

                      Here are the questions you need to ask concerning the data availability:

                      • How hard is it to acquire data?

                      • How expensive is data labeling?

                      • How much data will be needed?

                      • How stable is the data?

                      • What are the data security requirements?

                      "},{"location":"spring2021/lecture-5/#accuracy-requirement","title":"Accuracy Requirement","text":"

                      Here are the questions you need to ask concerning the accuracy requirement:

                      • How costly are wrong predictions?

                      • How frequently does the system need to be right to be useful?

                      • What are the ethical implications?

                      It is worth noting that ML project costs tend to scale super-linearly in the accuracy requirement. The fundamental reason is that you typically need a lot more data and more high-quality labels to achieve high accuracy numbers.

                      "},{"location":"spring2021/lecture-5/#problem-difficulty","title":"Problem Difficulty","text":"

                      Here are the questions you need to ask concerning the problem difficulty:

                      • Is the problem well-defined?

                      • Is there good published work on similar problems?

                      • What are the computing requirements?

                      • Can a human do it?

                      So what\u2019s still hard in machine learning? As a caveat, it\u2019s historically very challenging to predict what types of problems will be difficult for ML to solve in the future. But generally speaking, both unsupervised learning and reinforcement learning are still hard, even though they show promise in limited domains where tons of data and compute are available.

                      Zooming into supervised learning, here are three types of hard problems:

                      • Output is complex: These are problems where the output is high-dimensional or ambiguous. Examples include 3D reconstruction, video prediction, dialog systems, open-ended recommendation systems, etc.

                      • Reliability is required: These are problems where high precision and robustness are required. Examples include systems that can fail safely in out-of-distribution scenarios, is robust to adversarial attacks, or needs to tackle highly precise tasks.

                      • Generalization is required: These are problems with out-of-distribution data or in the domains of reasoning, planning, and causality. Examples include any systems for self-driving vehicles or any systems that deal with small data.

                      Finally, this is a nice checklist for you to run an ML feasibility assessment:

                      • Are you sure that you need ML at all?

                      • Put in the work upfront to define success criteria with all of the stakeholders.

                      • Consider the ethics of using ML.

                      • Do a literature review.

                      • Try to build a labeled benchmark dataset rapidly.

                      • Build a minimal viable product with manual rules

                      • Are you \u201creally sure\u201d that you need ML at all?

                      "},{"location":"spring2021/lecture-5/#4-archetypes","title":"4 - Archetypes","text":"

                      So far, we\u2019ve talked about the lifecycle and the impact of all machine learning projects. Ultimately, we generally want these projects, or applications of machine learning, to be useful for products. As we consider how ML can be applied in products, it\u2019s helpful to note that there are common machine learning product archetypes or recurrent patterns through which machine learning is applied to products. You can think of these as \u201cmental models\u201d you can use to assess your project and easily prioritize the needed resources.

                      There are three common archetypes in machine learning projects: Software 2.0, Human-in-the-loop, and autonomous systems. They are shown in the table below, along with common examples and questions. We\u2019ll dive deeper into each.

                      Archetype Examples Questions Software 2.0

                      - Improve code completion in IDE

                      - Build customized recommendation system

                      - Build a better video game AI

                      - Do your models truly improve performance?

                      - Does performance improvement generate business value?

                      - Do performance improvements lead to a data flywheel?

                      Human-in-the-loop

                      - Turn sketches into slides

                      - Email auto-completion

                      - Help radiologists do job faster

                      - How good does the system need to be to be useful?

                      - How can you collect enough data to make it good?

                      Autonomous Systems

                      - Full self-driving

                      - Automated customer support

                      - Automated website design

                      - What is an acceptable failure rate for the system?

                      - How can you guarantee that it won\u2019t exceed the failure rate?

                      - How inexpensively can you label data from the system?

                      "},{"location":"spring2021/lecture-5/#software-20","title":"Software 2.0","text":"

                      Software 2.0, which we previously alluded to from the Karpathy article, is defined as \u201caugmenting existing rules-based or deterministic software with machine learning, a probabilistic approach.\u201d Examples of this are taking a code completer in an IDE and improving the experience for the user by adding an ML component. Rather than suggesting a command based solely on the leading characters the programmer has written, you might add a model that suggests commands based on previous commands the programmer has written.

                      As you build a software 2.0 project, strongly consider the concept of the data flywheel. For certain ML projects, as you improve your model, your product will get better and more users will engage with the product, thereby generating more data for the model to get even better. It\u2019s a classic virtuous cycle and truly the gold standard for ML projects.

                      In embarking on creating a data flywheel, critically consider where the model could fail in relation to your product. For example, do more users lead to collecting more data that is useful for improving your model? An actual system needs to be set up to capture this data and ensure that it's meaningful for the ML lifecycle. Furthermore, consider whether more data will lead to a better model (your job as an ML practitioner) or whether a better model and better predictions will actually lead to making the product better. Ideally, you should have a quantitative assessment of what makes your product \u201cbetter\u201d and map model improvement to it.

                      "},{"location":"spring2021/lecture-5/#human-in-the-loop-hil","title":"Human-in-the-Loop (HIL)","text":"

                      HIL systems are defined as machine learning systems where the output of your model will be reviewed by a human before being executed in the real world. For example, consider translating sketches into slides. An ML algorithm can take a sketch\u2019s input and suggest to a user a particular slide design. Every output of the ML model is considered and executed upon by a human, who ultimately has to decide on the slide\u2019s design.

                      "},{"location":"spring2021/lecture-5/#autonomous-systems","title":"Autonomous Systems","text":"

                      Autonomous systems are defined as machine learning systems where the system itself makes decisions or engages in outputs that are almost never reviewed by a human. Canonically, consider the self-driving car!

                      "},{"location":"spring2021/lecture-5/#feasibility","title":"Feasibility","text":"

                      Let\u2019s discuss how the product archetypes relate back to project priority. In terms of feasibility and impact, the two axes on which we consider priority, software 2.0 tends to have high feasibility but potentially low impact. The existing system is often being optimized rather than wholly replaced. However, this status with respect to priority is not static by any means. Building a data flywheel into your software 2.0 project can improve your product\u2019s impact by improving the model\u2019s performance on the task and future ones.

                      In the case of human-in-the-loop systems, their feasibility and impact sit squarely in between autonomous systems and software 2.0. HIL systems, in particular, can benefit disproportionately in their feasibility and impact from effective product design, which naturally takes into account how humans interact with technology and can mitigate risks for machine learning model behavior. Consider how the Facebook photo tagging algorithm is implemented. Rather than tagging the user itself, the algorithm frequently asks the user to tag themselves. This effective product design allows the model to perform more effectively in the user\u2019s eye and reduces the impact of false classifications. Grammarly similarly solicits user input as part of its product design through offering explanations. Finally, recommender systems also implement this idea. In general, good product design can smooth the rough edges of ML (check out the concept of designing collaborative AI).

                      There are industry-leading resources that can help you merge product design and ML. Apple\u2019s ML product design guidelines suggest three key questions to anyone seeking to put ML into a product:

                      1. What role does ML play in your product?

                      2. How can you learn from your users?

                      3. How should your app handle mistakes?

                      Associated with each question is a set of design paradigms that help address the answers to each question. There are similarly great resources from Microsoft and Spotify.

                      Finally, autonomous systems can see their priority improved by improving their feasibility. Specifically, you can add humans in the loop or reduce the system\u2019s natural autonomy to improve its feasibility. In the case of self-driving cars, many companies add safety drivers as guardrails to improve autonomous systems. In Voyage\u2019s case, they take a more dramatic approach of constraining the problem for the autonomous system: they only run self-driving cars in senior living communities, a narrow subset of the broader self-driving problem.

                      "},{"location":"spring2021/lecture-5/#5-metrics","title":"5 - Metrics","text":"

                      So far, we\u2019ve talked about the overall ideas around picking projects and structuring them based on their archetypes and the specific considerations that go into them. Now, we\u2019ll shift gears and be a little more tactical to focus on metrics and baselines, which will help you execute projects more effectively.

                      "},{"location":"spring2021/lecture-5/#choosing-a-metric","title":"Choosing a Metric","text":"

                      Metrics help us evaluate models. There\u2019s a delicate balance between the real world (which is always messy and multifaceted) and the machine learning paradigm (which optimizes a single metric) in choosing a metric. In practical production settings, we often care about multiple dimensions of performance (i.e., accuracy, speed, cost, etc.). The challenge is to reconcile all the possible evaluation methods with the reality that ML systems work best at optimizing a single number. How can we balance these competing needs in building an ML product?

                      As you start evaluating models, choose a single metric to focus on first, such as precision, accuracy, recall, etc. This can serve as an effective first filter of performance. Subsequently, you can put together a formula that combines all the metrics you care about. Note that it\u2019s important to be flexible and regularly update this formula as your models or the requirements for the product change.

                      "},{"location":"spring2021/lecture-5/#combining-metrics","title":"Combining Metrics","text":"

                      Two simple ways of combining metrics into a formula are averaging and thresholding.

                      Averaging is less common but easy and intuitive; you can just take a simple average or a weighted average of the model\u2019s metrics and pick the highest average.

                      More practically, you can apply a threshold evaluation to the model\u2019s metrics. In this method, out of n evaluation metrics, you threshold n-1 and optimize the nth metric. For example, if we look at a model\u2019s precision, memory requirement, and cost to train, we might threshold the memory requirement (no more than X MB) and the cost (no more than $X) and optimize precision (as high as possible). As you choose which metrics to threshold and what to set their threshold values to, make sure to consider domain-specific needs and the actual values of the metrics (how good/bad they might be).

                      "},{"location":"spring2021/lecture-5/#6-baselines","title":"6 - Baselines","text":"

                      In any product development process, setting expectations properly is vital. For machine learning products, baselines help us set expectations for how well our model will perform. In particular, baselines set a useful lower bound for our model\u2019s performance. What\u2019s the minimum expectation we should have for a model\u2019s performance? The better defined and clear the baseline is, the more useful it is for setting the right expectations. Examples of baselines are human performance on a similar task, state-of-the-art models, or even simple heuristics.

                      Baselines are especially important for helping decide the next steps. Consider the example below of two models with the same loss curve but differing performance with respect to the baseline. Clearly, they require different action items! As seen below, on the left, where we are starting to approach or exceed the baseline, we need to be mindful of overfitting and perhaps incorporate regularization of some sort. On the right, where the baseline hugely exceeds our model\u2019s performance, we clearly have a lot of work to do to improve the model and address its underfitting.

                      There are a number of sources to help us define useful baselines. Broadly speaking, there are external baselines (baselines defined by others) or internal baselines you can define yourself. With internal baselines, in particular, you don\u2019t need anything too complicated, or even something with ML! Simple tests like averaging across your dataset can help you understand if your model is achieving meaningful performance. If your model can\u2019t exceed a simple baseline like this, you might need to really re-evaluate the model.

                      Human baselines are a particularly powerful form of baseline since we often seek to replace or augment human actions. In creating these baselines, note that there\u2019s usually an inverse relationship between the quality of the baseline and the ease of data collection. In a nutshell, the harder it is to get a human baseline, the better and more useful it probably is.

                      For example, a Mechanical Turk-created baseline is easy to generate nowadays, but the quality might be hit or miss because of the variance in annotators. However, trained, specialized annotators can be hard to acquire, but the specificity of their knowledge translates into a great baseline. Choosing where to situate your baseline on this range, from low quality/easy to high quality/hard, depends on the domain. Concentrating data collection strategically, ideally in classes where the model is least performant, is a simple way of improving the quality of the baseline.

                      "},{"location":"spring2021/lecture-5/#tldr","title":"TLDR","text":"
                      1. Machine learning projects are iterative. Deploy something fast to begin the cycle.

                      2. Choose projects with high impact and low cost of wrong predictions.

                      3. The secret sauce to make projects work well is to build automated data flywheels.

                      4. In the real world, you care about many things, but you should always have just one to work on.

                      5. Good baselines help you invest your effort the right way.

                      "},{"location":"spring2021/lecture-5/#further-resources","title":"Further Resources","text":"
                      • Andrew Ng\u2019s \u201cMachine Learning Yearning\u201d

                      • Andrej Kaparthy\u2019s \u201cSoftware 2.0\u201d

                      • Agrawal, Gans, and Goldfarb\u2019s \u201cThe Economics of AI\u201d

                      • Chip Huyen\u2019s \u201cIntroduction to Machine Learning Systems Design\u201d

                      • Apple\u2019s \u201cHuman-Interface Guidelines for Machine Learning\u201d

                      • Google\u2019s \u201cRules of Machine Learning\u201d

                      "},{"location":"spring2021/lecture-6/","title":"Lecture 6: MLOps Infrastructure & Tooling","text":""},{"location":"spring2021/lecture-6/#video","title":"Video","text":""},{"location":"spring2021/lecture-6/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-6/#notes","title":"Notes","text":"

                      Lecture by Sergey Karayev. Notes transcribed by James Le and Vishnu Rachakonda.

                      "},{"location":"spring2021/lecture-6/#1-dream-vs-reality-for-ml-practitioners","title":"1 - Dream vs. Reality for ML Practitioners","text":"

                      The dream of ML practitioners is that we are provided the data, and somehow we build an optimal machine learning prediction system available as a scalable API or an edge deployment. That deployment then generates more data for us, which can be used to improve our system.

                      The reality is that you will have to:

                      • Aggregate, process, clean, label, and version the data
                      • Write and debug model code
                      • Provision compute
                      • Run many experiments and review the results
                      • Discover that you did something wrong or maybe try a different architecture -> Write more code and provision more compute
                      • Deploy the model when you are happy
                      • Monitor the predictions that the model makes on production data so that you can gather some good examples and feed them back to the initial data flywheel loop

                      For example, the slide above is from Andrej Karpathy\u2019s talk at PyTorch Devcon 2019 discussing Tesla\u2019s self-driving system. Their dream is to build a system that goes from the data gathered through their training, evaluation, and inference processes and gets deployed on the cars. As people drive, more data will be collected and added back to the training set. As this process repeats, Tesla\u2019s ML engineers can all go on vacation :)

                      The picture above (from the famous Google paper \u201cMachine Learning: The High-Interest Credit Card of Technical Debt\u201d) shows that the ML code portion in a real-world ML system is a lot smaller than the infrastructure needed for its support. As ML projects move from small-scale research experiments to large-scale industry deployments, your organization most likely will require a massive amount of infrastructure to support large inferences, distributed training, data processing pipelines, reproducible experiments, model monitoring, etc.

                      "},{"location":"spring2021/lecture-6/#2-three-buckets-of-tooling-landscape","title":"2 - Three Buckets of Tooling Landscape","text":"

                      We can break down the landscape of all this necessary infrastructure into three buckets: data, training/evaluation, and deployment.

                      • The data bucket includes the data sources, data lakes/warehouses, data processing, data exploration, data versioning, and data labeling.
                      • The training/evaluation bucket includes compute sources, resource management, software engineering, frameworks and distributed training libraries, experiment management, and hyper-parameter tuning.
                      • The deployment bucket includes continuous integration and testing, edge deployment, web deployment, monitoring, and feature store.

                      There are also several vendors offering \u201call-in-one\u201d MLOps solutions that cover all three buckets. This lecture focuses on the training/evaluation bucket.

                      "},{"location":"spring2021/lecture-6/#3-software-engineering","title":"3 - Software Engineering","text":"

                      When it comes to writing deep learning code, Python is the clear programming language of choice. As a general-purpose language, Python is easy to learn and easily accessible, enabling you to find skilled developers on a faster basis. It has various scientific libraries for data wrangling and machine learning (Pandas, NumPy, Scikit-Learn, etc.). Regardless of whether your engineering colleagues write code in a lower-level language like C, C++, or Java, it is generally neat to join different components with a Python wrapper.

                      When choosing your IDEs, there are many options out there (Vim, Emacs, Sublime Text, Jupyter, VS Code, PyCharm, Atom, etc.). Each of these has its uses in any application, and you\u2019re better to switch between them to remain agile without relying heavily on shortcuts and packages. It also helps teams work better if they can jump into different IDEs and comment/collaborate with other colleagues. In particular, Visual Studio Code makes for a very nice Python experience, where you have access to built-in git staging and diffing, peek at documentation, linter code as you write, and open projects remotely.

                      Jupyter Notebooks have rapidly grown in popularity among data scientists to become the standard for quick prototyping and exploratory analysis. For example, Netflix based all of their machine learning workflows on them, effectively building a whole notebook infrastructure to leverage them as a unifying layer for scheduling workflows. Jeremy Howard develops his fast.ai codebase entirely with notebooks and introduces a project called nbdev that shows people how to develop well-tested code in a notebook environment.

                      However, there are many problems with using notebooks as a last resort when working in teams that aim to build machine/deep learning products. Alexander Mueller's blog post outlines the five reasons why they suck:

                      • It is challenging to enable good code versioning because notebooks are big JSON files that cannot be merged automatically.
                      • Notebook \u201cIDE\u201d is primitive, as they have no integration, no lifting, and no code-style correction. Data scientists are not software engineers, and thus, tools that govern their code quality and help improve it are very important.
                      • It is very hard to structure code reasonably, put code into functions, and develop tests while working in notebooks. You better develop Python scripts based on test-driven development principles as soon as you want to reproduce some experiments and run notebooks frequently.
                      • Notebooks have out-of-order execution artifacts, meaning that you can easily destroy your current working state when jumping between cells of notebooks.
                      • It is also difficult to run long or distributed tasks. If you want to handle big datasets, better pull your code out of notebooks, start a Python folder, create fixtures, write tests, and then deploy your application to a cluster.

                      Recently, a new application framework called Streamlit was introduced. The creators of the framework wanted machine learning engineers to be able to create beautiful apps without needing a tools team; in other words, these internal tools should arise as a natural byproduct of the machine learning workflow. According to the launch blog post, here are the core principles of Streamlit:

                      • Embrace Python scripting: Streamlit apps are just scripts that run from top to bottom. There\u2019s no hidden state. You can factor your code with function calls. If you know how to write Python scripts, you can write Streamlit apps.
                      • Treat widgets as variables: There are no callbacks in Streamlit. Every interaction simply reruns the script from top to bottom. This approach leads to a clean codebase.
                      • Reuse data and computation: Streamlit introduces a cache primitive that behaves like a persistent, immutable-by-default data store that lets Streamlit apps safely and effortlessly reuse information.

                      Right now, Streamlit is building features that enable sharing machine learning projects to be as easy as pushing a web app to Heroku.

                      We recommend using conda to set up your Python and CUDA environments and pip-tools to separate mutually compatible versions of all requirements for our lab.

                      "},{"location":"spring2021/lecture-6/#4-compute-hardware","title":"4 - Compute Hardware","text":"

                      We can break down the compute needs into an early-stage development step and a late-stage training/evaluation step.

                      • During the development stage, we write code, debug models, and look at the results. It\u2019d be nice to be able to compile and train models via an intuitive GUI quickly.
                      • During the training/evaluation stage, we design model architecture, search for hyper-parameters, and train large models. It\u2019d be nice to launch experiments and review results easily.

                      Compute matters with each passing year due to the fact that the results came out of deep learning are using more and more compute (check out this 2018 report from OpenAI). Looking at recent Transformer models, while OpenAI\u2019s GPT-3 has not been fully commercialized yet, Google already released the Switch Transformer with orders of magnitude larger in the number of parameters.

                      So should you get your own hardware, go straight to the cloud, or use on-premise options?

                      "},{"location":"spring2021/lecture-6/#gpu-basics","title":"GPU Basics","text":"

                      This is basically an NVIDIA game, as they are the only provider of good deep learning GPUs. However, Google\u2019s TPUs are the fastest, which is available only on GCP.

                      There is a new NVIDIA architecture every year: Kepler -> Pascal -> Volta -> Turing -> Ampere. NVIDIA often released the server version of the cards first, then the \u201centhusiast\u201d version, and finally the consumer version. If you use these cards for business purposes, then you suppose to use the server version.

                      GPUs have a different amount of RAM. You can only compute on the data that is on the GPU memory. The more data you can fit on the GPU, the larger your batches are, the faster your training goes.

                      For deep learning, you use 32-bit precision. In fact, starting with the Volta architecture, NVIDIA developed tensor cores that are specifically designed for deep learning operations (mixed-precision between 32 and 16 bit). Tensor Cores reduce the used cycles needed for calculating multiply and addition operations and the reliance on repetitive shared memory access, thus saving additional cycles for memory access. This is very useful for the convolutional/Transformer models that are prevalent nowadays.

                      Let\u2019s go through different GPU architectures:

                      • Kepler/Maxwell: They are 2-4x slower than the Pascal/Volta ones below. You should not buy these old guards (K80).
                      • Pascal: They are in the 1080 Ti cards from 2017, which are still useful if bought used (especially for recurrent neural networks). P100 is the equivalent cloud offering.
                      • Volta/Turing: These are the preferred choices over the Kepler and Pascal because of their support for 16-bit mixed-precision via tensor cores. Hardware options are 2080 Ti and Titan RTX, while the cloud option is V100.
                      • Ampere: This architecture is available in the latest hardware (3090) and cloud (A100) offerings. They have the most tensor cores, leading to at least 30% speedup over Turing.

                      You can check out this recent GPU benchmark from Lambda Labs and consult Tim Dettmers\u2019 advice on which GPUs to get.

                      "},{"location":"spring2021/lecture-6/#cloud-options","title":"Cloud Options","text":"

                      Amazon Web Services, Google Cloud Platform, and Microsoft Azure are the cloud heavyweights with largely similar functions and prices. There are also startups like Lambda Labs and Corewave that provide cloud GPUs.

                      "},{"location":"spring2021/lecture-6/#on-prem-options","title":"On-Prem Options","text":"

                      You can either build your own or buy pre-built devices from vendors like Lambda Labs, NVIDIA, Supermicro, Cirrascale, etc.

                      "},{"location":"spring2021/lecture-6/#recommendations","title":"Recommendations","text":"

                      Even though the cloud is expensive, it\u2019s hard to make on-prem devices scale past a certain point. Furthermore, dev-ops things are easier to be done in the cloud than to be set up by yourself. And if your machine dies or requires maintenance, that will be a constant headache if you are responsible for managing it.

                      Here are our recommendations for three profiles:

                      • Hobbyists: Build your own machine (maybe a 4x Turing or a 2x Ampere PC) during development. Either use the same PC or use cloud instances during training/evaluation.
                      • Startups: Buy a sizeable Lambda Labs machine for every ML scientist during development. Buy more shared server machines or use cloud instances during training/evaluation.
                      • Larger companies: Buy an even more powerful machine for every ML scientist during development. Use cloud with fast instances with proper provisioning and handling of failures during training/evaluation.
                      "},{"location":"spring2021/lecture-6/#5-resource-management","title":"5 - Resource Management","text":"

                      With all the resources we have discussed (compute, dependencies, etc.), our challenge turns to manage them across the specific use cases we may have. Across all the resources, our goal is always to be able to easily experiment with the necessary resources to achieve the desired application of ML for our product.

                      For this challenge of allocating resources to experimenting users, there are some common solutions:

                      1. Script a solution ourselves: In theory, this is the simplest solution. We can check if a resource is free and then lock it if a particular user is using it or wants to.
                      2. SLURM: If we don't want to write the script entirely ourselves, standard cluster job schedulers like SLURM can help us. The workflow is as follows: First, a script defines a job\u2019s requirements. Then, the SLURM queue runner analyzes this and then executes the jobs on the correct resource.
                      3. Docker/Kubernetes: The above approach might still be too manual for your needs, in which case you can turn to Docker/Kubernetes. Docker packages the dependency stack into a lighter-than-VM package called a container (that excludes the OS). Kubernetes lets us run these Docker containers on a cluster. In particular, Kubeflow is an OSS project started by Google that allows you to spawn/manage Jupyter notebooks and manage multi-step workflows. It also has lots of plug-ins for extra processes like hyperparameter tuning and model deployment. However, Kubeflow can be a challenge to setup.
                      4. Custom ML software: There\u2019s a lot of novel work and all-in-one solutions being developed to provision compute resources for ML development efficiently. Platforms like AWS Sagemaker, Paperspace Gradient, and Determined AI are advancing. Newer startups like Anyscale and Grid.AI (creators of PyTorch Lightning) are also tackling this. Their vision is around allowing you to seamlessly go from training models on your computer to running lots of training jobs in the cloud with a simple set of SDK commands.
                      "},{"location":"spring2021/lecture-6/#6-frameworks-and-distributed-training","title":"6 - Frameworks and Distributed Training","text":""},{"location":"spring2021/lecture-6/#deep-learning-frameworks","title":"Deep Learning Frameworks","text":"

                      If you\u2019ve built a deep learning model in the last few years, you\u2019ve probably used a deep learning framework. Frameworks like TensorFlow have crucially shaped the development of the deep learning revolution. The reality is that deep learning frameworks have existed for a while. Projects like Theano and Torch have been around for 10+ years. In contemporary use, there are three main frameworks we\u2019ll focus on - TensorFlow, Keras, and PyTorch. We evaluate frameworks based on their utility for production and development.

                      When TensorFlow came out in 2015, it was billed heavily as a production-optimized DL framework with an underlying static optimized graph that could be deployed across compute environments. However, TF 1.0 had a pretty unpleasant development experience; in addition to developing your models, you had to consider the underlying execution graph you were describing. This kind of \u201cmeta-development\u201d posed a challenge for newcomers. The Keras project solved many of these issues by offering a simpler way to define models, and eventually became a part of TensorFlow. PyTorch, when it was introduced in 2017, offered a polar opposite to TensorFlow. It made development super easy by consisting almost exclusively of simple Python commands, but was not designed to be fast at scale.

                      Using TF/Keras or PyTorch is the current recommended way to build deep learning models unless you have a powerful reason not to. Essentially, both have converged to pretty similar points that balance development and production. TensorFlow adopted eager execution by default and became a lot easier to develop quickly in. PyTorch subsumed Caffe2 and became much faster as a result, specifically by adding the ability to compile speedier model artifacts. Nowadays, PyTorch has a lot of momentum, likely due to its ease of development. Newer projects like fast.ai and PyTorch Lighting add best practices and additional functionality to PyTorch, making it even more popular. According to this 2018 article on The Gradient, more than 80% of submissions are in PyTorch in academic projects.

                      All these frameworks may seem like excessive quibbling, especially since PyTorch and TensorFlow have converged in important ways. Why do we even require such extensive frameworks?

                      It\u2019s theoretically possible to define entire models and their required matrix math (e.g., a CNN) in NumPy, the classic Python numerical computing library. However, we quickly run into two challenges: back-propagating errors through our model and running the code on GPUs, which are powerful computation accelerators. For these issues to be addressed, we need frameworks to help us with auto-differentiation, an efficient way of computing the gradients, and software compatibility with GPUs, specifically interfacing with CUDA. Frameworks allow us to abstract the work required to achieve both features, while also layering in valuable abstractions for all the latest layer designs, optimizers, losses, and much more. As you can imagine, the abstractions offered by frameworks save us valuable time on getting our model to run and allow us to focus on optimizing our model.

                      New projects like JAX and HuggingFace offer different or simpler abstractions. JAX focuses primarily on fast numerical computation with autodiff and GPUs across machine learning use cases (not just deep learning). HuggingFace abstracts entire model architectures in the NLP realm. Instead of loading individual layers, HuggingFace lets you load the entirety of a contemporary mode (along with weights)l like BERT, tremendously speeding up development time. HuggingFace works on both PyTorch and TensorFlow.

                      "},{"location":"spring2021/lecture-6/#distributed-training","title":"Distributed Training","text":"

                      Distributed training is a hot topic as the datasets and the models we train become too large to work on a single GPU. It\u2019s increasingly a must-do. The important thing to note is that distributed training is a process to conduct a single model training process; don\u2019t confuse it with training multiple models on different GPUs. There are two approaches to distributed training: data parallelism and model parallelism.

                      "},{"location":"spring2021/lecture-6/#data-parallelism","title":"Data Parallelism","text":"

                      Data parallelism is quite simple but powerful. If we have a batch size of X samples, which is too large for one GPU, we can split the X samples evenly across N GPUs. Each GPU calculates the gradients and passes them to a central node (either a GPU or a CPU), where the gradients are averaged and backpropagated through the distributed GPUs. This paradigm generally results in a linear speed-up time (e.g., two distributed GPUs results in a ~2X speed-up in training time). In modern frameworks like PyTorch, PyTorch Lightning, and even in schedulers like SLURM, data-parallel training can be achieved simply by specifying the number of GPUs or calling a data parallelism-enabling object (e.g., torch.nn.DataParallel). Other tools like Horovod (from Uber) use non-framework-specific ways of enabling data parallelism (e.g., MPI, a standard multiprocessing framework). Ray, the original open-source project from the Anyscale team, was designed to enable general distributed computing applications in Python and can be similarly applied to data-parallel distributed training.

                      "},{"location":"spring2021/lecture-6/#model-parallelism","title":"Model Parallelism","text":"

                      Model parallelism is a lot more complicated. If you can\u2019t fit your entire model\u2019s weights on a single GPU, you can split the weights across GPUs and pass data through each to train the weights. This usually adds a lot of complexity and should be avoided unless absolutely necessary. A better solution is to pony up for the best GPU available, either locally or in the cloud. You can also use gradient checkpointing, a clever trick wherein you write some gradients to disk as you compute them and load them only as you need them for updates.\u00a0 New work is coming out to make this easier (e.g., research and framework maturity).

                      "},{"location":"spring2021/lecture-6/#7-experiment-management","title":"7 - Experiment Management","text":"

                      As you run numerous experiments to refine your model, it\u2019s easy to lose track of code, hyperparameters, and artifacts. Model iteration can lead to lots of complexity and messiness. For example, you could be monitoring the learning rate\u2019s impact on your model\u2019s performance metric. With multiple model runs, how will you monitor the impact of the hyperparameter?

                      A low-tech way would be to manually track the results of all model runs in a spreadsheet. Without great attention to detail, this can quickly spiral into a messy or incomplete artifact. Dedicated experiment management platforms are a remedy to this issue. Let\u2019s cover a few of the most common ones:

                      • TensorBoard: This is the default experiment tracking platform that comes with TensorFlow. As a pro, it\u2019s easy to get started with. On the flip side, it\u2019s not very good for tracking and comparing multiple experiments. It\u2019s also not the best solution to store past work.
                      • MLFlow: An OSS project from Databricks, MLFlow is a complete platform for the ML lifecycle. They have great experiment and model run management at the core of their platform. Another open-source project, Keepsake, recently came out focused solely on experiment tracking.
                      • Paid platforms (Comet.ml, Weights and Biases, Neptune): Finally, outside vendors offer deep, thorough experiment management platforms, with tools like code diffs, report writing, data visualization, and model registering features. In our labs, we will use Weights and Biases.
                      "},{"location":"spring2021/lecture-6/#8-hyperparameter-tuning","title":"8 - Hyperparameter Tuning","text":"

                      To finalize models, we need to ensure that we have the optimal hyperparameters. Since hyperparameter optimization (as this process is called) can be a particularly compute-intensive process, it\u2019s useful to have software that can help. Using specific software can help us kill underperforming model runs with bad hyperparameters early (to save on cost) or help us intelligently sweep ranges of hyperparameter values. Luckily, there\u2019s an increasing number of software providers that do precisely this:

                      • SigOpt offers an API focused exclusively on efficient, iterative hyperparameter optimization. Specify a range of values, get SigOpt\u2019s recommended hyperparameter settings, run the model and return the results to SigOpt, and repeat the process until you\u2019ve found the best parameters for your model.
                      • Rather than an API, Ray Tune offers a local software (part of the broader Ray ecosystem) that integrates hyperparameter optimization with compute resource allocation. Jobs are scheduled with specific hyperparameters according to state-of-the-art methods, and underperforming jobs are automatically killed.
                      • Weights and Biases also has this feature! With a YAML file specification, we can specify a hyperparameter optimization job and perform a \u201csweep,\u201d during which W&B sends parameter settings to individual \u201cagents\u201d (our machines) and compares performance.
                      "},{"location":"spring2021/lecture-6/#9-all-in-one-solutions","title":"9 - \u201cAll-In-One\u201d Solutions","text":"

                      Some platforms integrate all the aspects of the applied ML stack we\u2019ve discussed (experiment tracking, optimization, training, etc.) and wrap them into a single experience. To support the \u201clifecycle,\u201d these platforms typically include:

                      • Labeling and data querying services
                      • Model training, especially though job scaling and scheduling
                      • Experiment tracking and model versioning
                      • Development environments, typically through notebook-style interfaces
                      • Model deployment (e.g., via REST APIs) and monitoring

                      One of the earliest examples of such a system is Facebook\u2019s FBLearner (2016), which encompassed data and feature storage, training, inference, and continuous learning based on user interactions with the model\u2019s outputs. You can imagine how powerful having one hub for all this activity can be for ML application and development speed. As a result, cloud vendors (Google, AWS, Azure) have developed similar all-in-one platforms, like Google Cloud AI Platform and AWS SageMaker. Startups like Paperspace Gradient, Neptune, and FloydHub also offer all-in-one platforms focused on deep learning. Determined AI, which focuses exclusively on the model development and training part of the lifecycle, is the rare open-source platform in this space. Domino Data Lab is a traditional ML-focused startup with an extensive feature set worth looking at. It\u2019s natural to expect more MLOps (as this kind of tooling and infra is referred to) companies and vendors to build out their feature set and become platform-oriented; Weights and Biases is a good example of this.

                      In conclusion, take a look at the below table to compare a select number of MLOps platform vendors. Pricing is quite variable.

                      Staying up to date across all the tooling can be a real challenge, but check out FSDL\u2019s Tooling Tuesdays on Twitter as a starting point!

                      "},{"location":"spring2021/lecture-7/","title":"Lecture 7: Troubleshooting Deep Neural Networks","text":""},{"location":"spring2021/lecture-7/#video","title":"Video","text":""},{"location":"spring2021/lecture-7/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-7/#notes","title":"Notes","text":"

                      Lecture by Josh Tobin. Notes transcribed by James Le and Vishnu Rachakonda.

                      In traditional software engineering, a bug usually leads to the program crashing. While this is annoying for the user, it is critical for the developer to inspect the errors to understand why. With deep learning, we sometimes encounter errors, but all too often, the program crashes without a clear reason why. While these issues can be debugged manually, deep learning models most often fail because of poor output predictions. What\u2019s worse is that when the model performance is low, there is usually no signal about why or when the models failed.

                      A common sentiment among practitioners is that they spend 80\u201390% of time debugging and tuning the models and only 10\u201320% of time deriving math equations and implementing things. This is confirmed by Andrej Kaparthy, as seen in this tweet.

                      "},{"location":"spring2021/lecture-7/#1-why-is-deep-learning-troubleshooting-hard","title":"1 - Why Is Deep Learning Troubleshooting Hard?","text":"

                      Suppose you are trying to reproduce a research paper result for your work, but your results are worse. You might wonder why your model\u2019s performance is significantly worse than the paper that you\u2019re trying to reproduce?

                      Many different things can cause this:

                      • It can be implementation bugs. Most bugs in deep learning are actually invisible.

                      • Hyper-parameter choices can also cause your performance to degrade. Deep learning models are very sensitive to hyper-parameters. Even very subtle choices of learning rate and weight initialization can make a big difference.

                      • Performance can also be worse just because of data/model fit. For example, you pre-train your model on ImageNet data and fit it on self-driving car images, which are harder to learn.

                      • Finally, poor model performance could be caused not by your model but your dataset construction. Typical issues here include not having enough examples, dealing with noisy labels and imbalanced classes, splitting train and test set with different distributions.

                      "},{"location":"spring2021/lecture-7/#2-strategy-to-debug-neural-networks","title":"2 - Strategy to Debug Neural Networks","text":"

                      The key idea of deep learning troubleshooting is: Since it is hard to disambiguate errors, it\u2019s best to start simple and gradually ramp up complexity.

                      This lecture provides a decision tree for debugging deep learning models and improving performance. This guide assumes that you already have an initial test dataset, a single metric to improve, and target performance based on human-level performance, published results, previous baselines, etc.

                      "},{"location":"spring2021/lecture-7/#3-start-simple","title":"3 - Start Simple","text":"

                      The first step is the troubleshooting workflow is starting simple.

                      "},{"location":"spring2021/lecture-7/#choose-a-simple-architecture","title":"Choose A Simple Architecture","text":"

                      There are a few things to consider when you want to start simple. The first is how to choose a simple architecture. These are architectures that are easy to implement and are likely to get you part of the way towards solving your problem without introducing as many bugs.

                      Architecture selection is one of the many intimidating parts of getting into deep learning because there are tons of papers coming out all-the-time and claiming to be state-of-the-art on some problems. They get very complicated fast. In the limit, if you\u2019re trying to get to maximal performance, then architecture selection is challenging. But when starting on a new problem, you can just solve a simple set of rules that will allow you to pick an architecture that enables you to do a decent job on the problem you\u2019re working on.

                      • If your data looks like images, start with a LeNet-like architecture and consider using something like ResNet as your codebase gets more mature.

                      • If your data looks like sequences, start with an LSTM with one hidden layer and/or temporal/classical convolutions. Then, when your problem gets more mature, you can move to an Attention-based model or a WaveNet-like model.

                      • For all other tasks, start with a fully-connected neural network with one hidden layer and use more advanced networks later, depending on the problem.

                      In reality, many times, the input data contains multiple of those things above. So how to deal with multiple input modalities into a neural network? Here is the 3-step strategy that we recommend:

                      • First, map each of these modalities into a lower-dimensional feature space. In the example above, the images are passed through a ConvNet, and the words are passed through an LSTM.

                      • Then we flatten the outputs of those networks to get a single vector for each of the inputs that will go into the model. Then we concatenate those inputs.

                      • Finally, we pass them through some fully-connected layers to an output.

                      "},{"location":"spring2021/lecture-7/#use-sensible-defaults","title":"Use Sensible Defaults","text":"

                      After choosing a simple architecture, the next thing to do is to select sensible hyper-parameter defaults to start with. Here are the defaults that we recommend:

                      • Adam optimizer with a \u201cmagic\u201d learning rate value of 3e-4.

                      • ReLU activation for fully-connected and convolutional models and Tanh activation for LSTM models.

                      • He initialization for ReLU activation function and Glorot initialization for Tanh activation function.

                      • No regularization and data normalization.

                      "},{"location":"spring2021/lecture-7/#normalize-inputs","title":"Normalize Inputs","text":"

                      The next step is to normalize the input data, subtracting the mean and dividing by the variance. Note that for images, it\u2019s fine to scale values to [0, 1] or [-0.5, 0.5] (for example, by dividing by 255).

                      "},{"location":"spring2021/lecture-7/#simplify-the-problem","title":"Simplify The Problem","text":"

                      The final thing you should do is consider simplifying the problem itself. If you have a complicated problem with massive data and tons of classes to deal with, then you should consider:

                      • Working with a small training set around 10,000 examples.

                      • Using a fixed number of objects, classes, input size, etc.

                      • Creating a simpler synthetic training set like in research labs.

                      This is important because (1) you will have reasonable confidence that your model should be able to solve, and (2) your iteration speed will increase.

                      The diagram below neatly summarizes how to start simple:

                      "},{"location":"spring2021/lecture-7/#4-implement-and-debug","title":"4 - Implement and Debug","text":"

                      To give you a preview, below are the five most common bugs in deep learning models that we recognize:

                      • Incorrect shapes for the network tensors: This bug is a common one and can fail silently. This happens many times because the automatic differentiation systems in the deep learning framework do silent broadcasting. Tensors become different shapes in the network and can cause a lot of problems.

                      • Pre-processing inputs incorrectly: For example, you forget to normalize your inputs or apply too much input pre-processing (over-normalization and excessive data augmentation).

                      • Incorrect input to the model\u2019s loss function: For example, you use softmax outputs to a loss that expects logits.

                      • Forgot to set up train mode for the network correctly: For example, toggling train/evaluation mode or controlling batch norm dependencies.

                      • Numerical instability: For example, you get `inf` or `NaN` as outputs. This bug often stems from using an exponent, a log, or a division operation somewhere in the code.

                      Here are three pieces of general advice for implementing your model:

                      • Start with a lightweight implementation. You want minimum possible new lines of code for the 1st version of your model. The rule of thumb is less than 200 lines. This doesn\u2019t count tested infrastructure components or TensorFlow/PyTorch code.

                      • Use off-the-shelf components such as Keras if possible, since most of the stuff in Keras works well out-of-the-box. If you have to use TensorFlow, use the built-in functions, don\u2019t do the math yourself. This would help you avoid a lot of numerical instability issues.

                      • Build complicated data pipelines later. These are important for large-scale ML systems, but you should not start with them because data pipelines themselves can be a big source of bugs. Just start with a dataset that you can load into memory.

                      "},{"location":"spring2021/lecture-7/#get-your-model-to-run","title":"Get Your Model To Run","text":"

                      The first step of implementing bug-free deep learning models is getting your model to run at all. There are a few things that can prevent this from happening:

                      • Shape mismatch/casting issue: To address this type of problem, you should step through your model creation and inference step-by-step in a debugger, checking for correct shapes and data types of your tensors.

                      • Out-of-memory issues: This can be very difficult to debug. You can scale back your memory-intensive operations one-by-one. For example, if you create large matrices anywhere in your code, you can reduce the size of their dimensions or cut your batch size in half.

                      • Other issues: You can simply Google it. Stack Overflow would be great most of the time.

                      Let\u2019s zoom in on the process of stepping through model creation in a debugger and talk about debuggers for deep learning code:

                      • In PyTorch, you can use ipdb\u200a\u2014\u200awhich exports functions to access the interactive IPython debugger.

                      • In TensorFlow, it\u2019s trickier. TensorFlow separates the process of creating the graph and executing operations in the graph. There are three options you can try: (1) step through the graph creation itself and inspect each tensor layer, (2) step into the training loop and evaluate the tensor layers, or (3) use TensorFlow Debugger (tfdb), which does option 1 and 2 automatically.

                      "},{"location":"spring2021/lecture-7/#overfit-a-single-batch","title":"Overfit A Single Batch","text":"

                      After getting your model to run, the next thing you need to do is to overfit a single batch of data. This is a heuristic that can catch an absurd number of bugs. This really means that you want to drive your training error arbitrarily close to 0.

                      There are a few things that can happen when you try to overfit a single batch and it fails:

                      • Error goes up: Commonly, this is due to a flip sign somewhere in the loss function/gradient.

                      • Error explodes: This is usually a numerical issue but can also be caused by a high learning rate.

                      • Error oscillates: You can lower the learning rate and inspect the data for shuffled labels or incorrect data augmentation.

                      • Error plateaus: You can increase the learning rate and get rid of regulation. Then you can inspect the loss function and the data pipeline for correctness.

                      "},{"location":"spring2021/lecture-7/#compare-to-a-known-result","title":"Compare To A Known Result","text":"

                      Once your model overfits in a single batch, there can still be some other issues that cause bugs. The last step here is to compare your results to a known result. So what sort of known results are useful?

                      • The most useful results come from an official model implementation evaluated on a similar dataset to yours. You can step through the code in both models line-by-line and ensure your model has the same output. You want to ensure that your model performance is up to par with expectations.

                      • If you can\u2019t find an official implementation on a similar dataset, you can compare your approach to results from an official model implementation evaluated on a benchmark dataset. You most definitely want to walk through the code line-by-line and ensure you have the same output.

                      • If there is no official implementation of your approach, you can compare it to results from an unofficial model implementation. You can review the code the same as before but with lower confidence (because almost all the unofficial implementations on GitHub have bugs).

                      • Then, you can compare to results from a paper with no code (to ensure that your performance is up to par with expectations), results from your model on a benchmark dataset (to make sure your model performs well in a simpler setting), and results from a similar model on a similar dataset (to help you get a general sense of what kind of performance can be expected).

                      • An under-rated source of results comes from simple baselines (for example, the average of outputs or linear regression), which can help make sure that your model is learning anything at all.

                      The diagram below neatly summarizes how to implement and debug deep neural networks:

                      "},{"location":"spring2021/lecture-7/#5-evaluate","title":"5 - Evaluate","text":""},{"location":"spring2021/lecture-7/#bias-variance-decomposition","title":"Bias-Variance Decomposition","text":"

                      To evaluate models and prioritize the next steps in model development, we will apply the bias-variance decomposition. The bias-variance decomposition is the fundamental model fitting tradeoff. In our application, let\u2019s talk more specifically about the formula for bias-variance tradeoff with respect to the test error; this will help us apply the concept more directly to our model\u2019s performance. There are four terms in the formula for test error:

                      Test error = irreducible error + bias + variance + validation overfitting

                      1. Irreducible error is the baseline error you don\u2019t expect your model to do better. It can be estimated through strong baselines, like human performance.

                      2. Avoidable bias, a measure of underfitting, is the difference between our train error and irreducible error.

                      3. Variance, a measure of overfitting, is the difference between validation error and training error.

                      4. Validation set overfitting is the difference between test error and validation error.

                      Consider the chart of learning curves and errors below. Using the test error formula for bias and variance, we can calculate each component of test error and make decisions based on the value. For example, our avoidable bias is rather low (only 2 points), while the variance is much higher (5 points). With this knowledge, we should prioritize methods of preventing overfitting, like regularization.

                      "},{"location":"spring2021/lecture-7/#distribution-shift","title":"Distribution Shift","text":"

                      Clearly, the application of the bias-variance decomposition to the test error has already helped prioritize our next steps for model development. However, until now, we\u2019ve assumed that the samples (training, validation, testing) all come from the same distribution. What if this isn\u2019t the case? In practical ML situations, this distribution shift often cars. In building self-driving cars, a frequent occurrence might be training with samples from one distribution (e.g., daytime driving video) but testing or inferring on samples from a totally different distribution (e.g., night time driving).

                      A simple way of handling this wrinkle in our assumption is to create two validation sets: one from the training distribution and one from the test distribution. This can be helpful even with a very small testing set. If we apply this, we can actually estimate our distribution shift, which is the difference between testing validation error and testing error. This is really useful for practical applications of ML! With this new term, let\u2019s update our test error formula of bias and variance:

                      Test error = irreducible error + bias + variance + distribution shift + validation overfitting

                      "},{"location":"spring2021/lecture-7/#6-improve-model-and-data","title":"6 - Improve Model and Data","text":"

                      Using the updated formula from the last section, we\u2019ll be able to decide on and prioritize the right next steps for each iteration of a model. In particular, we\u2019ll follow a specific process (shown below).

                      "},{"location":"spring2021/lecture-7/#step-1-address-underfitting","title":"Step 1: Address Underfitting","text":"

                      We\u2019ll start by addressing underfitting (i.e., reducing bias). The first thing to try in this case is to make your model bigger (e.g., add layers, more units per layer). Next, consider regularization, which can prevent a tight fit to your data. Other options are error analysis, choosing a different model architecture (e.g., something more state of the art), tuning hyperparameters, or adding features. Some notes:

                      • Choosing different architectures, especially a SOTA one, can be very helpful but is also risky. Bugs are easily introduced in the implementation process.

                      • Adding features is uncommon in the deep learning paradigm (vs. traditional machine learning). We usually want the network to learn features of its own accord. If all else fails, it can be beneficial in a practical setting.

                      "},{"location":"spring2021/lecture-7/#step-2-address-overfitting","title":"Step 2: Address Overfitting","text":"

                      After addressing underfitting, move on to solving overfitting. Similarly, there\u2019s a recommended series of methods to try in order. Starting with collecting training data (if possible) is the soundest way to address overfitting, though it can be challenging in certain applications. Next, tactical improvements like normalization, data augmentation, and regularization can help. Following these steps, traditional defaults like tuning hyperparameters, choosing a different architecture, or error analysis are useful. Finally, if overfitting is rather intractable, there\u2019s a series of less recommended steps, such as early stopping, removing features, and reducing model size. Early stopping is a personal choice; the fast.ai community is a strong proponent.

                      "},{"location":"spring2021/lecture-7/#step-3-address-distribution-shift","title":"Step 3: Address Distribution Shift","text":"

                      After addressing underfitting and overfitting, If there\u2019s a difference between the error on our training validation set vs. our test validation set, we need to address the error caused by the distribution shift. This is a harder problem to solve, so there\u2019s less in our toolkit to apply.

                      Start by looking manually at the errors in the test-validation set. Compare the potential logic behind these errors to the performance in the train-validation set, and use the errors to guide further data collection. Essentially, reason about why your model may be suffering from distribution shift error. This is the most principled way to deal with distribution shift, though it\u2019s the most challenging way practically. If collecting more data to address these errors isn\u2019t possible, try synthesizing data. Additionally, you can try domain adaptation.

                      "},{"location":"spring2021/lecture-7/#error-analysis","title":"Error Analysis","text":"

                      Manually evaluating errors to understand model performance is generally a high-yield way of figuring out how to improve the model. Systematically performing this error analysis process and decomposing the error from different error types can help prioritize model improvements. For example, in a self-driving car use case with error types like hard-to-see pedestrians, reflections, and nighttime scenes, decomposing the error contribution of each and where it occurs (train-val vs. test-val) can give rise to a clear set of prioritized action items. See the table for an example of how this error analysis can be effectively structured.

                      "},{"location":"spring2021/lecture-7/#domain-adaptation","title":"Domain Adaptation","text":"

                      Domain adaptation is a class of techniques that train on a \u201csource\u201d distribution and generalize to another \u201ctarget\u201d using only unlabeled data or limited labeled data. You should use domain adaptation when access to labeled data from the test distribution is limited, but access to relatively similar data is plentiful.

                      There are a few different types of domain adaptation:

                      1. Supervised domain adaptation: In this case, we have limited data from the target domain to adapt to. Some example applications of the concept include fine-tuning a pre-trained model or adding target data to a training set.

                      2. Unsupervised domain adaptation: In this case, we have lots of unlabeled data from the target domain. Some techniques you might see are CORAL, domain confusion, and CycleGAN.

                      Practically speaking, supervised domain adaptation can work really well! Unsupervised domain adaptation has a little bit further to go.

                      "},{"location":"spring2021/lecture-7/#step-4-rebalance-datasets","title":"Step 4: Rebalance datasets","text":"

                      If the test-validation set performance starts to look considerably better than the test performance, you may have overfit the validation set. This commonly occurs with small validation sets or lots of hyperparameter training. If this occurs, resample the validation set from the test distribution and get a fresh estimate of the performance.

                      "},{"location":"spring2021/lecture-7/#7-tune-hyperparameters","title":"7 - Tune Hyperparameters","text":"

                      One of the core challenges in hyperparameter optimization is very basic: which hyperparameters should you tune? As we consider this fundamental question, let\u2019s keep the following in mind:

                      • Models are more sensitive to some hyperparameters than others. This means we should focus our efforts on the more impactful hyperparameters.

                      • However, which hyperparameters are most important depends heavily on our choice of model.

                      • Certain rules of thumbs can help guide our initial thinking.

                      • Sensitivity is always relative to default values; if you use good defaults, you might start in a good place!

                      See the following table for a ranked list of hyperparameters and their impact on the model:

                      "},{"location":"spring2021/lecture-7/#techniques-for-tuning-hyperparameter-optimization","title":"Techniques for Tuning Hyperparameter Optimization","text":"

                      Now that we know which hyperparameters make the most sense to tune (using rules of thumb), let\u2019s consider the various methods of actually tuning them:

                      1. Manual Hyperparameter Optimization. Colloquially referred to as Graduate Student Descent, this method works by taking a manual, detailed look at your algorithm, building intuition, and considering which hyperparameters would make the most difference. After figuring out these parameters, you train, evaluate, and guess a better hyperparameter value using your intuition for the algorithm and intelligence. While it may seem archaic, this method combines well with other methods (e.g., setting a range of values for hyperparameters) and has the main benefit of reducing computation time and cost if used skillfully. It can be time-consuming and challenging, but it can be a good starting point.

                      2. Grid Search. Imagine each of your parameters plotted against each other on a grid, from which you uniformly sample values to test. For each point, you run a training run and evaluate performance. The advantages are that it\u2019s very simple and can often produce good results. However, it\u2019s quite inefficient, as you must run every combination of hyperparameters. It also often requires prior knowledge about the hyperparameters since we must manually set the range of values.

                      3. Random Search: This method is recommended over grid search. Rather than sampling from the grid of values for the hyperparameter evenly, we\u2019ll choose n points sampled randomly across the grid. Empirically, this method produces better results than grid search. However, the results can be somewhat uninterpretable, with unexpected values in certain hyperparameters returned.

                      4. Coarse-to-fine Search: Rather than running entirely random runs, we can gradually narrow in on the best hyperparameters through this method. Initially, start by defining a very large range to run a randomized search on. Within the pool of results, you can find N best results and hone in on the hyperparameter values used to generate those samples. As you iteratively perform this method, you can get excellent performance. This doesn\u2019t remove the manual component, as you have to select which range to continuously narrow your search to, but it\u2019s perhaps the most popular method available.

                      5. Bayesian Hyperparameter Optimization: This is a reasonably sophisticated method, which you can read more about here and here. At a high level, start with a prior estimate of parameter distributions. Subsequently, maintain a probabilistic model of the relationship between hyperparameter values and model performance. As you maintain this model, you toggle between training with hyperparameter values that maximize the expected improvement (per the model) and use training results to update the initial probabilistic model and its expectations. This is a great, hands-off, efficient method to choose hyperparameters. However, these techniques can be quite challenging to implement from scratch. As libraries and infrastructure mature, the integration of these methods into training will become easier.

                      In summary, you should probably start with coarse-to-fine random searches and move to Bayesian methods as your codebase matures and you\u2019re more certain of your model.

                      "},{"location":"spring2021/lecture-7/#8-conclusion","title":"8 - Conclusion","text":"

                      To wrap up this lecture, deep learning troubleshooting and debugging is really hard. It\u2019s difficult to tell if you have a bug because there are many possible sources for the same degradation in performance. Furthermore, the results can be sensitive to small changes in hyper-parameters and dataset makeup.

                      To train bug-free deep learning models, we need to treat building them as an iterative process. If you skipped to the end, the following steps can make this process easier and catch errors as early as possible:

                      • Start Simple: Choose the simplest model and data possible.

                      • Implement and Debug: Once the model runs, overfit a single batch and reproduce a known result.

                      • Evaluate: Apply the bias-variance decomposition to decide what to do next.

                      • Tune Hyper-parameters: Use coarse-to-fine random searches to tune the model\u2019s hyper-parameters.

                      • Improve Model and Data: Make your model bigger if your model under-fits and add more data and/or regularization if your model over-fits.

                      Here are additional resources that you can go to learn more:

                      • Andrew Ng\u2019s \u201cMachine Learning Yearning\u201d book.

                      • This Twitter thread from Andrej Karpathy.

                      • BYU\u2019s \u201cPractical Advice for Building Deep Neural Networks\u201d blog post.

                      "},{"location":"spring2021/lecture-8/","title":"Lecture 8: Data Management","text":""},{"location":"spring2021/lecture-8/#video","title":"Video","text":""},{"location":"spring2021/lecture-8/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-8/#notes","title":"Notes","text":"

                      Lecture by Sergey Karayev. Notes transcribed by James Le and Vishnu Rachakonda.

                      One of the best data science articles written in 2019 is \u201cData science is different now\u201d by Vicki Boykis. Part of the article is a collection of tweets from other data science and machine learning practitioners.

                      "},{"location":"spring2021/lecture-8/#1-data-management-overview","title":"1 - Data Management Overview","text":"

                      When we think about what data management for deep learning entails, there might be many different data sources: images on S3, text files on a file system, logs spread across different machines, and maybe even records in a database. At some point, you need to get all of that data over to a local filesystem next to GPUs. The way you will get data over to that trainable format is different for every project and every company. For instance:

                      • Maybe you train your images on ImageNet, and all the images are just S3 URLs. Then, all you have to do is download them over to the local filesystem.

                      • Maybe you have a bunch of text files that you crawled yourself somewhere. You want to use Spark to process them on a cluster and Pandas data frame to analyze/select subsets that will be used in the local filesystem.

                      • Maybe you collect logs and records from your database into a data lake/warehouse (like Snowflake). Then, you process that output and convert them into a trainable format.

                      There are countless possibilities that we are not going to cover completely in this lecture, but here are the key points to remember:

                      • Let the data flow through you: You should spend 10x as much time as you want to on exploring the dataset.

                      • Data is the best way to improve your overall ML project performance: Instead of trying new architectures or kicking off the hyper-parameter search, adding more data and augmenting the existing dataset will often be the best bang to your buck.

                      • Keep It Simple Stupid: We will discuss complex pipelines and new terms, but it\u2019s important to not over-complicate things and make data management a rocket science.

                      "},{"location":"spring2021/lecture-8/#2-data-sources","title":"2 - Data Sources","text":"

                      So, where do the training data come from? Most deep learning applications require lots of labeled data (with exceptions in applications of reinforcement learning, GANs, and GPT-3). There are publicly available datasets that can serve as a starting point, but there is no competitive advantage of using them. In fact, most companies usually spend a lot of money and time labeling their own data.

                      "},{"location":"spring2021/lecture-8/#data-flywheel","title":"Data Flywheel","text":"

                      Data flywheel is an exciting concept: if you can get your models in front of the users, you can build your products in a mechanism that your users contribute good data back to you and improve the model predictions. This can enable rapid improvement after you get that v1 model out into the real world.

                      "},{"location":"spring2021/lecture-8/#semi-supervised-learning","title":"Semi-Supervised Learning","text":"

                      Semi-supervised learning is a relatively recent learning technique where the training data is autonomously (or automatically) labeled. It is still supervised learning, but the datasets do not need to be manually labeled by a human; but they can be labeled by finding and exploiting the relations (or correlations) between different input signals (that is, input coming from different sensor modalities).

                      A natural advantage and consequence of semi-supervised learning are that this technique can be performed in an online fashion (given that data can be gathered and labeled without human intervention) more easily (with respect to, e.g., supervised learning), where models can be updated or trained entirely from scratch. Therefore, semi-supervised learning should also be well suited for changing environments, changing data, and, in general, changing requirements.

                      For a text example, you can predict the future words from the past words, predict the beginning of a sentence from the end of a sentence, or predict the middle word of a sentence from the words surrounding it. You can even examine whether two sentences occur in the same paragraph in the same corpus of your training data. These are different ways to formulate the problem, where you don\u2019t need to label anything and simply use the data to supervise itself.

                      This technique also applies to vision. Facebook AI recently released a model called SEER trained on 1 billion random images from the Internet. Yet, SEER achieved state-of-the-art accuracy on the ImageNet top-1 prediction task.

                      If you\u2019re interested in learning more about semi-supervised learning, check out:

                      • Lilian Weng's \"Self-Supervised Learning\" post

                      • Facebook AI\u2019s \u201cSelf-Supervised Learning: The Dark Matter Of Intelligence\u201d post

                      • Facebook AI\u2019s VISSL library for the SEER algorithm

                      "},{"location":"spring2021/lecture-8/#data-augmentation","title":"Data Augmentation","text":"

                      Recent advances in deep learning models have been largely attributed to the quantity and diversity of data gathered in recent years. Data augmentation is a strategy that enables practitioners to significantly increase the diversity of data available for training models without actually collecting new data. Data augmentation techniques such as cropping, padding, and horizontal flipping are commonly used to train large neural networks. In fact, they are mostly required for training computer vision models. Both Keras and fast.ai provide functions that do this.

                      Data augmentation also applies to other types of data.

                      • For tabular data, you can delete some cells to simulate missing data.

                      • For text, there are no well-established techniques, but you can replace words with synonyms and change the order of things.

                      • For speech and video, you can change speed, insert a pause, mix different sequences, and more.

                      If you\u2019re interested in learning more about data augmentation, check out:

                      • Berkeley AI\u2019s \u201c1000x Faster Data Augmentation\u201d post

                      • Edward Ma\u2019s \u201cnlpaug\u201d repository

                      "},{"location":"spring2021/lecture-8/#synthetic-data","title":"Synthetic Data","text":"

                      Related to the concept of data augmentation is synthetic data, an underrated idea that is almost always worth starting with. Synthetic data is data that\u2019s generated programmatically. For example, photorealistic images of objects in arbitrary scenes can be rendered using video game engines or audio generated by a speech synthesis model from the known text. It\u2019s not unlike traditional data augmentation, where crops, flips, rotations, and distortions are used to increase the variety of data that models have to learn from. Synthetically generated data takes those same concepts even further.

                      Most of today\u2019s synthetic data is visual. Tools and techniques developed to create photorealistic graphics in movies and computer games are repurposed to create the training data needed for machine learning. Not only can these rendering engines produce arbitrary numbers of images, but they can also produce annotations too. Bounding boxes, segmentation masks, depth maps, and any other metadata is output right alongside pictures, making it simple to build pipelines that produce their own data.

                      Because samples are generated programmatically along with annotations, synthetic datasets are far cheaper to produce than traditional ones. That means we can create more data and iterate more often to produce better results. Need to add another class to your model? No problem. Need to add another key point to the annotation? Done. This is especially useful for applications in driving and robotics.

                      If you\u2019re interested in learning more about synthetic data, check out:

                      • Dropbox\u2019s \u201cCreating A Modern OCR Pipeline Using Computer Vision and Deep Learning\u201d post

                      • Andrew Moffat\u2019s \u201cmetabrite-receipt-tests\u201d repository

                      • Microsoft\u2019s AirSim simulator

                      • OpenAI\u2019s \u201cIngredients For Robotics Research\u201d post

                      "},{"location":"spring2021/lecture-8/#3-data-storage","title":"3 - Data Storage","text":"

                      Data storage requirements for AI vary widely according to the application and the source material. Datasets in intelligence, defense, medical, science, and geology frequently combine petabyte-scale storage volumes with individual file sizes in the gigabyte range. By contrast, data used in areas such as supply chain analytics and fraud detection are much smaller.

                      There are four building blocks in a data storage system:

                      1. The filesystem

                      2. The object storage

                      3. The database

                      4. The data lake or data warehouse

                      "},{"location":"spring2021/lecture-8/#filesystem","title":"Filesystem","text":"

                      The filesystem is the foundational layer of storage.

                      • Its fundamental unit is a \u201cfile\u201d\u200a\u2014\u200awhich can be text or binary, is not versioned, and is easily overwritten.

                      • A file system can be as simple as a locally mounted disk containing all the files you need.

                      • More advanced options include networked filesystems (NFS), which are accessible over the network by multiple machines, and distributed file systems (HDFS) which are stored and accessed over multiple machines.

                      The plots above display hard-drive speeds for SATA hard drive, SATA SSD, and NVMe SSD.

                      • The left plot shows the sustained throughput in MBps (how much information to copy a file): The latest iteration of hard drive technology (NVMe) is 6-10x more powerful than older iterations.

                      • The right plot shows the seek time in milliseconds (how long it takes to go to a file on disk): The NVMe is 25-30x faster than the old-school ones.

                      What format should we store data in?

                      • For binary data (images, audios, videos), just files are enough. In Tensorflow, you have the TFRecord format to batch binary files, which does not seem to be necessary with the NVMe hard drives.

                      • For large tabular and text data, you have two choices:

                        • HDF5 is powerful but bloated and declining.

                        • Parquet is widespread and recommended.

                        • Feather is an up-and-coming open-source option powered by Apache Arrow.

                      • Both Tensorflow and PyTorch provide their native dataset class interfaces (tf.data and PyTorch DataLoader).

                      "},{"location":"spring2021/lecture-8/#object-storage","title":"Object Storage","text":"

                      Object storage is an API over the filesystem that allows users to use a command on files (GET, PUT, DELETE) to a service without worrying where they are actually stored.

                      • Its fundamental unit is an \u201cobject,\u201d\u200a \u200awhich is usually binary (images, sound files\u2026).

                      • Object storage can be built with data versioning and data redundancy into the API.

                      • It is not as fast as local files but fast enough within the cloud.

                      "},{"location":"spring2021/lecture-8/#database","title":"Database","text":"

                      A database is a persistent, fast, scalable storage and retrieval of structured data.

                      • Its fundamental unit is a \u201crow\u201d (unique IDs, references to other rows, values in columns).

                      • Databases are also known for online transaction processing (OLTP). The mental model here is that everything is actually in memory, but the software ensures that everything is logged to disk and never lost.

                      • Databases are not built for binary data, so you must store the references (i.e., S3 URLs) instead.

                      Here are our recommendations:

                      • PostgreSQL is the right choice most of the time, thanks to the support of unstructured JSON.

                      • SQLite is perfectly good for small projects.

                      • \u201cNoSQL\u201d was a big craze in the 2010s (like MongoDB). However, they are not as fast as the relational database and also have consistency issues frequently.

                      • Redis is handy when you need a simple key-value store.

                      "},{"location":"spring2021/lecture-8/#data-warehouse","title":"Data Warehouse","text":"

                      A data warehouse is a structured aggregation of data for analysis, known as online analytical processing (OLAP).

                      Another acronym that you might have heard of is ETL (Extract, Transform, Load). The idea here is to extract data from data sources, transform the data into a common schema, and load the schema into the data warehouse. You can load the subset from the warehouse that you need and generate reports or run analytical queries. Well-known enterprise options in the market are Google BigQuery, Amazon Redshift, and Snowflake.

                      "},{"location":"spring2021/lecture-8/#sql-and-dataframes","title":"SQL and DataFrames","text":"

                      Most data solutions use SQL as the interface to the data, except for some (like Databricks) that use DataFrames. SQL is the standard interface for structured data. But in the Python ecosystem, Pandas is the main DataFrame. Our advice is to become fluent in both.

                      "},{"location":"spring2021/lecture-8/#data-lake","title":"Data Lake","text":"

                      A data lake is the unstructured aggregation of data from multiple sources (databases, logs, expensive data transformations). It operates under the concept of ELT (Extract, Load, Transform) by dumping everything in the lake and transforming the data for specific needs later.

                      "},{"location":"spring2021/lecture-8/#data-lakehouse","title":"Data \u201cLakehouse\u201d","text":"

                      The current trend in the field is to combine data warehouses and data lakes in the same suite. The Databricks Lakehouse Platform is both a warehouse and a lake, operated as an open-source project called Delta Lake. You can store both structured and unstructured data in the platform and use them for analytics workloads and machine learning engines.

                      "},{"location":"spring2021/lecture-8/#what-goes-where","title":"What Goes Where?","text":"
                      • Binary data (images, sound files, compressed texts) are stored as objects.

                      • Metadata (labels, user activity) is stored in a database.

                      • If we need features that are not obtainable from the database (logs), we would want to set up a data lake and a process to aggregate the data required.

                      • At training time, we need to copy the necessary data to the filesystem on a fast drive.

                      A lot is going on within the data management tooling and infrastructure. We recommend looking at a16z\u2019s \u201cEmerging Architectures For Modern Data Infrastructure\u201d article to get a broad look into this ecosystem.

                      A highly recommended resource is Martin Kleppmann\u2019s book \u201cDesigning Data-Intensive Applications,\u201d\u200a \u200awhich provides excellent coverage of tools and approaches to build reliable, scalable, and maintainable data storage systems.

                      "},{"location":"spring2021/lecture-8/#4-data-processing","title":"4 - Data Processing","text":""},{"location":"spring2021/lecture-8/#data-dependencies","title":"Data Dependencies","text":"

                      Let\u2019s look at a motivational example of training a photo popularity predictor every night. For each photo, the training data must include these components:

                      • Metadata (such as posting time, title, location) that is in the database.

                      • Some features of the user (such as how many times they logged in today) that need to be computed from logs.

                      • Outputs of photo classifiers (such as content, style) that can be obtained after running the classifiers.

                      The idea is that we have different sources of data, and they have different dependencies. The big hurdle here is that some tasks can\u2019t be started until other tasks are finished. Finishing a task should \u201ckick-off\u201d its dependencies.

                      The simplest thing we can do is a \u201cMakefile\u201d to specify what action(s) depend on. But here are some limitations to this approach:

                      • What if re-computation needs to depend on content, not on a date?

                      • What if the dependencies are not files but disparate programs and databases?

                      • What if the work needs to be spread over multiple machines?

                      • What if many dependency graphs are executing all at once, with shared dependencies?

                      "},{"location":"spring2021/lecture-8/#mapreduce","title":"MapReduce","text":"

                      The old-school big data solutions to this are Hadoop and Apache Spark. These are MapReduce implementations, where you launch different tasks that each take a bit of the data (Map) and reduce their outputs into a single output (Reduce). Both Hadoop and Spark can run data processing operations and simple ML models on commodity hardware, with tricks to speed things up.

                      In the modern environment, you can\u2019t run an ML model (in PyTorch or TensorFlow) as part of running a Spark job (unless that model itself is programmed in Spark). That\u2019s when you need a workflow management system like Apache Airflow.

                      "},{"location":"spring2021/lecture-8/#dag","title":"DAG","text":"

                      In Airflow, a workflow is defined as a collection of tasks with directional dependencies, basically a directed acyclic graph (DAG). Each node in the graph is a task, and the edges define dependencies among the tasks. Tasks belong to two categories: (1) operators that execute some operation and (2) sensors that check for the state of a process or a data structure.

                      The main components of Airflow include: (1) a metadata database that stores the state of tasks and workflows, (2) a scheduler that uses the DAGs definitions together with the state of tasks in the metadata database to decide what needs to be executed, and (3) an executor that determines which worker will execute each task.

                      Besides Airflow, here are other notable solutions:

                      • Apache Beam: The TensorFlow team uses Apache Beam to generate big datasets and run those processing steps on Google Cloud Dataflow (a cloud orchestrator).

                      • Prefect: A similar idea to Airflow, Prefect is a Python framework that makes it easy to combine tasks into workflows, then deploy, schedule, and monitor their execution through the Prefect UI or API.

                      • dbt: dbt provides this data processing ability in SQL (called \u201canalytics engineering.\u201d)

                      • Dagster: Dagster is another data orchestrator for ML, analytics, and ETL. You can test locally and run anywhere with a unified view of data pipelines and assets.

                      "},{"location":"spring2021/lecture-8/#5-feature-store","title":"5 - Feature Store","text":"

                      Feature stores were first popularized by the ML team at Uber as part of their Michelangelo platform. Traditionally, ML systems are divided into two portions, offline processing and online processing.

                      • For the initial work of modeling, data that is generally static, perhaps stored in a data lake. Using some preprocessing methods (usually in SQL or Spark), data, which could be logfiles, requests, etc., are converted into features used to develop and train the model. The end result of this process is a model trained on a static sample of the data. This is an offline process.

                      • In contrast, the process of performing inference (e.g., Uber\u2019s need to return ride prices in real-time) often works with real-time data in an online process fashion. From a technology standpoint, whereas the offline use case might involve a data lake and Spark/SQL, the online processing use case involves technologies like Kafka and Cassandra that support speedier processing of creating or accessing the features required to perform inference.

                      This difference in how features need to be created and accessed is a natural place for bugs to crop up. Harmonization of the online and offline processes would reduce bugs, so the Uber team, amongst others, introduced the concept of features stores to do just that. Members of the Uber team developed Tecton, a feature store company, which is one option to implement this system. An open-source alternative is Feast. To summarize, Tecton offers a handy definition of what a feature store is: \u201can ML-specific data system that runs data pipelines that transform raw data into feature values, stores and manages the feature data itself, and serves feature data consistently for training and inference purposes.\u201d

                      A word of caution: don\u2019t over-engineer your system according to what others are doing. It\u2019s easy to wrap yourself up in adopting many tools and systems that aren\u2019t as optimal as their publicity may make them seem. Work with the tools you have first! For an interesting example of this, look at how \u201ccommand-line tools can be 235x faster than your Hadoop cluster\u201d.

                      "},{"location":"spring2021/lecture-8/#6-data-exploration","title":"6 - Data Exploration","text":"

                      The objective of data exploration is to understand and visualize the nature of the data you\u2019re modeling.

                      • Pandas is the Python workhorse of data visualization. It\u2019s highly recommended to be familiar with it.

                      • Dask is an alternative that can speed up data processing for large datasets that Pandas cannot handle through parallelization.

                      • Similarly, RAPIDS speeds up large dataset processing, though it does through the use of GPUs.

                      "},{"location":"spring2021/lecture-8/#7-data-labeling","title":"7 - Data Labeling","text":"

                      Effective data labeling is a core ingredient of production machine learning systems. Most data labeling platforms have a standard set of features: the ability to generate bounding boxes, segmentations, key points, class assignments, etc. The crucial objective is agreeing on what makes a good annotation and training annotators accordingly. To avoid annotator error cropping up, write clear guidelines that clarify rules for edge cases and high-quality annotations. One way to acquire the material needed to write such a guide is to start by annotating yourself. As you generate labels, ensure the quality of the annotations holds up across the annotator base. Some participants will be more reliable than others.

                      To develop an annotator base, there are a few options.

                      "},{"location":"spring2021/lecture-8/#sources-of-labor","title":"Sources of Labor","text":"
                      • One option is to hire your own annotators, which can help with the speed and quality of annotations. This, however, can be expensive and difficult to scale.

                      • Another option is to crowdsource labels via a platform like Amazon Mechanical Turk, which is fast and cheap to set up, but for which the quality can be poorer.

                      • \u2026or full-service data labeling companies.

                      "},{"location":"spring2021/lecture-8/#service-companies","title":"Service Companies","text":"

                      There are entire service companies that focus on data labeling that you can hire. Hiring such a company makes a great deal of sense, considering the time, labor, and software investment needed to label well at scale. To figure out the best data labeling company, start by annotating some gold standard data yourself. Then, contact and evaluate several companies on their value and a sample labeling task. Some companies in this space are FigureEight, Scale.ai, Labelbox, and Supervisely.

                      "},{"location":"spring2021/lecture-8/#software","title":"Software","text":"

                      If the costs of a full-service data labeling company are prohibitive, pure-play labeling software can be an option.

                      • Label Studio is a friendly open-source platform for this. New concepts to make labeling more strategic and efficient are coming to the fore.

                      • Aquarium helps you explore your data extensively and map the appropriate labeling strategy for classes that may be less prevalent or performant.

                      • Snorkel.ai offers a platform that incorporates weak supervision, which automatically labels data points based on heuristics and human feedback.

                      In summary, if you can afford not to label, don\u2019t; get a full-service company to take care of it. Failing that, try to use existing software and a part-time annotator base work (in lieu of a crowdsourced workforce).

                      "},{"location":"spring2021/lecture-8/#8-data-versioning","title":"8 - Data Versioning","text":"

                      Data versioning is important because machine learning models are part code and part data. If the data isn\u2019t versioned, the system isn\u2019t fully versioned! There are four levels to data versioning, which is similar to code versioning:

                      Level 0: No versioning.

                      • All data lives on a filesystem, in S3, and/or in a database.

                      • The problem arises most acutely in this paradigm, as deployed ML systems (whose code may be versioned) can quickly become divorced from their corresponding data.

                      • Furthermore, reverting to older versions will be challenging.

                      Level 1: Storing a snapshot of everything at training time.

                      • This works and can help you revert, but it\u2019s very hacky.

                      • Rather than doing this entire process manually, let\u2019s try to version automatically.

                      Level 2: Versioned as a mix of assets and code.

                      • You store the large files with unique IDs in S3, with corresponding reference JSON versioned with code.

                      • You should avoid storing the data directly in the repository, as the metadata itself can get pretty large. Using git-lfs lets you store them just as easily as code.

                      • The git signature + of the raw data file fully defines a model\u2019s data and code.

                      Level 3: Specialized solutions for version data.

                      • You should avoid them until you can identify their unique value add to your project.

                      • Some options here are DVC are Pachyderm. DVC has a Git-like workflow worth taking a closer look at. Dolt versions databases, if that\u2019s your need.

                      "},{"location":"spring2021/lecture-8/#9-data-privacy","title":"9 - Data Privacy","text":"

                      Increasingly, unfettered access to data for machine learning is less desirable and prevalent. This is especially true in regulated industries like healthcare and finance. To address such challenges, researchers are developing new data privacy techniques.

                      • Federated learning trains a global model on several local devices without ever acquiring global access to the data. Federated learning is still research-use only due to these issues: (1) sending model updates can be expensive, (2) the depth of anonymization is not clear, and (3) system heterogeneity when it comes to training is unacceptably high.

                      • Another research area is differential privacy, which tries to aggregate data in ways that prevent identification. Finally, learning on encrypted data has potential. Most data privacy efforts are research-focused, as the tooling is not yet mature.

                      "},{"location":"spring2021/lecture-9/","title":"Lecture 9: AI Ethics","text":""},{"location":"spring2021/lecture-9/#video","title":"Video","text":""},{"location":"spring2021/lecture-9/#slides","title":"Slides","text":"

                      Download slides as PDF

                      "},{"location":"spring2021/lecture-9/#notes","title":"Notes","text":"

                      Lecture by Sergey Karayev. Notes transcribed by James Le and Vishnu Rachakonda.

                      A preamble: Ethics is a vast subject spanning many disciplines and addressing many real different problems. As ML practitioners, we need to have a student mindset and do not assume we have the answers because these are not easy problems.

                      "},{"location":"spring2021/lecture-9/#1-what-is-ethics","title":"1 - What is Ethics?","text":"

                      Let\u2019s start with the definition of ethics:

                      • Ethics are not feelings because your feelings might mislead you.

                      • Ethics are not laws because ethics can supersede laws.

                      • Ethics are not societal beliefs because even an immoral society has its set of ethics.

                      "},{"location":"spring2021/lecture-9/#ethical-theories","title":"Ethical Theories","text":"

                      Kevin Binz put together a tour of ethical theories, including:

                      • The divine command theory states that a behavior is moral if the divine commands it. This theory might be accurate, but philosophy doesn\u2019t engage with it.

                      • The virtue ethics theory states that a behavior is moral if it upholds a person\u2019s virtues (bravery, generosity, love, etc.). This theory is apparently robust to philosophical inquiry, but there is increasing evidence that virtues are not persistent across a person\u2019s life and somewhat illusory.

                      • The deontology (duty-based) theory states that a behavior is moral if it satisfies the categorical imperative (i.e., don\u2019t lie, don\u2019t kill). This theory might lead to counter-intuitive moral decisions in many situations and has unacceptable inflexibility to many people.

                      • The utilitarian theory states that a behavior is moral if it brings the most good to the most people. But of course, how do we measure utility?

                      There does not seem to be a clear winner among professional philosophers. From this survey, there appears to be an even split between virtue, deontology, and utilitarianism.

                      "},{"location":"spring2021/lecture-9/#the-trolley-problem","title":"The Trolley Problem","text":"

                      The \u201ctrolley problem\u201d is often used to gain intuition about a person\u2019s ethics by presenting to him/her a moral dilemma. The classic dilemma is that: You see a trolley that is about to run over five people. But you can divert it to run over only one person. Would you do it? It actually leads to a lot of good memes. \ud83e\udd23

                      Another prominent ethical theory is John Rawl\u2019s theory of justice. Rawls argued that equal distribution of resources should be the desirable state of nature instead of following utilitarian philosophies. A Theory of Justice holds that every individual has an equal right to basic liberties. They should have the right to opportunities and an equal chance as other individuals of similar ability.

                      When ethics are applied to technology, it\u2019s essential to understand that they are not static and change with technological progress. Some examples include the industrial revolution, the right to Internet access, birth control, surrogate pregnancy, embryo selection, artificial womb, lab-grown meat, and much more. An excellent book to explore is Juan Enriquez\u2019s \u201cRight/Wrong: How Technology Transforms Our Ethics.\u201d

                      "},{"location":"spring2021/lecture-9/#2-long-term-ai-ethical-problems","title":"2 - Long-Term AI Ethical Problems","text":""},{"location":"spring2021/lecture-9/#autonomous-weapons","title":"Autonomous Weapons","text":"

                      The first example that came to a lot of people\u2019s minds is autonomous weapons. It might be tempting to dismiss it as far-fetched and unrealistic at this time. But as the saying goes, \u201cthe future is already here, just not evenly distributed\u201d:

                      • Israel apparently has autonomous \u2018robo-snipers\u2019 on its borders today.

                      • NYPD has been deploying Boston Dynamics robots in crime situations.

                      "},{"location":"spring2021/lecture-9/#lost-human-labor","title":"Lost Human Labor","text":"

                      Replacing human labor is another concern that has been creeping upon us. With the pandemic, you probably saw many articles saying that millions of people have lost jobs and probably will never get them back (replaced by AI). This could be both good and bad. \ud83e\udd14

                      It\u2019s bad if there are no social safety net and no other jobs for the unemployed. It\u2019s good because there is a megatrend of the demographic inversion. As the world\u2019s population tops out and baby booms vary across regions, the economy can\u2019t function as currently designed. Therefore, we need labor from somewhere. Rodney Brooks, a roboticist from MIT and the founder of iRobot, advocates for having robots in order to have a functioning economy in the next few decades.

                      An interesting spin on this worry is that AI is not necessarily replacing human labor but controlling human labor. This article from The Verge provides more details about working in conditions in warehouses, call centers, and other sectors.

                      If you want to go down the rabbit hole, check out this series \u201cManna - Two Views of Humanity\u2019s Future\u201d from Marshall Brain.

                      "},{"location":"spring2021/lecture-9/#human-extinction","title":"Human Extinction","text":"

                      The final worry is that if AI is superintelligent, then it is capable of replacing humans entirely.

                      "},{"location":"spring2021/lecture-9/#the-alignment-problem","title":"The Alignment Problem","text":"

                      What\u2019s common in all these long-term problems is the alignment problem. This notion is often expressed by the parable of the \u201cpaperclip maximizer\u201d - given the goal of producing paperclips, an AGI will eventually turn every atom space into paperclips. This is an old lesson about how to establish and communicate our goals and values to technologies precisely.

                      The guiding principle to build safe AI is that the AI systems we build need to be aligned with our goals and values. This is a deep topic and active research area in many places (including CHAI at Berkeley). As a matter of fact, this alignment lens is useful for near-term problems as well, as discussed in the rest of the lecture.

                      "},{"location":"spring2021/lecture-9/#3-hiring","title":"3 - Hiring","text":"

                      Let\u2019s say we are building an ML model to predict hiring decisions given a resume (inspired by this Reuters article about Amazon\u2019s hiring algorithm).

                      What should the data contain? Should it be the hiring decision that was made? Or should it be the eventual job performance given the person that was hired?

                      The data comes from the world, which is known to be biased in many ways: the hiring pipeline (not enough women educated for a software engineering job), the hiring decisions (employers intentionally or unintentionally select people that match some prejudice), the performance ratings (people get promoted not because they are good of their job, but because they match other expectations of the promoter).

                      Because the world is biased, the data will be biased no matter how we structure the data. Therefore, the model trained on that data will be biased.

                      The model will be used to aid or make an action: sourcing candidates, double-checking human decisions, or making the actual hiring decisions? In the last case, that action will amplify existing biases.

                      Amplifying existing biases is not aligned with our goals and values!\ud83d\ude20

                      "},{"location":"spring2021/lecture-9/#4-fairness","title":"4 - Fairness","text":""},{"location":"spring2021/lecture-9/#compas","title":"COMPAS","text":"

                      Let\u2019s look at a case study about COMPAS - Correctional Offender Management Profiling for Alternative Sanctions system to discuss fairness.

                      • The goal of this system is to predict recidivism (committing another crime), such that judges can consult a 1-10 score in pre-trial sentencing decisions.

                      • The motivation of this system is to be less biased than humans because the criminal justice system is notoriously biased against certain races.

                      • The solution of this system is to (1) gather relevant data, (2) exclude protected class attributes (race, gender, age, etc.), (3) train the model by ensuring that the model\u2019s score corresponds to the same probability of recidivism across all demographic groups.

                      And yet, this famous ProPublica report exposes the bias of this system against blacks.

                      "},{"location":"spring2021/lecture-9/#fairness-definitions-from-aravind-narayanans-lecture","title":"Fairness Definitions (From Aravind Narayanan\u2019s Lecture)","text":"

                      There are a bunch of fairness definitions. The first one concerns bias. We often mean statistical bias in machine learning - the difference between the model\u2019s expected value and the true value.

                      • In this sense, the COMPAS scores are not biased with respect to re-arrest. This is an important caveat; because we only have data for arrests, not crimes committed. There may well be bias in arrests (the data-generating process).

                      • Even if COMPAS is free of statistical bias, is it an adequate fairness criterion? Is this criterion aligned with human values?

                      Taking a step back and look at the classic binary classification problem setup, we have the confusion matrix as seen above. The interesting question to ask is what do different stakeholders want from the classifier?

                      • The decision-maker (the judge or the prosecutor) asks: \u201cOf those that I have labeled high risk, how many recidivated?\u201d This corresponds to the model\u2019s predictive value = TP / (TP + FP).

                      • The defendant asks: \u201cWhat is the probability I\u2019ll be incorrectly classified as high risk?\u201d This corresponds to the model\u2019s false positive rate = FP / (FP + FN).

                      • The society at large might care about: \u201cIs the selected set demographically balanced?\u201d This could be demographic parity, which leads to the definition of group fairness (\u201cDo outcomes differ between groups, which we have no reason to believe are actually different?\u201d).

                      A lot of these group fairness metrics have natural motivations, so there\u2019s not a single correct fairness definition. They depend on the politics of the situation.

                      Let\u2019s forget about demographic parity and only pick the two most important metrics (false-positive rate and false-negative rate) while allowing the model to use protected class attributes. We fail the individual fairness definition, which uses a single threshold for the sentencing decision or the pre-sentencing release decision.

                      Even if we pick one metric to optimize for, we still sacrifice some utility (providing public safety or releasing too few defendants).

                      To build more intuition, you should play around with this interactive demo on attacking discrimination with smarter ML from Google Research.

                      Finally, ML can be very good at finding patterns that maybe humans can\u2019t find. For instance, your ZIP code and age might be highly correlated with your race. That means the model can always pick up from a protected class attribute from other attributes. Read this paper on Equality of Opportunity in Supervised Learning for more detail.

                      "},{"location":"spring2021/lecture-9/#tradeoffs","title":"Tradeoffs","text":"

                      There are tradeoffs between different measures of group fairness, between the definitions of group fairness and individual fairness, and between the notions of fairness and utility. In fact, these tradeoffs are not specific to machine learning. They apply to human decision making too. There is also a tension between disparate treatment and disparate impact, which is another deep subject.

                      "},{"location":"spring2021/lecture-9/#seeing-the-water","title":"Seeing The Water","text":"

                      In order to see the water, it would be noteworthy to think about the differences between environmental equity and environmental justice:

                      • Equality: The assumption is that everyone benefits from the same supports. This is equal treatment.

                      • Equity: Everyone gets the support they need (\u201caffirmative action\u201d), thus producing equity.

                      • Justice: All parties are supported equally because the cause of the inequity was addressed. The systematic barrier has been removed.

                      The justice mindset is valuable to have. As computer scientists, we have very literal minds and argue for the rationality of our choices. But taking a step back and seeing the whole situation would be even more crucial.

                      "},{"location":"spring2021/lecture-9/#5-representation","title":"5 - Representation","text":""},{"location":"spring2021/lecture-9/#the-problem","title":"The Problem","text":"

                      Watch this simple video: a hand sanitizer dispenser that doesn\u2019t recognize racially diverse hands. It\u2019s a small example but illustrates a big problem: a lack of attention to diverse representation in the development of technology products. This occurs across fields, such as drug development, photography, etc. As pointed out by Timnit Gebru in this New York Times article, the exclusion of people from certain backgrounds poses a serious long-term threat to the viability of ML systems.

                      One way to address this challenge head-on is to focus on the inclusion of people from all backgrounds. Groups like Black in AI, Women in Machine Learning, and Latinx in AI play a big role in building communities of underrepresented people and inviting them into the AI/ML industry. Another is to deliberately ensure products reflect inclusive values. For example, Google Images now yields a diverse set of images for the search term \u201cCEO\u201d whereas it used to return entirely white, middle-aged men.

                      "},{"location":"spring2021/lecture-9/#word-embeddings","title":"Word Embeddings","text":"

                      A particularly relevant example of bias in machine learning is the underlying bias in the Word2Vec model. Word2Vec introduced vector math for word embeddings and is frequently used for NLP applications. The original model was trained on a large corpus, and the weights were open-sourced. As these weights were examined, underlying bias in the word logic was discovered. Terms like \u201cdoctor\u201d and \u201cprogrammer\u201d were associated with men, while \u201chomemaker\u201d and \u201cnurse\u201d were associated with women. Translating our existing biases like these into the ML domain is undesirable, to say the least! \ud83d\ude29

                      One potential solution to address this problem is to de-bias at training time with carefully screened data. With newer models like GPT-3 that are trained on massive swathes of data, this can be hard to do in practice. Bender and Gebru advise in a 2021 paper to reduce the dependence on large, unfiltered datasets and more carefully document the data-generating process. Alternatively, you can alert the user proactively of potential bias. Addressing this problem of bias in language models is an open problem.

                      "},{"location":"spring2021/lecture-9/#seeing-the-water_1","title":"Seeing The Water","text":"

                      Part of the challenge lies in agreeing on whether the model should learn about the world as it is in the data or learn about the world in a more idealistic manner. This is application-specific. A model recognizing hate speech on Facebook should probably learn about the world as it is, or a model interacting with humans\u2019 conversations should adhere to proper ideals. Of course, this begs the question of who decides what ideals are desirable and suitable for a model to follow. Consider these questions as you build models for various applications.

                      Ultimately, these challenges in machine learning systems development are rooted in ethics. Face recognition is a boundary-breaking area that has been grappling with ethical concerns. Importantly, face recognition illustrates how technology can impact ethics and change standards. Is the loss of privacy associated with face recognition desirable? Relatedly, are face recognition systems performing well across groups? The question of performance should generally follow ethics to avoid distracting from the fundamental ethical issues (e.g., civil rights, privacy, etc.).

                      "},{"location":"spring2021/lecture-9/#6-best-practices","title":"6 - Best Practices","text":"

                      A recent survey of ML practitioners found these to be the top challenges in ensuring fairness that they face:

                      1. Receiving support in fairness-aware data collection and curation

                      2. Overcoming team\u2019s blind spots

                      3. Implementing more proactive fairness auditing processes

                      4. Auditing complex ML systems

                      5. Deciding how to address particular instances of unfairness

                      6. Addressing biases in the humans embedded throughout the ML development pipeline

                      "},{"location":"spring2021/lecture-9/#suggestions","title":"Suggestions","text":"

                      Rachel Thomas, the co-creator of Fast.ai, has some great ideas on how to confront fairness issues proactively:

                      1. Perform ethical risk sweeping. Akin to cybersecurity penetration testing, where engineers intentionally try to find faults, you can try to engage in regular fairness checks on behalf of different stakeholders.

                      2. Expand the ethical circle. Try to consider different perspectives than yours regularly, and invite such people into your decision-making \u201ccircle\u201d to ensure that systems do not lead to unfair outcomes.

                      3. Think about worst-case scenarios. What incentives may crop up for people to engage in unethical behavior? For example, the upvote-downvote system and recommendations on Reddit can cause toxic behavior. Think about such incentives and requisite safeguards in advance.

                      4. Close the loop! You have to put in place a process to keep improving, as fairness is not a static test (just like raw performance).

                      One powerful tool, proposed by Gebru and Mitchell in 2018, is adopting \u201cmodel cards.\u201d For every ML model, make a simple page that discusses the expectations (i.e., input/output), tradeoffs, performance, and known limitations. Engaging in this documentation exercise allows for teams to confront fairness issues head-on more effectively. The objective here is to get everyone on the same page about what the model can and cannot do from a fairness perspective. We believe everyone should do this, considering how easy it is. Other methods like bias audits are also useful, as the Aequitas team at UChicago shows.

                      "},{"location":"spring2021/lecture-9/#a-code-of-ethics","title":"A Code of Ethics?","text":"

                      AI is a reflection of society. It\u2019s impossible to expect AI to be completely unbiased when humans still struggle with the problem. However, we can try our best to ensure that these biases are not amplified by AI and mitigate any such damage. Making fairness and ethics a routine part of AI development by professionals and teams is crucial to addressing the challenge. Perhaps an AI code of ethics (akin to the Hippocratic Oath) would make sense!

                      "},{"location":"spring2021/lecture-9/#7-where-to-learn-more","title":"7 - Where To Learn More","text":"

                      Here are some links to learn more:

                      1. https://ethics.fast.ai/: a course by the fast.ai team on practical data ethics consisting of 6 lectures.
                      2. CS 294: Fairness in Machine Learning: A graduate course (similar to FSDL) taught at Berkeley in 2017 about AI ethics.
                      3. Fair ML Book: A book being written by the instructor of the aforementioned course on fair ML.
                      4. KDD Tutorial on Fair ML: Taught by folks from CMU, this is a workshop addressing some of the topics in this lecture.
                      5. The Alignment Problem: a book that confronts present-day issues in AI alignment.
                      6. Weapons of Math Destruction: a popular book about current issues like Facebook\u2019s News Feed.
                      "},{"location":"spring2021/notebook-1/","title":"Notebook: Coding a neural net","text":""},{"location":"spring2021/notebook-1/#video","title":"Video","text":"

                      Notebook by Sergey Karayev.

                      In this video, we code a neural network from scratch. You'll get familiar with the Google Colab environment, create a simple linear regression model using only Numpy, and build a multi-layer perception regression model using NumPy, PyTorch, and Keras.

                      • 0:30\u200b - Colab Notebook 101
                      • 5:30\u200b - Numerical computing via NumPy
                      • 10:15\u200b - Plotting via Matplotlib
                      • 11:33\u200b - Basic regression with a linear model
                      • 24:30\u200b - Basic regression with a multi-layer perceptron
                      "},{"location":"spring2021/notebook-1/#follow-along","title":"Follow Along","text":"

                      Google Colab

                      "},{"location":"spring2021/panel/","title":"Panel Discussion: Do I need a PhD to work in ML?","text":"

                      We gathered a few people to offer different perspectives on whether grad school is required to work on interesting ML problems.

                      Moderated by Sergey Karayev.

                      The panelists:

                      • Pieter Abbeel - Professor at UC Berkeley
                      • Georgia Gkioxari - Research Scientist at Facebook, formerly PhD Berkeley
                      • Peter Gao - Co-founder and CEO of Aquarium Learning, formerly Cruise Automation
                      • Anil Jason - Co-founder and CTO of Quillbot
                      "},{"location":"spring2021/panel/#video","title":"Video","text":""},{"location":"spring2021/projects/","title":"Course Projects Showcase","text":"

                      The final project is the most important as well as the most fun part of the course. Students worked individually or in pairs over the duration of the course to complete a project involving any part of the full stack of deep learning.

                      The Top 10 projects, as selected by our course TAs, we viewed together with everyone, and posted the video on YouTube.

                      The details of these and other projects are below

                      "},{"location":"spring2021/projects/#artificial-manga-panel-dataset","title":"Artificial Manga Panel Dataset","text":"
                      • By Aasim Sani
                      • GitHub Page
                      • Project Report
                      • Video Presentations: Part 1 and Part 2
                      "},{"location":"spring2021/projects/#breast-cancer-detection-assistant","title":"Breast Cancer Detection Assistant","text":"
                      • By Harish Narayanan and Daniel Hen
                      • Project Report
                      • Source Code
                      • Video Presentation
                      • ML Video Presentation
                      "},{"location":"spring2021/projects/#human-protein-atlas-single-cell-classification","title":"Human Protein Atlas Single Cell Classification","text":"
                      • By Dariusz Kleczek
                      • Report
                      • Video Presentation
                      "},{"location":"spring2021/projects/#a-case-study-on-weakly-supervised-learning","title":"A Case Study on Weakly Supervised Learning","text":"
                      • By Jacques Thibodeau, Arian Pasquali, and Kevin Koehncke
                      • Project Report
                      • Source Code
                      • Video Presentation
                      "},{"location":"spring2021/projects/#real-time-nail-biting-alerter","title":"Real-Time Nail Biting Alerter","text":"
                      • By Jean-Marie Prigent
                      • Netlify Site
                      • Source Code
                      • Video Presentation
                      "},{"location":"spring2021/projects/#youtube-highlighter-for-creators-and-their-supporters","title":"YouTube Highlighter for Creators and Their Supporters","text":"
                      • By Keno Harada
                      • Slides
                      • Source Code
                      • Video Presentation
                      • Blog Post (in Japanese)
                      "},{"location":"spring2021/projects/#neural-rock","title":"Neural Rock","text":"
                      • By Lukas Mosser and co.
                      • Source Code
                      • Video Presentation
                      "},{"location":"spring2021/projects/#unity-machine-learning-agents","title":"Unity Machine Learning Agents","text":"
                      • By Paul Solomon
                      • Source Code
                      • Video Presentation
                      "},{"location":"spring2021/projects/#sitting-posture-coach","title":"Sitting Posture Coach","text":"
                      • By Peter De Roovere
                      • Source Code
                      • Video Presentation
                      "},{"location":"spring2021/projects/#intel-scene-classification","title":"Intel Scene Classification","text":"
                      • By Thomas Paula and Jonathan Salfity
                      • Source Code
                      • Report
                      • Video Presentation
                      "},{"location":"spring2021/projects/#bird-song-classifier","title":"Bird Song Classifier","text":"
                      • By Wendy Mak
                      • Report
                      • Streamlit App
                      "},{"location":"spring2021/projects/#detecting-deforestation-from-satellite-images","title":"Detecting Deforestation from Satellite Images","text":"
                      • By Andre Ferreira and Karthik Bhaskar
                      • Video
                      • Report
                      • Code
                      "},{"location":"spring2021/projects/#active-learning-a-systematic-investigation","title":"Active Learning: A Systematic Investigation","text":"
                      • By Matthias Pfenninger, Stefan Josef, and Ravindra Bharathi
                      • Video
                      • Report
                      • Code
                      "},{"location":"spring2021/projects/#drought-watch","title":"Drought Watch","text":"
                      • By Sambhavi Dhanabalan
                      • Video: Part 1 and Part 2
                      • Report
                      • Code
                      "},{"location":"spring2021/synchronous/","title":"Synchronous Online Course","text":"

                      We offered a paid cohort experience with the following additions to the lecture and lab materials released publicly:

                      • Slack workspace for learners, instructors, and teaching assistants
                      • Weekly graded assignment
                      • Capstone project reviewed by peers and staff
                      • Certificate of completion

                      Check out the original announcement page.

                      "},{"location":"spring2021/synchronous/#how-do-i-know-if-i-am-in-this-course","title":"How do I know if I am in this course?","text":"

                      If you registered and received an email receipt from Stripe, you're in, and should have been added to our Slack workspace on February 1.

                      Please email us if you have a Stripe receipt but aren't in our Slack.

                      "},{"location":"spring2021/synchronous/#teaching-assistants","title":"Teaching Assistants","text":"

                      This course is only possible with the support of our amazing TAs:

                      • Head TA James Le runs Data Relations for Superb AI and contributes to Data Journalism for Snorkel AI, after getting an MS in Recommendation Systems at RIT.
                      • Daniel Cooper is a machine learning engineer at QuantumWork, SaaS for recruiters.
                      • Han Lee is a Senior Data Scientist at WalletHub. Prior to that, he worked on various DS, MLE, and quant roles. Previously, he co-managed TEFQX.
                      • Nadia Ahmed is a machine learning researcher with The Frontier Development Lab and Trillium Technologies in remote sensing for severe weather and flood events.
                      • Andrew Mendez is a Senior Machine Learning Engineer at Clarifai, developing large scale computer vision and machine learning systems for the public sector. Previously he was a ML Engineer at CACI.
                      • Vishnu Rachakonda is a Machine Learning Engineer at Tesseract Health, a retinal imaging company, where he builds machine learning models for workflow augmentation and diagnostics in on-device and cloud use cases.
                      • Chester Chen is the Director of Data Science Engineering at GoPro. He also founded the SF Big Analytics Meetup.
                      "},{"location":"spring2021/synchronous/#schedule","title":"Schedule","text":"

                      While we post lectures once a week starting February 1, the first four weeks are review lectures -- stuff you should already know from other courses.

                      On March 1, we get to the Full Stack content, and you will begin doing weekly assignments, discussing in Slack, and thinking about their course project.

                      "},{"location":"spring2021/synchronous/#logistics","title":"Logistics","text":"

                      All learners, instructors, and TAs will be part of a Slack workspace. The Slack community is a crucial part of the course: a place to meet each other, post helpful links, share experiences, ask and answer questions.

                      On Monday, we post the lecture and lab videos for you to watch. Post questions, ideas, articles in Slack as you view the materials.

                      On Thursday, we go live on Zoom to discuss the posted questions and ideas. We have two 30-min slots: 9am and 6pm Pacific Time. We will send everyone a Google Calendar invite with the Zoom meeting information.

                      You have until Friday night to finish the assignment via Gradescope, which will be graded by next Tuesday, so that you have prompt feedback.

                      Labs are not graded and can be done on your own.

                      "},{"location":"spring2021/synchronous/#projects","title":"Projects","text":"

                      The final project is the most important as well as the most fun part of the course. You can pair up or work individually. The project can involve any part of the full stack of deep learning, and should take you roughly 40 hours per person, over 5 weeks.

                      Projects will be presented as five-minute videos and associated reports, and open sourcing the code is highly encouraged. All projects will be posted for peer and staff review.

                      The best projects will be awarded and publicized by Full Stack Deep Learning.

                      If you want to find a partner, please post in the #spring2021-projects Slack channel with your idea or just that you're available to pair up.

                      Project proposals are due on Gradescope a few weeks into the course.

                      Please read more information about the projects.

                      "},{"location":"spring2021/synchronous/#certificate","title":"Certificate","text":"

                      Those who complete the assignments and project will receive a certificate that can, for example, be displayed on LinkedIn.

                      "},{"location":"spring2021/synchronous/#time-commitment","title":"Time Commitment","text":"

                      On average, expect to spend 5-10 hours per week on the course.

                      "}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 00000000..be4f6c35 --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,428 @@ + + + + https://fullstackdeeplearning.com/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/blog/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/blog/posts/_template/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/blog/posts/running-llm-glm-130b/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/blog/posts/rwkv-explainer/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/cloud-gpus/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/conference/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/conference/sponsors/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/announcement/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/cohort/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lab-0-overview/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lab-4-experiment-management/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lab-5-troubleshooting-and-testing/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lab-6-data-annotation/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lab-7-web-deployment/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lab-8-model-monitoring/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/labs-1-3-cnns-transformers-pytorch-lightning/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lecture-1-course-vision-and-when-to-use-ml/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lecture-2-development-infrastructure-and-tooling/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lecture-3-troubleshooting-and-testing/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lecture-4-data-management/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lecture-5-deployment/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lecture-6-continual-learning/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lecture-7-foundation-models/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lecture-8-teams-and-pm/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/lecture-9-ethics/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/course/2022/project-showcase/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/expense/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/sponsors/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/expense/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/sponsors/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/askfsdl-walkthrough/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/askfsdl-walkthrough/chapter_summaries/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/augmented-language-models/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/augmented-language-models/chapter_summaries/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/chase-agents/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/chase-agents/chapter_summaries/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/launch-an-llm-app-in-one-hour/chapter_summaries/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/llm-foundations/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/llm-foundations/chapter_summaries/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/llmops/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/llmops/chapter_summaries/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/prompt-engineering/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/prompt-engineering/chapter_summaries/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/shabani-train-your-own/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/shabani-train-your-own/chapter_summaries/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/ux-for-luis/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/ux-for-luis/chapter_summaries/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/welinder-fireside-chat/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/welinder-fireside-chat/chapter_summaries/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/whats-next/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/llm-bootcamp/spring-2023/whats-next/chapter_summaries/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lab-1/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lab-2/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lab-3/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lab-4/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lab-5/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lab-6/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lab-7/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lab-8/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lab-9/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-1/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-10/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-11/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-12/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-13/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-2a/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-2b/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-3/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-4/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-5/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-6/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-7/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-8/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/lecture-9/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/notebook-1/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/panel/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/projects/ + 2023-11-24 + daily + + + https://fullstackdeeplearning.com/spring2021/synchronous/ + 2023-11-24 + daily + + \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz new file mode 100644 index 00000000..bfdf1462 Binary files /dev/null and b/sitemap.xml.gz differ diff --git a/spring2021/index.html b/spring2021/index.html new file mode 100644 index 00000000..cd50a11a --- /dev/null +++ b/spring2021/index.html @@ -0,0 +1,2460 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Course 2021 - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      + + + + + + + + +

                      Full Stack Deep Learning - Course Spring 2021

                      +
                      +

                      Info

                      +

                      This is the page for the 2021 edition of the course. For the 2022 edition, click here.

                      +
                      +

                      We've updated and improved our materials for our 2021 course taught at UC Berkeley and online.

                      +
                      +

                      Synchronous Online Course

                      +

                      We offered a paid synchronous option for those who wanted weekly assignments, capstone project, Slack discussion, and certificate of completion.

                      +

                      Enter your email below or follow us on Twitter to be the first to hear about future offerings of this option.

                      +
                      + +

                      +
                      +
                      +
                      + + + + +
                      +
                      +
                      +
                      + + +

                      Week 1: Fundamentals

                      +

                      We do a blitz review of the fundamentals of deep learning, and introduce the codebase we will be working on in labs for the remainder of the class.

                      + +

                      Reading:

                      +

                      How the backpropagation algorithm works

                      +

                      Week 2: CNNs

                      +

                      We cover CNNs and Computer Vision Applications, and introduce a CNN in lab.

                      + +

                      Reading:

                      +

                      A brief introduction to Neural Style Transfer

                      +

                      Improving the way neural networks learn

                      +

                      Week 3: RNNs

                      +

                      We cover RNNs and applications in Natural Language Processing, and start doing sequence processing in lab.

                      + +

                      Reading:

                      +

                      The Unreasonable Effectiveness of Recurrent Neural Networks

                      +

                      Attention Craving RNNS: Building Up To Transformer Networks

                      +

                      Week 4: Transformers

                      +

                      We talk about the successes of transfer learning and the Transformer architecture, and start using it in lab.

                      + +

                      Reading:

                      +

                      Transformers from Scratch

                      +

                      Week 5: ML Projects

                      +

                      Our synchronous online course begins with the first "Full Stack" lecture: Setting up ML Projects.

                      + +

                      Reading:

                      +

                      Rules of Machine Learning

                      +

                      ML Yearning (and subscribe to Andrew Ng's newsletter)

                      +

                      Those in the syncronous online course will have their first weekly assignment: Assignment 1, available on Gradescope.

                      +

                      Week 6: Infra & Tooling

                      +

                      We tour the landscape of infrastructure and tooling for deep learning.

                      + +

                      Reading:

                      +

                      Machine Learning: The High-Interest Credit Card of Technical Debt

                      +

                      Those in the syncronous online course will have to work on Assignment 2.

                      +

                      Week 7: Troubleshooting

                      +

                      We talk about how to best troubleshoot training. +In lab, we learn to manage experiments.

                      + +

                      Reading:

                      +

                      Why is machine learning hard?

                      +

                      Those in the syncronous online course will have to work on Assignment 3.

                      +

                      Week 8: Data

                      +

                      We talk about Data Management, and label some data in lab.

                      + +

                      Reading:

                      +

                      Emerging architectures for modern data infrastructure

                      +

                      Those in the syncronous online course will have to work on Assignment 4.

                      +

                      Week 9: Ethics

                      +

                      We discuss ethical considerations. +In lab, we move from lines to paragraphs.

                      + +

                      Those in the synchronous online course will have to submit their project proposals.

                      +

                      Week 10: Testing

                      +

                      We talk about Testing and Explainability, and set up Continuous Integration in lab.

                      + +

                      Those in the synchronous online course will work on their projects.

                      +

                      Week 11: Deployment

                      +

                      We cover Deployment and Monitoring, and package up our model for deployment in lab.

                      + +

                      Those in the synchronous online course will work on their projects.

                      +

                      Week 12: Research

                      +

                      We talk research, and set up robust monitoring for our model.

                      + + + +

                      Those in the synchronous online course will work on their projects.

                      +

                      Week 13: Teams

                      +

                      We discuss ML roles and team structures, as well as big companies vs startups.

                      + +

                      Week 14-16: Projects

                      +

                      Those in the synchronous online course will submit 5-minute videos of their projects and associated write-ups by May 15.

                      +

                      Check out the course projects showcase.

                      +

                      Other Resources

                      +

                      Fast.ai is a great free two-course sequence aimed at first getting hackers to train state-of-the-art models as quickly as possible, and only afterward delving into how things work under the hood. Highly recommended for anyone.

                      +

                      Dive Into Deep Learning is a great free textbook with Jupyter notebooks for every part of deep learning.

                      +

                      NYU’s Deep Learning course has excellent PyTorch breakdowns of everything important going on in deep learning.

                      +

                      Stanford’s ML Systems Design course has lectures that parallel those in this course.

                      +

                      The Batch by Andrew Ng is a great weekly update on progress in the deep learning world.

                      +

                      /r/MachineLearning/ is the best community for staying up to date with the latest developments.

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lab-1/index.html b/spring2021/lab-1/index.html new file mode 100644 index 00000000..8d39ba9d --- /dev/null +++ b/spring2021/lab-1/index.html @@ -0,0 +1,2292 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 1: Setup and Introduction - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lab 1: Setup and Introduction

                      +

                      Video

                      + + +

                      Lab by Sergey Karayev.

                      +

                      In this video, we introduce the lab throughout the course. We formulate the problem, provide the codebase structure, and train a simple Multilayer Perceptron on the MNIST dataset.

                      +
                        +
                      • 4:11 - Understand the problem and path to solution
                      • +
                      • 5:54 - Set up the computing environment
                      • +
                      • 12:54 - Review the codebase
                      • +
                      • 24:55 - Train the MLP model on MNIST
                      • +
                      +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Follow Along

                      +

                      GitHub Readme

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lab-2/index.html b/spring2021/lab-2/index.html new file mode 100644 index 00000000..f1e44dc6 --- /dev/null +++ b/spring2021/lab-2/index.html @@ -0,0 +1,2272 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 2: CNNs and Synthetic Data - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lab 2: CNNs and Synthetic Data

                      +

                      Video

                      + + +

                      Lab by Sergey Karayev.

                      +

                      In this lab, you train a single-line ConvNet predictor on the EMNIST dataset and then synthetically generate your own data.

                      +
                        +
                      • 00:00 - Introduction
                      • +
                      • 05:23 - Look at the EMNIST dataset
                      • +
                      • 09:52 - Train a base ConvNet model
                      • +
                      • 12:43 - Examine the ConvNet code
                      • +
                      • 17:33 - Lab 2 homework
                      • +
                      • 19:35 - Make a synthetic dataset of EMNIST lines
                      • +
                      +

                      Follow Along

                      +

                      GitHub Readme

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lab-3/index.html b/spring2021/lab-3/index.html new file mode 100644 index 00000000..191a9d78 --- /dev/null +++ b/spring2021/lab-3/index.html @@ -0,0 +1,2271 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 3: RNNs - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lab 3: RNNs

                      +

                      Video

                      + + +

                      Lab by Sergey Karayev.

                      +
                        +
                      • 00:00 - Introduction.
                      • +
                      • 01:59 - Introduce LineCNNSimple, a model that can read multiple characters in an image.
                      • +
                      • 15:52 - Make this model more efficient with LineCNN, which uses a fully convolutional network.
                      • +
                      • 18:18 - Upgrade the model further into LitModelCTC, which uses a CTC (Connectionist Temporal Classification) loss.
                      • +
                      • 23:29 - Finalize your model, LineCNNLSTM, by adding an LSTM layer on top of CNN.
                      • +
                      • 27:34 - Lab 3 homework.
                      • +
                      +

                      Follow along

                      +

                      Readme

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lab-4/index.html b/spring2021/lab-4/index.html new file mode 100644 index 00000000..00cb45dd --- /dev/null +++ b/spring2021/lab-4/index.html @@ -0,0 +1,2271 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 4: Transformers - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lab 4: Transformers

                      +

                      Video

                      + + +

                      Lab by Sergey Karayev.

                      +

                      In this lab, you use the LineCNN + LSTM model with CTC loss from lab 3 as an "encoder" of the image, and then send it through Transformer decoder layers.

                      +
                        +
                      • 00:00 - Introduction
                      • +
                      • 01:43 - LineCNNTransformer class
                      • +
                      • 04:50 - TransformerLitModel
                      • +
                      • 06:51 - Code to make predictions
                      • +
                      • 08:50 - Training guidelines
                      • +
                      +

                      Follow along

                      +

                      Readme

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lab-5/index.html b/spring2021/lab-5/index.html new file mode 100644 index 00000000..2174ef1c --- /dev/null +++ b/spring2021/lab-5/index.html @@ -0,0 +1,2272 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 5: Experiment Management - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lab 5: Experiment Management

                      +

                      Video

                      + + +

                      Lab by Sergey Karayev.

                      +

                      In this lab, we'll use Weights and Biases to manage experiments for our handwriting recognition model.

                      +
                        +
                      • 00:00 - Introduction
                      • +
                      • 00:56 - IAMLines Dataset
                      • +
                      • 05:29 - Make EMNISTLines more like IAMLines
                      • +
                      • 09:57 - Set up Weights and Biases
                      • +
                      • 13:42 - Run experiments on Weights and Biases
                      • +
                      • 22:58 - Configure W&B sweeps to search for hyper-parameters
                      • +
                      +

                      Follow along

                      +

                      Readme

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lab-6/index.html b/spring2021/lab-6/index.html new file mode 100644 index 00000000..2af52388 --- /dev/null +++ b/spring2021/lab-6/index.html @@ -0,0 +1,2264 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 6: Data Labeling - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lab 6: Data Labeling

                      +

                      Video

                      + + +

                      Lab by Sergey Karayev.

                      +

                      In this lab we will annotate some handwriting samples that we collected using the open-source tool Label Studio.

                      +

                      Follow along

                      +

                      Readme

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lab-7/index.html b/spring2021/lab-7/index.html new file mode 100644 index 00000000..5374e0a0 --- /dev/null +++ b/spring2021/lab-7/index.html @@ -0,0 +1,2270 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 7: Paragraph Recognition - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lab 7: Paragraph Recognition

                      +

                      Video

                      + + +

                      Lab by Sergey Karayev and Saurabh Bipin Chandra.

                      +

                      In this lab, we will do several things:

                      +
                        +
                      • Move from training on synthetic line data to training on real data -- the IAMLines data module
                      • +
                      • Move from training on line data to training on paragraph data -- the IAMParagraphs data module
                      • +
                      • Automatically save the final model
                      • +
                      • Introduce ParagraphTextRecognizer class to load the model and run inference that we can use in production
                      • +
                      +

                      Follow along

                      +

                      Readme

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lab-8/index.html b/spring2021/lab-8/index.html new file mode 100644 index 00000000..798871c4 --- /dev/null +++ b/spring2021/lab-8/index.html @@ -0,0 +1,2270 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 8: Testing & CI - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lab 8: Testing & CI

                      +

                      Video

                      + + +

                      Lab by Sergey Karayev.

                      +

                      In this lab, we

                      +
                        +
                      • Add linting
                      • +
                      • Add prediction tests
                      • +
                      • Add evaluation tests
                      • +
                      • Set up continuous integration using CircleCI
                      • +
                      +

                      Follow along

                      +

                      Readme

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lab-9/index.html b/spring2021/lab-9/index.html new file mode 100644 index 00000000..ece31f6c --- /dev/null +++ b/spring2021/lab-9/index.html @@ -0,0 +1,2270 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lab 9: Web Deployment - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lab 9: Web Deployment

                      +

                      Video

                      + + +

                      Lab by Sergey Karayev.

                      +

                      In this lab, we do several things.

                      +
                        +
                      • First, we speed up our ParagraphTextRecognizer model with TorchScript
                      • +
                      • Next, we wrap the model in a web app, and send it some requests
                      • +
                      • We package up the web app and model as a Docker container, and run it that way
                      • +
                      • Lastly, we prepare to deploy as a serverless function using AWS Lambda, getting it working locally.
                      • +
                      +

                      Follow along

                      +

                      Readme

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-1/index.html b/spring2021/lecture-1/index.html new file mode 100644 index 00000000..a6570c26 --- /dev/null +++ b/spring2021/lecture-1/index.html @@ -0,0 +1,2296 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 1: DL Fundamentals - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lecture 1: DL Fundamentals

                      +

                      Video

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Lecture by Sergey Karayev.

                      +

                      In this video, we discuss the fundamentals of deep learning. We will cover artificial neural networks, the universal approximation theorem, three major types of learning problems, the empirical risk minimization problem, the idea behind gradient descent, the practice of back-propagation, the core neural architectures, and the rise of GPUs.

                      +

                      This should be a review for most of you; if not, then briefly go through this online book -neuralnetworksanddeeplearning.com.

                      +
                        +
                      • 1:25​ - Neural Networks
                      • +
                      • 6:48​ - Universality
                      • +
                      • 8:48​ - Learning Problems
                      • +
                      • 16:17​ - Empirical Risk Minimization / Loss Functions
                      • +
                      • 19:55​ - Gradient Descent
                      • +
                      • 23:57​ - Backpropagation / Automatic Differentiation
                      • +
                      • 26:09​ - Architectural Considerations
                      • +
                      • 29:01​ - CUDA / Cores of Compute
                      • +
                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-10-notes-media/image1.png b/spring2021/lecture-10-notes-media/image1.png new file mode 100644 index 00000000..61a2e649 Binary files /dev/null and b/spring2021/lecture-10-notes-media/image1.png differ diff --git a/spring2021/lecture-10-notes-media/image10.png b/spring2021/lecture-10-notes-media/image10.png new file mode 100644 index 00000000..6592e947 Binary files /dev/null and b/spring2021/lecture-10-notes-media/image10.png differ diff --git a/spring2021/lecture-10-notes-media/image11.png b/spring2021/lecture-10-notes-media/image11.png new file mode 100644 index 00000000..0e27d2fd Binary files /dev/null and b/spring2021/lecture-10-notes-media/image11.png differ diff --git a/spring2021/lecture-10-notes-media/image12.png b/spring2021/lecture-10-notes-media/image12.png new file mode 100644 index 00000000..f31af8fa Binary files /dev/null and b/spring2021/lecture-10-notes-media/image12.png differ diff --git a/spring2021/lecture-10-notes-media/image13.png b/spring2021/lecture-10-notes-media/image13.png new file mode 100644 index 00000000..a76a9bb8 Binary files /dev/null and b/spring2021/lecture-10-notes-media/image13.png differ diff --git a/spring2021/lecture-10-notes-media/image14.png b/spring2021/lecture-10-notes-media/image14.png new file mode 100644 index 00000000..4e1c3e72 Binary files /dev/null and b/spring2021/lecture-10-notes-media/image14.png differ diff --git a/spring2021/lecture-10-notes-media/image15.png b/spring2021/lecture-10-notes-media/image15.png new file mode 100644 index 00000000..50e94e4c Binary files /dev/null and b/spring2021/lecture-10-notes-media/image15.png differ diff --git a/spring2021/lecture-10-notes-media/image16.png b/spring2021/lecture-10-notes-media/image16.png new file mode 100644 index 00000000..be74eea7 Binary files /dev/null and b/spring2021/lecture-10-notes-media/image16.png differ diff --git a/spring2021/lecture-10-notes-media/image17.png b/spring2021/lecture-10-notes-media/image17.png new file mode 100644 index 00000000..1fb13f90 Binary files /dev/null and b/spring2021/lecture-10-notes-media/image17.png differ diff --git a/spring2021/lecture-10-notes-media/image18.png b/spring2021/lecture-10-notes-media/image18.png new file mode 100644 index 00000000..79923dbe Binary files /dev/null and b/spring2021/lecture-10-notes-media/image18.png differ diff --git a/spring2021/lecture-10-notes-media/image19.png b/spring2021/lecture-10-notes-media/image19.png new file mode 100644 index 00000000..20f73c47 Binary files /dev/null and b/spring2021/lecture-10-notes-media/image19.png differ diff --git a/spring2021/lecture-10-notes-media/image2.png b/spring2021/lecture-10-notes-media/image2.png new file mode 100644 index 00000000..f902535f Binary files /dev/null and b/spring2021/lecture-10-notes-media/image2.png differ diff --git a/spring2021/lecture-10-notes-media/image3.png b/spring2021/lecture-10-notes-media/image3.png new file mode 100644 index 00000000..66f78208 Binary files /dev/null and b/spring2021/lecture-10-notes-media/image3.png differ diff --git a/spring2021/lecture-10-notes-media/image4.png b/spring2021/lecture-10-notes-media/image4.png new file mode 100644 index 00000000..0a6eeff1 Binary files /dev/null and b/spring2021/lecture-10-notes-media/image4.png differ diff --git a/spring2021/lecture-10-notes-media/image5.png b/spring2021/lecture-10-notes-media/image5.png new file mode 100644 index 00000000..1615e283 Binary files /dev/null and b/spring2021/lecture-10-notes-media/image5.png differ diff --git a/spring2021/lecture-10-notes-media/image6.png b/spring2021/lecture-10-notes-media/image6.png new file mode 100644 index 00000000..3af7408b Binary files /dev/null and b/spring2021/lecture-10-notes-media/image6.png differ diff --git a/spring2021/lecture-10-notes-media/image7.png b/spring2021/lecture-10-notes-media/image7.png new file mode 100644 index 00000000..da09a566 Binary files /dev/null and b/spring2021/lecture-10-notes-media/image7.png differ diff --git a/spring2021/lecture-10-notes-media/image8.png b/spring2021/lecture-10-notes-media/image8.png new file mode 100644 index 00000000..ec515e1e Binary files /dev/null and b/spring2021/lecture-10-notes-media/image8.png differ diff --git a/spring2021/lecture-10-notes-media/image9.png b/spring2021/lecture-10-notes-media/image9.png new file mode 100644 index 00000000..ba7397d2 Binary files /dev/null and b/spring2021/lecture-10-notes-media/image9.png differ diff --git a/spring2021/lecture-10/index.html b/spring2021/lecture-10/index.html new file mode 100644 index 00000000..464fc11c --- /dev/null +++ b/spring2021/lecture-10/index.html @@ -0,0 +1,3681 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 10: Testing & Explainability - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      + + + + + + + + +

                      Lecture 10: Testing & Explainability

                      +

                      Video

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Download notes as PDF

                      +

                      Lecture by Josh Tobin. +Notes transcribed by James Le +and Vishnu Rachakonda.

                      +

                      1 - What’s Wrong With Black-Box Predictions?

                      +

                      What does it mean when we have a good test set performance?

                      +

                      If the test data and production data come from the same +distribution, then in expectation, the performance of your +model on your evaluation metrics will be the same.

                      +

                      Let’s unpack the bolded assumptions:

                      +
                        +
                      • +

                        In the real world, the production distribution does not always +match the offline distribution. You could have data drift, data +shift, or even malicious users trying to attack your model.

                        +
                      • +
                      • +

                        Expected performance does not tell the whole story. For +instance, if you are working on long-tail data distribution, then +the sample of data that you use to evaluate the model offline +might not tell you much about the tail of that distribution - +meaning that your test set score can be misleading. On top of +that, if you evaluate your model with a single metric across your +entire dataset, that does not mean your model is actually +performing well against all the slices of data that might be +important.

                        +
                      • +
                      • +

                        The performance of your model is not equal to the performance of +your machine learning system. There are other things (that can +go wrong with the ML system) that do not have anything to do with +the model.

                        +
                      • +
                      • +

                        Finally, the test set performance only tells you about the metrics +that you are evaluating. In the real world, you are probably not +optimizing the exact metrics you care about deep down.

                        +
                      • +
                      +

                      How bad is this problem? This is a quote from a former ML engineer at an +autonomous vehicle company: “I think the single biggest thing holding +back the autonomous vehicle industry today is that, even if we had a car +that worked, no one would know, because no one is confident that they +know how to evaluate it properly.” We believe that there is a similar +sentiment to lesser degrees in other fields of machine learning, where +the evaluation is the biggest bottleneck.

                      +

                      The goal of this lecture is to introduce concepts and methods to help +you, your team, and your users:

                      +
                        +
                      1. +

                        Understand at a deeper level how well your model is performing.

                        +
                      2. +
                      3. +

                        Become more confident in your model’s ability to perform well in +production.

                        +
                      4. +
                      5. +

                        Understand the model’s performance envelope (where you should +expect it to perform well and where not).

                        +
                      6. +
                      +

                      2 - Software Testing

                      +

                      Types of Tests

                      +

                      There are three basic types of software tests:

                      +
                        +
                      1. +

                        Unit tests that test the functionality of a single piece of code +(an assertion on a single function or a single class) in +isolation.

                        +
                      2. +
                      3. +

                        Integration tests that test how two or more units perform when +used together (e.g., test if a model works well with a +pre-processing function).

                        +
                      4. +
                      5. +

                        End-to-end tests that test how the entire software system +performs when all units are put together (e.g., test on realistic +inputs from a real user).

                        +
                      6. +
                      +

                      Testing is a broad field, so you will likely encounter various other +kinds of tests as well.

                      +

                      Best Practices

                      +

                      Here are a couple of “uncontroversial” testing best practices:

                      +
                        +
                      • +

                        Automate your tests: You have tests that run by themselves +(typically via a CI/CD system) without a user committing an +action. There should be no ambiguity on whether your system +performs up to standard on the tests that are being run.

                        +
                      • +
                      • +

                        Make sure your tests are reliable, run fast, and go through the +same code review process as the rest of your code: The number of +tests grows in proportion to the size of your codebase. If your +tests are unreliable, then people will start ignoring them. If +your tests are slow, then you won’t want to run them frequently +during development. If your tests do not undergo the code review +process, they will have bugs, and it’s better not to have them at +all.

                        +
                      • +
                      • +

                        Enforce that tests must pass before merging into the main +branch: This is a good norm for teams with more than one person. +This is a forcing function to make sure that everyone is committed +to writing good tests and can also be helpful for regulatory +concerns.

                        +
                      • +
                      • +

                        When you find new production bugs, convert them to tests: This +ensures that someone does not accidentally reintroduce those bugs +in the future.

                        +
                      • +
                      • +

                        Follow the testing pyramid: Introduced by +Google, +it says that you should write a lot more unit tests than +integration tests and a lot more integration tests than end-to-end +tests. Compared to end-to-end tests, unit tests are faster, more +reliable, and better at isolating failures. The rule of thumb that +Google recommends (as a rough split) is 70% unit tests, 20% +integration tests, and 10% end-to-end tests.

                        +
                      • +
                      +

                      +

                      Next up, let’s discuss a few “controversial” testing best practices:

                      +
                        +
                      • +

                        Solitary +tests: The +distinction between a solitary test and a sociable test is that - +solitary testing does not rely on real data from other units, +while sociable testing makes the implicit assumption that other +modules are working.

                        +
                      • +
                      • +

                        Test +coverage: +You get a test coverage score for your codebase, which tells you +what percentage of lines of code in your codebase is called by at +least one test. Test coverage gives you a single metric that +quantifies the quality of your testing suite. However, test +coverage does not measure the right things (in particular, test +quality).

                        +
                      • +
                      • +

                        Test-driven +development: +In principle, you want to create your tests before you write your +code. These tests serve as the specification of how the code +functions. There are not many people who religiously stick to this +methodology of development, but TDD is a valuable tool +nonetheless.

                        +
                      • +
                      +

                      +

                      Testing In Production

                      +

                      The traditional view is that the goal of testing is to prevent shipping +bugs into production. Therefore, by definition, you must do your testing +offline before your system goes into production. However, there are two +caveats:

                      +
                        +
                      1. +

                        Informal surveys reveal that the percentage of bugs found by +automated tests is surprisingly low.

                        +
                      2. +
                      3. +

                        On top of that, modern service-oriented distributed systems (which +are deployed in most software engineering organizations nowadays) +are particularly hard to test. The interactions between the +components can get tricky.

                        +
                      4. +
                      +

                      Here is our philosophy for testing in production: Bugs are inevitable, +so you might as well set up the system so that users can help you find +them.

                      +

                      +

                      There are a few strategies to test in production:

                      +
                        +
                      1. +

                        Canary deployment: Do not roll out the new software version to +all the users right away. Instead, just roll it out to a small +percentage of your users and separately monitor that group’s +behavior.

                        +
                      2. +
                      3. +

                        A/B testing: You can run a more principled statistical test if +you have particular metrics that you care about: one for the old +version of the code that is currently running and another for the +new version that you are trying to test.

                        +
                      4. +
                      5. +

                        Real user monitoring: Rather than looking at aggregate metrics +(i.e., click-through rate), try to follow the journey that an +actual user takes through your application and build a sense of +how users experience the changes.

                        +
                      6. +
                      7. +

                        Exploratory testing: Testing in production is not something that +you want to automate fully. It should involve a bit of exploration +(individual users or granular metrics).

                        +
                      8. +
                      +

                      Continuous Integration and Continuous Delivery

                      +

                      CI/CD platforms automate the tests that you run by hooking into your +code repo. When you trigger some actions to take place (pushing new +code, merging new code into a branch, submitting a pull request), CI/CD +platforms kick off a job that is responsible for packaging your code, +running all your tests, producing a report that tells you how well your +code performs on your tests, and gatekeeping whether your new code can +make it to the next stage. Tactically, you can define these jobs as +commands in a Docker container and store the results for later review.

                      +

                      SaaS solutions for continuous integration include +CircleCI and Travis +CI. Most of them do not have GPUs +available. If you are just getting started, the default recommendation +is GitHub Actions, which +is super easy to integrate.

                      +

                      +

                      Jenkins and +Buildkite are manual options for +running continuous integration on your own hardware, in the cloud, or +something in between. There is a lot more flexibility about the types of +jobs you can run through the systems (meaning you can use your GPUs). +The tradeoff is that they are harder to set up.

                      +

                      3 - Testing Machine Learning Systems

                      +

                      There are several core differences between traditional software +systems and ML systems that add complexity to testing ML systems:

                      +
                        +
                      • +

                        Software consists of only code, but ML combines code and data.

                        +
                      • +
                      • +

                        Software is written by humans to solve a problem, while ML is +compiled by optimizers to satisfy a proxy metric.

                        +
                      • +
                      • +

                        Software is prone to loud failures, while ML is prone to silent +failures.

                        +
                      • +
                      • +

                        Software tends to be relatively static (in principle), while ML is +constantly changing.

                        +
                      • +
                      +

                      Due to such differences, here are common mistakes that teams make +while testing ML systems:

                      +
                        +
                      • +

                        Think the ML system is just a model and only test that model.

                        +
                      • +
                      • +

                        Not test the data.

                        +
                      • +
                      • +

                        Not build a granular enough understanding of the performance of the +model before deploying it.

                        +
                      • +
                      • +

                        Not measure the relationship between model performance metrics and +business metrics.

                        +
                      • +
                      • +

                        Rely too much on automated testing.

                        +
                      • +
                      • +

                        Think offline testing is enough, and therefore, not monitor or test +in production.

                        +
                      • +
                      +

                      +

                      Above is the diagram of how you can think of your entire production ML +system that straddles across the offline and online environments:

                      +
                        +
                      • +

                        Sitting in the middle is your ML model - an artifact created by +your training process, which takes in an input and produces an +output.

                        +
                      • +
                      • +

                        The training system takes code and data as inputs and produces +the trained model as the output.

                        +
                      • +
                      • +

                        The prediction system takes in and pre-processes the raw data, +loads the trained ML model, loads the model weights, calls +model.predict() on the data, post-processes the outputs, and +returns the predictions.

                        +
                      • +
                      • +

                        Once you deploy your prediction system to the online environment, +the serving system takes in requests from users, scales up and +down to meet the traffic demands, and produces predictions back to +those users.

                        +
                      • +
                      • +

                        The whole ML system closes the loop by collecting production +data (both the predictions that the model produces and +additional feedback from users, business metrics, or labelers) and +sending them back to the offline environment.

                        +
                      • +
                      • +

                        The labeling system takes the raw data seen in production, helps +you get inputs from labelers, and provides labels for that data.

                        +
                      • +
                      • +

                        The storage and pre-processing system stores and pre-processes +the labeled data before passing it back to the training system.

                        +
                      • +
                      +

                      One way to think about how to test ML systems the right way is to +think about the tests that you can run for each system component and +across the border of these components.

                      +

                      Infrastructure Tests

                      +

                      +

                      Infrastructure tests are unit tests for your training system. They +help you avoid bugs in the training pipeline. You can unit test your +training code like any other code. Another common practice is to add +single-batch or single-epoch tests that check performance after an +abbreviated training run on a tiny dataset, which catches obvious +regressions to your training code. Tactically, you should run +infrastructure tests frequently during the development process.

                      +

                      Training Tests

                      +

                      +

                      Training tests are integration tests between your data system and your +training system. They make sure that training jobs are reproducible.

                      +
                        +
                      • +

                        You can pull a fixed dataset and run a full or abbreviated training +run on it. Then, you want to check and ensure that the model +performance on the newly trained model remains consistent with the +reference performance.

                        +
                      • +
                      • +

                        Another option is to pull a sliding window of data (maybe a new +window for every few days) and run training tests on that window.

                        +
                      • +
                      • +

                        Tactically, you should run training tests periodically, ideally +nightly for frequently changing codebase.

                        +
                      • +
                      +

                      Functionality Tests

                      +

                      +

                      Functionality tests are unit tests for your prediction system. They +help you avoid regressions in code that makes up your prediction +infrastructure.

                      +
                        +
                      • +

                        You can unit test your prediction code like any other code.

                        +
                      • +
                      • +

                        Specifically for the ML system, you can load a pre-trained model +and test its predictions on a few key examples.

                        +
                      • +
                      • +

                        Tactically, you should run functionality tests frequently during the +development process.

                        +
                      • +
                      +

                      Evaluation Tests

                      +

                      +

                      Evaluation tests are integration tests between your training system +and your prediction system. They make sure that a newly trained model +is ready to go into production. These make up the bulk of what’s unique +about testing ML systems.

                      +
                        +
                      • +

                        At a high level, you want to evaluate your model on all of the +metrics, datasets, and slices that you care about.

                        +
                      • +
                      • +

                        Then, you want to compare the new model to the old and baseline +models.

                        +
                      • +
                      • +

                        Finally, you want to understand the performance envelope of the +new model.

                        +
                      • +
                      • +

                        Operationally, you should run evaluation tests whenever you have a +new candidate model considered for production.

                        +
                      • +
                      +

                      It is important to note that evaluation tests are more than just the +validation score. They look at all the metrics that you care about:

                      +
                        +
                      • +

                        Model metrics: precision, recall, accuracy, L2, etc.

                        +
                      • +
                      • +

                        Behavioral metrics: The goal of behavioral tests is to ensure +the model has the invariances we expect. There are three types of +behavioral tests: (1) invariance tests to assert that the change +in inputs shouldn’t affect outputs, (2) directional tests to +assert that the change in inputs should affect outputs, and (3) +minimum functionality tests to ensure that certain inputs and +outputs should always produce a given result. Behavioral testing +metrics are primarily used in NLP applications and proposed in the +Beyond Accuracy paper by Ribeiro et +al. (2020).

                        +
                      • +
                      • +

                        Robustness metrics: The goal of robustness tests is to +understand the model’s performance envelope (i.e., where you +should expect the model to fail). You can examine feature +importance, sensitivity to staleness, sensitivity to data drift, +and correlation between model performance and business metrics. In +general, robustness tests are still under-rated.

                        +
                      • +
                      • +

                        Privacy and fairness metrics: The goal of privacy and fairness +tests is to distinguish whether your model might be biased against +specific classes. Helpful resources are Google’s Fairness +Indicators +and the Fairness Definitions Explained paper by Verma and +Rubin (2018).

                        +
                      • +
                      • +

                        Simulation metrics: The goal of simulation tests is to +understand how the model performance could affect the rest of the +system. These are useful when your model affects the real world +(for systems such as autonomous vehicles, robotics, recommendation +systems, etc.). Simulation tests are hard to do well because they +require a model of how the world works and a dataset of different +scenarios.

                        +
                      • +
                      +

                      Instead of simply evaluating the aforementioned metrics on your entire +dataset in aggregate, you should also evaluate these metrics on +multiple slices of data. A slice is a mapping of your data to a +specific category. A natural question that arises is how to pick those +slices. Tools like +What-If and +SliceFinder help surface the +slices where the model performance might be of particular interest.

                      +

                      Finally, evaluation tests help you maintain evaluation datasets for +all of the distinct data distributions you need to measure. Your main +validation or test set should mirror your production distribution as +closely as possible as a matter of principle. When should you add new +evaluation datasets?

                      +
                        +
                      • +

                        When you collect datasets to specify specific edge cases.

                        +
                      • +
                      • +

                        When you run your production model on multiple data modalities.

                        +
                      • +
                      • +

                        When you augment your training set with data not found in production +(synthetic data).

                        +
                      • +
                      +

                      +

                      The report produced by the evaluation system entails the metrics broken +down against each of the data slices. How can you decide whether the +evaluation passes or fails?

                      +

                      At a high level, you want to compare the new model to the previous +model and another fixed older model. Tactically, you can (1) set +thresholds on the differences between the new and the old models for +most metrics, (2) set thresholds on the differences between data slices, +and (3) set thresholds against the fixed older model to prevent slower +performance “leaks.”

                      +

                      Shadow Tests

                      +

                      +

                      Shadow tests are integration tests between your prediction system and +your serving system. They help you catch production bugs before those +bugs meet users. In many settings, models (which are built in frameworks +such as sklearn, Pytorch, TensorFlow, etc.) are developed in isolation +from the existing software system. For example, a model to flag +inappropriate tweets may be developed in TensorFlow on a static set of +data, not directly in the streaming environment of the broader software +architecture. Because the prediction system and the serving system are +developed in different settings with different assumptions and +environments, there are many opportunities for bugs to creep in. These +bugs can be tricky to catch prior to integration, so shadow tests can +help identify them beforehand.

                      +

                      Firstly, shadow tests help you detect bugs in the production +deployment. In the code path you're using to build the production +model, maybe there's some bug there. You want to make sure that you +catch that before users see that bug.

                      +

                      Secondly, shadow tests also help you detect inconsistencies between +the offline model and the online model. There’s a translation step in +the training pipeline in many companies - going from the offline trained +model to the online production model (the model itself, the +preprocessing pipeline, etc.). A common bug source in production ML +systems happens because of the inconsistencies cropping up in that +translation step. A good health check ensures that your actual +production model is producing the exact predictions on a fixed set of +data as the model you have running on your laptop.

                      +

                      Thirdly, shadow tests help you detect issues that don't appear on the +data you have offline but appear on production data.

                      +

                      How do we design shadow tests? These can require a significant amount of +infrastructure, as they are dependent on actual model integration +opportunities being available.

                      +
                        +
                      • +

                        Typical shadow tests involve testing the performance of a +candidate model on real data without returning or acting on the +output. For example, a company may integrate and run a new model +alongside the previous model without returning the output to the +user.

                        +
                      • +
                      • +

                        Analyzing the consistency of the predictions between the two +models can help spot important differences before they impact +production performance.

                        +
                      • +
                      • +

                        Another option is to gather production data, save it offline, and +test the model’s performance on the fresh data offline.

                        +
                      • +
                      +

                      Overall, evaluating the distribution of model predictions in offline vs. +online settings, candidate vs. production, or any similar setting of a +model update before deploying a new model can help you avoid bugs.

                      +

                      A/B Tests

                      +

                      +

                      Shadow tests evaluate the prediction performance of a model as part of +the broader software architecture, but not the impact on users. +A/B tests fill this role. A/B tests are a common practice in +software engineering, especially in web systems. A/B testing is defined +as “a randomized experimentation process wherein two or more versions of +a variable (web page, page element, etc.) are shown to different +segments of website visitors at the same time to determine which version +leaves the maximum impact and drive business metrics.”[1]

                      +

                      +

                      In model evaluation, A/B tests determine the impact of different model +predictions on user and business metrics. One common way of A/B testing +models is to “canary” data or return predictions on a small portion +of the data (i.e., 1% or 10%) to the relevant users. The remaining data +acts as a control and functions under existing system behavior (i.e., an +old model or even no model). Evaluating the difference in metrics +between the two groups can determine the relative impact of your model. +This simple baseline can work well. Adding more statistically +principled splits, which is common in A/B testing, can be a good idea.

                      +

                      Labeling Tests

                      +

                      +

                      Machine learning models operate in a GIGO paradigm: garbage in, garbage +out. To prevent poor quality labels from cropping up and corrupting +the model, you need to unit test the labeling systems and procedures.

                      +
                        +
                      • +

                        You should start by training, certifying, and evaluating individual +labelers, who each play a crucial role in the quality of the +labels.

                        +
                      • +
                      • +

                        A simple and common label quality test is to spot check labels as +they come in by opening up 100 or 1000 labels from a batch and +evaluating them yourself to understand their quality. Using a +performant model’s guidance, you can make this process more +efficient and only look at labels where the model and the labeler +disagree.

                        +
                      • +
                      • +

                        Another test can be to aggregate labels of multiple labels and +measure agreement across labels. The higher the agreement, the +better quality the labels are.

                        +
                      • +
                      • +

                        Using metrics of agreement, you can assign “trust scores” to +labelers based on their performance relative to other labelers and +weigh the labels accordingly.

                        +
                      • +
                      +

                      Expectation Tests

                      +

                      +

                      Expectation tests address the data preprocessing and storage system. +Essentially, they are unit tests for your data. They are designed to +catch data quality issues and bad data before they make their way into +the pipeline.

                      +

                      The typical way that expectation tests operate is rule- or +threshold-based. At each step of the data processing pipeline, the +output should conform to a specific format that matches a rule or +specific format. If the rule or threshold does not pass, then that stage +of the expectation test and the data pipeline’s related step fails. Such +tests are frequently run with batch data pipeline jobs. Great +Expectations is an open-source +library gaining popularity for running expectation tests. The library +allows you to set hard rules for the kinds of values or behaviors (i.e., +statistics) you expect from your data.

                      +

                      How do you set the rules and thresholds for expectation tests? Most +expectation tests are set manually. A more sophisticated option is to +profile a high-quality sample of your data and set thresholds +accordingly. In practice, to avoid false alarms from overly sensitive +tests, a combination of both approaches is needed.

                      +

                      Challenges and Recommendations Operationalizing ML Tests

                      +

                      Running tests is an excellent idea in theory but can pose many practical +challenges for data science and ML teams.

                      +
                        +
                      • +

                        The first challenge is often organizational. In contrast to +software engineering teams for whom testing is table stakes, data +science teams often struggle to implement testing and code review +norms.

                        +
                      • +
                      • +

                        The second challenge is infrastructural. Most CI/CD platforms +don’t support GPUs, data integrations, or other required elements +of testing ML systems effectively or efficiently.

                        +
                      • +
                      • +

                        The third challenge is tooling, which has not yet been +standardized for operations like comparing model performance and +slicing datasets.

                        +
                      • +
                      • +

                        Finally, decision-making for ML test performance is hard. What +is “good enough” test performance is often highly contextual, +which is a challenge that varies across ML systems and teams.

                        +
                      • +
                      +

                      Let’s boil all these lessons for testing down into a clear set of +recommendations specific to ML systems:

                      +
                        +
                      1. +

                        Test each part of the ML system, not just the model. You build the +machine that builds the model, not just the model!

                        +
                      2. +
                      3. +

                        Test code, data, and model performance, not just code.

                        +
                      4. +
                      5. +

                        Testing model performance is an art, not a science. There is a +considerable amount of intuition that guides testing ML systems.

                        +
                      6. +
                      7. +

                        Thus, the fundamental goal of testing model performance is to +build a granular understanding of how well your model performs +and where you don’t expect it to perform well. Using this +intuition derived from testing, you can make better decisions +about productionizing your model effectively.

                        +
                      8. +
                      9. +

                        Build up to this gradually! You don’t need to do everything detailed +in this lecture, and certainly not all at once. Start with:

                        +
                          +
                        1. +

                          Infrastructure tests

                          +
                        2. +
                        3. +

                          Evaluation tests

                          +
                        4. +
                        5. +

                          Expectation tests

                          +
                        6. +
                        +
                      10. +
                      +

                      4 - Explainable and Interpretable AI

                      +

                      Definitions

                      +

                      What do explainable and interpretable AI, buzzwords you’ve undoubtedly +heard before, actually mean? Let’s start by outlining some more +fundamental terms about the problem space:

                      +
                        +
                      • +

                        Domain predictability: the degree to which it is possible to +detect data outside the model’s domain of competence.

                        +
                      • +
                      • +

                        Interpretability: the degree to which a human can consistently +predict the model’s result (Kim et al., +2016).

                        +
                      • +
                      • +

                        Explainability: the degree to which a human can understand the +cause of a decision (Miller, +2017).

                        +
                      • +
                      +

                      We’ll walk through four different methods of making models interpretable +and explainable:

                      +
                        +
                      1. +

                        Use an interpretable family of models.

                        +
                      2. +
                      3. +

                        Distill the complex model to an interpretable one.

                        +
                      4. +
                      5. +

                        Understand the contribution of features to the prediction.

                        +
                      6. +
                      7. +

                        Understand the contribution of training data points to the +prediction.

                        +
                      8. +
                      +

                      Use An Interpretable Family of Models

                      +

                      Examples of interpretable families of models are simple, familiar +models like linear regression, logistic regression, generalized linear +models, and decision trees. If you understand the math of these +models, it’s pretty easy to understand why a model made the decision it +did. Because of the reasonably elementary math, these models are +interpretable and explainable. However, they are not very powerful.

                      +

                      Another class of models that are interpretable is attention models. +Examining where a model is “looking” helps us anticipate a model’s +prediction, thus making them interpretable. However, attention maps are +not particularly explainable. They do not produce complete +explanations for a model’s output, just a directional explanation. +Furthermore, attention maps are not reliable explanations. Attention +maps tell us only where a model is looking, not why it is looking there. +Frequently, models focus exclusively on an image’s salient region +without an underlying reasoning that relates to the task at hand. In the +sample below, the attention model is “looking” at the salient region for +classification, which has a very different meaning in each context.

                      +

                      +

                      The conflation of attention with explanation is a critical pitfall to +avoid.

                      +

                      Distill A Complex To An Interpretable One

                      +

                      Instead of restricting models to only interpretable families, we can fit +a more complex model and interpret its decision using another model from +an interpretable family. The trick is to train this additional model, +referred to as a surrogate model, on the raw data and the complex +model’s predictions. The surrogate model’s corresponding interpretation +can be used as a proxy for understanding the complex model.

                      +

                      This technique is quite simple and fairly general to apply. In practice, +however, two concerns manifest.

                      +
                        +
                      1. +

                        If the surrogate itself performs well on the predictions, why not +try to directly apply it rather than the more complex model?

                        +
                      2. +
                      3. +

                        If it doesn’t perform well, how do we know that it genuinely +represents the complex model’s behavior?

                        +
                      4. +
                      +

                      +

                      Another category of surrogate models is local surrogate +models +(LIME). Rather than apply the surrogate model in a global context on +all the data, LIME models focus on a single point to generate an +explanation for. A perturbation is applied to the point, resulting in a +local neighborhood of perturbed data points. On top of these perturbed +data points, a surrogate model is trained to map the points to the +original predictions from the complex model. If the surrogate model +classifies similarly to the complex model, the surrogate can be +considered a proxy for interpretation purposes. This method is used +widely, as it works for all data types (including images and text). +However, defining the right perturbations and ensuring the stability of +the explanations is challenging.

                      +

                      Understand The Contribution of Features To The Prediction

                      +

                      Better understanding each feature’s role in making a prediction is +another option for interpretable and explainable ML. Data +visualization is one such option, with plots like partial +dependence +plots +and individual conditional +expectation +plots.

                      +

                      +

                      A numerical method is permutation feature importance, which selects +a feature, randomizes its order in the dataset, and sees how that +affects performance. While this method is very easy and widely used, it +doesn’t work for high-dimensional data or cases where there is feature +interdependence.

                      +

                      A more principled approach to explaining the contribution of individual +features is SHAP (Shapley Additive +Explanations). +At a high level, SHAP scores test how much changes in a single feature +impact the output of a classifier when controlling for the values of the +other features. This is a reliable method to apply, as it works on a +variety of data and is mathematically principled. However, it can be +tricky to implement and doesn’t provide explanations.

                      +

                      Gradient-based saliency maps are a popular method for explanations +and interpretations. This intuitive method selects an input, performs a +forward pass, computes the gradient with respect to the pixels, and +visualizes the gradients. Essentially, how much does a unit change in +the value of the input’s pixels affect the prediction of the model? This +is a straightforward and common method. Similar to the challenge with +attention, the explanations may not be correct, and the overall method +is fragile and sensitive to small changes.

                      +

                      Understand The Contribution of Training Data Points To The Prediction

                      +

                      +

                      Instead of focusing on features and their explicit relevance to the +prediction, we can also take a hard look at the training data points +themselves.

                      +
                        +
                      • +

                        Prototypes and criticisms are one such approach, though it is +less applicable to deep learning. In this method, prototypes are +clusters of data that explain much of the variance in the model. +Criticisms are data points not explained by the prototypes.

                        +
                      • +
                      • +

                        Another approach is to look specifically at “influential +instances” or data points that cause major changes in the +model’s predictions when removed from the data set.

                        +
                      • +
                      +

                      Do You Need "Explainability"?

                      +

                      A good question to ask yourself whether or not “explainable AI” is a +real need for your applications. There are a couple of cases where this +question can be useful:

                      +
                        +
                      1. +

                        Regulators demand it. In this case, there’s not much you can do +besides produce some kind of explainable model. However, it can be +helpful to ask for clarification on what explainability is +judged as.

                        +
                      2. +
                      3. +

                        Users demand it. In some cases, users themselves may want trust +or explainability in the system. Investigate the necessity for the +explainability and trust to come directly from the model itself. +Can good product design inspire trust more effectively? For +example, allowing doctors to simply override models can reduce the +immediate need for explainability. A big associated concern is how +often users interact with the model. Infrequent interactions +likely require explainable AI, as humans do not get a chance to +build their feel for the system. More frequent interactions allow +for the simpler objective of interpretability.

                        +
                      4. +
                      5. +

                        Deployment demands it. Sometimes, ML stakeholders may demand +explainability as a component of ensuring confidence in ML system +deployment. In this context, explainability is the wrong +objective; domain predictability is the real aim. Rather than +full-on explainability, interpretability can be helpful for +deployment, especially visualizations for debugging.

                        +
                      6. +
                      +

                      At present, true explainability for deep learning models is not +possible.

                      +
                        +
                      • +

                        Current explanation methods are not faithful to the original model +performance; it can be easy to cherry-pick specific examples that +can overstate explainability.

                        +
                      • +
                      • +

                        Furthermore, these methods tend to be unreliable and highly +sensitive to the input.

                        +
                      • +
                      • +

                        Finally, as described in the attention section, the full explanation +is often not available to modern explainability methods.

                        +
                      • +
                      +

                      Because of these reasons, explainability is not practically feasible for +deep learning models (as of 2021). Read Cynthia Rudin’s 2019 +paper for more detail.

                      +

                      Caveats For Explainable and Interpretable AI

                      +
                        +
                      • +

                        If you genuinely need to explain your model’s predictions, use an +interpretable model family (read more +here).

                        +
                      • +
                      • +

                        Don’t try to force-fit deep learning explainability methods; they +produce cool results but are not reliable enough for production +use cases.

                        +
                      • +
                      • +

                        Specific interpretability methods like LIME and SHAP are +instrumental in helping users reach interpretability thresholds +faster.

                        +
                      • +
                      • +

                        Finally, the visualization for interpretability can be pretty +useful for debugging.

                        +
                      • +
                      +

                      5 - Resources

                      + +

                      [1] https://vwo.com/ab-testing-2/

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-11-notes-media/image1.png b/spring2021/lecture-11-notes-media/image1.png new file mode 100644 index 00000000..8f876850 Binary files /dev/null and b/spring2021/lecture-11-notes-media/image1.png differ diff --git a/spring2021/lecture-11-notes-media/image10.png b/spring2021/lecture-11-notes-media/image10.png new file mode 100644 index 00000000..79f9149d Binary files /dev/null and b/spring2021/lecture-11-notes-media/image10.png differ diff --git a/spring2021/lecture-11-notes-media/image11.png b/spring2021/lecture-11-notes-media/image11.png new file mode 100644 index 00000000..5ef8987f Binary files /dev/null and b/spring2021/lecture-11-notes-media/image11.png differ diff --git a/spring2021/lecture-11-notes-media/image12.png b/spring2021/lecture-11-notes-media/image12.png new file mode 100644 index 00000000..0750ece7 Binary files /dev/null and b/spring2021/lecture-11-notes-media/image12.png differ diff --git a/spring2021/lecture-11-notes-media/image13.png b/spring2021/lecture-11-notes-media/image13.png new file mode 100644 index 00000000..290a117a Binary files /dev/null and b/spring2021/lecture-11-notes-media/image13.png differ diff --git a/spring2021/lecture-11-notes-media/image14.png b/spring2021/lecture-11-notes-media/image14.png new file mode 100644 index 00000000..cca3247d Binary files /dev/null and b/spring2021/lecture-11-notes-media/image14.png differ diff --git a/spring2021/lecture-11-notes-media/image15.png b/spring2021/lecture-11-notes-media/image15.png new file mode 100644 index 00000000..afac1296 Binary files /dev/null and b/spring2021/lecture-11-notes-media/image15.png differ diff --git a/spring2021/lecture-11-notes-media/image16.png b/spring2021/lecture-11-notes-media/image16.png new file mode 100644 index 00000000..8e9ce46f Binary files /dev/null and b/spring2021/lecture-11-notes-media/image16.png differ diff --git a/spring2021/lecture-11-notes-media/image17.png b/spring2021/lecture-11-notes-media/image17.png new file mode 100644 index 00000000..0fdc836e Binary files /dev/null and b/spring2021/lecture-11-notes-media/image17.png differ diff --git a/spring2021/lecture-11-notes-media/image18.png b/spring2021/lecture-11-notes-media/image18.png new file mode 100644 index 00000000..6119c6f1 Binary files /dev/null and b/spring2021/lecture-11-notes-media/image18.png differ diff --git a/spring2021/lecture-11-notes-media/image19.png b/spring2021/lecture-11-notes-media/image19.png new file mode 100644 index 00000000..3044f4a1 Binary files /dev/null and b/spring2021/lecture-11-notes-media/image19.png differ diff --git a/spring2021/lecture-11-notes-media/image2.png b/spring2021/lecture-11-notes-media/image2.png new file mode 100644 index 00000000..9827066c Binary files /dev/null and b/spring2021/lecture-11-notes-media/image2.png differ diff --git a/spring2021/lecture-11-notes-media/image20.png b/spring2021/lecture-11-notes-media/image20.png new file mode 100644 index 00000000..30f04abd Binary files /dev/null and b/spring2021/lecture-11-notes-media/image20.png differ diff --git a/spring2021/lecture-11-notes-media/image21.png b/spring2021/lecture-11-notes-media/image21.png new file mode 100644 index 00000000..f8c69f18 Binary files /dev/null and b/spring2021/lecture-11-notes-media/image21.png differ diff --git a/spring2021/lecture-11-notes-media/image22.png b/spring2021/lecture-11-notes-media/image22.png new file mode 100644 index 00000000..9e572513 Binary files /dev/null and b/spring2021/lecture-11-notes-media/image22.png differ diff --git a/spring2021/lecture-11-notes-media/image3.png b/spring2021/lecture-11-notes-media/image3.png new file mode 100644 index 00000000..90586f19 Binary files /dev/null and b/spring2021/lecture-11-notes-media/image3.png differ diff --git a/spring2021/lecture-11-notes-media/image4.png b/spring2021/lecture-11-notes-media/image4.png new file mode 100644 index 00000000..62578ca3 Binary files /dev/null and b/spring2021/lecture-11-notes-media/image4.png differ diff --git a/spring2021/lecture-11-notes-media/image5.png b/spring2021/lecture-11-notes-media/image5.png new file mode 100644 index 00000000..9a4debfe Binary files /dev/null and b/spring2021/lecture-11-notes-media/image5.png differ diff --git a/spring2021/lecture-11-notes-media/image6.png b/spring2021/lecture-11-notes-media/image6.png new file mode 100644 index 00000000..e7c0bfdc Binary files /dev/null and b/spring2021/lecture-11-notes-media/image6.png differ diff --git a/spring2021/lecture-11-notes-media/image7.png b/spring2021/lecture-11-notes-media/image7.png new file mode 100644 index 00000000..65c535dd Binary files /dev/null and b/spring2021/lecture-11-notes-media/image7.png differ diff --git a/spring2021/lecture-11-notes-media/image8.png b/spring2021/lecture-11-notes-media/image8.png new file mode 100644 index 00000000..c03fc728 Binary files /dev/null and b/spring2021/lecture-11-notes-media/image8.png differ diff --git a/spring2021/lecture-11-notes-media/image9.png b/spring2021/lecture-11-notes-media/image9.png new file mode 100644 index 00000000..27257a8f Binary files /dev/null and b/spring2021/lecture-11-notes-media/image9.png differ diff --git a/spring2021/lecture-11/index.html b/spring2021/lecture-11/index.html new file mode 100644 index 00000000..1cf7e9af --- /dev/null +++ b/spring2021/lecture-11/index.html @@ -0,0 +1,4074 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 11: Deployment & Monitoring - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lecture 11: Deployment & Monitoring

                      +

                      Video

                      +

                      Deployment:

                      + + +

                      Monitoring:

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Download notes as PDF

                      +

                      Lecture by Josh Tobin. +Notes transcribed by James Le +and Vishnu Rachakonda.

                      +

                      ML in production scales to meet users’ demands by delivering thousands +to millions of predictions per second. On the other hand, models in +notebooks only work if you run the cells in the right order. To be +frank, most data scientists and ML engineers do not know how to build +production ML systems. Therefore, the goal of this lecture is to give +you different flavors of accomplishing that task.

                      +

                      I - Model Deployment

                      +

                      1 - Types of Deployment

                      +

                      One way to conceptualize different approaches to deploy ML models is to +think about where to deploy them in your application’s overall +architecture.

                      +
                        +
                      • +

                        The client-side runs locally on the user machine (web browser, +mobile devices, etc..)

                        +
                      • +
                      • +

                        It connects to the server-side that runs your code remotely.

                        +
                      • +
                      • +

                        The server connects with a database to pull data out, render the +data, and show the data to the user.

                        +
                      • +
                      +
                      Batch Prediction
                      +

                      +

                      Batch prediction means that you train the models offline, dump the +results into a database, then run the rest of the application normally. +You periodically run your model on new data coming in and cache the +results in a database. Batch prediction is commonly used in production +when the universe of inputs is relatively small (e.g., one prediction +per user per day).

                      +

                      The pros of batch prediction:

                      +
                        +
                      • +

                        It is simple to implement.

                        +
                      • +
                      • +

                        It requires relatively low latency to the user.

                        +
                      • +
                      +

                      The cons of batch prediction:

                      +
                        +
                      • +

                        It does not scale to complex input types.

                        +
                      • +
                      • +

                        Users do not get the most up-to-date predictions.

                        +
                      • +
                      • +

                        Models frequently become “stale” and hard to detect.

                        +
                      • +
                      +
                      Model-In-Service
                      +

                      +

                      Model-in-service means that you package up your model and include it in +the deployed web server. Then, the web server loads the model and calls +it to make predictions.

                      +

                      The pros of model-in-service prediction:

                      +
                        +
                      • It reuses your existing infrastructure.
                      • +
                      +

                      The cons of model-in-service prediction:

                      +
                        +
                      • +

                        The web server may be written in a different language.

                        +
                      • +
                      • +

                        Models may change more frequently than the server code.

                        +
                      • +
                      • +

                        Large models can eat into the resources for your webserver.

                        +
                      • +
                      • +

                        Server hardware is not optimized for your model (e.g., no GPUs).

                        +
                      • +
                      • +

                        Model and server may scale differently.

                        +
                      • +
                      +
                      Model-As-Service
                      +

                      +

                      Model-as-service means that you deploy the model separately as its own +service. The client and server can interact with the model by making +requests to the model service and receiving responses.

                      +

                      The pros of model-as-service prediction:

                      +
                        +
                      • +

                        It is dependable, as model bugs are less likely to crash the web +app.

                        +
                      • +
                      • +

                        It is scalable, as you can choose the optimal hardware for the model +and scale it appropriately.

                        +
                      • +
                      • +

                        It is flexible, as you can easily reuse the model across multiple +applications.

                        +
                      • +
                      +

                      The cons of model-as-service prediction:

                      +
                        +
                      • +

                        It adds latency.

                        +
                      • +
                      • +

                        It adds infrastructural complexity.

                        +
                      • +
                      • +

                        Most importantly, you are now on the hook to run a model service...

                        +
                      • +
                      +

                      2 - Building A Model Service

                      +
                      REST APIs
                      +

                      REST APIs +represent a way of serving predictions in response to canonically +formatted HTTP requests. There are alternatives such as +gRPC and +GraphQL. For instance, in your command +line, you can use curl to post some data to an URL and get back JSON +that contains the model predictions.

                      +

                      Sadly, there is no standard way of formatting the data that goes into an +ML model.

                      +
                      Dependency Management
                      +

                      Model predictions depend on the code, the model weights, and the +code dependencies. All three need to be present on your webserver. +For code and model weights, you can simply copy them locally (or write a +script to extract them if they are large). But dependencies are trickier +because they cause troubles. As they are hard to make consistent and +update, your model behavior might change accordingly.

                      +

                      There are two high-level strategies to manage code dependencies:

                      +
                        +
                      1. +

                        You constrain the dependencies of your model.

                        +
                      2. +
                      3. +

                        You use containers.

                        +
                      4. +
                      +
                      ONNX
                      +

                      +

                      If you go with the first strategy, you need a standard neural network +format. The Open Neural Network Exchange +(ONNX, for short) is designed to allow framework interoperability. The +dream is to mix different frameworks, such that frameworks that are good +for development (PyTorch) don’t also have to be good at inference +(Caffe2).

                      +
                        +
                      • +

                        The promise is that you can train a model with one tool stack and +then deploy it using another for inference/prediction. ONNX is a +robust and open standard for preventing framework lock-in and +ensuring that your models will be usable in the long run.

                        +
                      • +
                      • +

                        The reality is that since ML libraries change quickly, there are +often bugs in the translation layer. Furthermore, how do you deal +with non-library code (like feature transformations)?

                        +
                      • +
                      +
                      Docker
                      +

                      If you go with the second strategy, you want to learn Docker. +Docker is a computer program that +performs operating-system-level virtualization, also known as +containerization. What is a container, you might ask? It is a +standardized unit of fully packaged software used for local development, +shipping code, and deploying system.

                      +

                      The best way to describe it intuitively is to think of a process +surrounded by its filesystem. You run one or a few related processes, +and they see a whole filesystem, not shared by anyone.

                      +
                        +
                      • +

                        This makes containers extremely portable, as they are detached +from the underlying hardware and the platform that runs them.

                        +
                      • +
                      • +

                        They are very lightweight, as a minimal amount of data needs to +be included.

                        +
                      • +
                      • +

                        They are secure, as the exposed attack surface of a container is +extremely small.

                        +
                      • +
                      +

                      Note here that containers are different from virtual machines.

                      +
                        +
                      • +

                        Virtual machines require the hypervisor to virtualize a full +hardware stack. There are also multiple guest operating systems, +making them larger and more extended to boot. This is what AWS / +GCP / Azure cloud instances are.

                        +
                      • +
                      • +

                        Containers, on the other hand, require no hypervisor/hardware +virtualization. All containers share the same host kernel. There +are dedicated isolated user-space environments, making them much +smaller in size and faster to boot.

                        +
                      • +
                      +

                      +

                      In brief, you should familiarize yourself with these basic concepts:

                      +
                        +
                      1. +

                        Dockerfile defines how to build an image.

                        +
                      2. +
                      3. +

                        Image is a built packaged environment.

                        +
                      4. +
                      5. +

                        Container is where images are run inside.

                        +
                      6. +
                      7. +

                        Repository hosts different versions of an image.

                        +
                      8. +
                      9. +

                        Registry is a set of repositories.

                        +
                      10. +
                      +

                      Furthermore, Docker has a robust ecosystem. It has the +DockerHub for community-contributed +images. It’s incredibly easy to search for images that meet your needs, +ready to pull down and use with little-to-no modification.

                      +

                      Though Docker presents how to deal with each of the individual +microservices, we also need an orchestrator to handle the whole +cluster of services. Such an orchestrator distributes containers onto +the underlying virtual machines or bare metal so that these containers +talk to each other and coordinate to solve the task at hand. The +standard container orchestration tool is +Kubernetes.

                      +
                      Performance Optimization
                      +

                      We will talk mostly about how to run your model service faster on a +single machine. Here are the key questions that you want to address:

                      +
                        +
                      • +

                        Do you want inference on a GPU or not?

                        +
                      • +
                      • +

                        How can you run multiple copies of the model at the same time?

                        +
                      • +
                      • +

                        How to make the model smaller?

                        +
                      • +
                      • +

                        How to improve model performance via caching, batching, and GPU +sharing?

                        +
                      • +
                      +
                      GPU or no GPU?
                      +

                      Here are the pros of GPU inference:

                      +
                        +
                      • +

                        You use the same hardware that your model is trained on probably.

                        +
                      • +
                      • +

                        If your model gets bigger and you want to limit model size or tune +batch size, you will get high throughput.

                        +
                      • +
                      +

                      Here are the cons of GPU inference:

                      +
                        +
                      • +

                        GPU is complex to set up.

                        +
                      • +
                      • +

                        GPUs are expensive.

                        +
                      • +
                      +
                      Concurrency
                      +

                      Instead of running a single model copy on your machine, you run multiple +model copies on different CPUs or cores. In practice, you need to be +careful about thread tuning - making sure that each model copy only +uses the minimum number of threads required. Read this blog post +from +Roblox +for the details.

                      +
                      Model distillation
                      +

                      +

                      Model distillation is a compression technique in which a small “student” +model is trained to reproduce the behavior of a large “teacher” model. +The method was first proposed by Bucila et al., +2006 and +generalized by Hinton et al., +2015. In distillation, +knowledge is transferred from the teacher model to the student by +minimizing a loss function. The target is the distribution of class +probabilities predicted by the teacher model. That is — the output of a +softmax function on the teacher model’s logits.

                      +

                      Distillation can be finicky to do yourself, so it is infrequently used +in practice. Read this blog post from Derrick +Mwiti +for several model distillation techniques for deep learning.

                      +
                      Model quantization
                      +

                      +

                      Model quantization is a model compression technique that makes the model +physically smaller to save disk space and require less memory during +computation to run faster. It decreases the numerical precision of a +model’s weights. In other words, each weight is permanently encoded +using fewer bits. Note here that there are tradeoffs with accuracy.

                      +
                        +
                      • +

                        A straightforward method is implemented in the TensorFlow Lite +toolkit. +It turns a matrix of 32-bit floats into 8-bit integers by applying +a simple “center-and-scale” transform to it: W_8 = W_32 / +scale + shift (scale and shift are determined individually for +each weight matrix). This way, the 8-bit W is used in matrix +multiplication, and only the result is then corrected by applying +the “center-and-scale” operation in reverse.

                        +
                      • +
                      • +

                        PyTorch also has quantization +built-in +that includes three techniques: dynamic quantization, +post-training static quantization, and quantization-aware +training.

                        +
                      • +
                      +
                      Caching
                      +

                      For many ML models, the input distribution is non-uniform (some are more +common than others). Caching takes advantage of that. Instead of +constantly calling the model on every input no matter what, we first +cache the model’s frequently-used inputs. Before calling the model, +we check the cache and only call it on the frequently-used inputs.

                      +

                      Caching techniques can get very fancy, but the most basic way to get +started is using Python’s +functools.

                      +

                      +
                      Batching
                      +

                      Typically, ML models achieve higher throughput when making predictions +in parallel (especially true for GPU inference). At a high level, here’s +how batching works:

                      +
                        +
                      • +

                        You gather predictions that are coming in until you have a batch for +your system. Then, you run the model on that batch and return +predictions to those users who request them.

                        +
                      • +
                      • +

                        You need to tune the batch size and address the tradeoff between +throughput and latency.

                        +
                      • +
                      • +

                        You need to have a way to shortcut the process if latency becomes +too long.

                        +
                      • +
                      • +

                        The last caveat is that you probably do not want to implement +batching yourself.

                        +
                      • +
                      +
                      Sharing The GPU
                      +

                      Your model may not take up all of the GPU memory with your inference +batch size. Why not run multiple models on the same GPU? You +probably want to use a model serving solution that supports this out of +the box.

                      +
                      Model Serving Libraries
                      +

                      There are canonical open-source model serving libraries for both PyTorch +(TorchServe) and TensorFlow +(TensorFlow +Serving). Ray +Serve is another +promising choice. Even NVIDIA has joined the game with Triton +Inference +Server.

                      +

                      +
                      Horizontal Scaling
                      +

                      If you have too much traffic for a single machine, let’s split traffic +among multiple machines. At a high level, you duplicate your +prediction service, use a load balancer to split traffic, and send the +traffic to the appropriate copy of your service. In practice, there are +two common methods:

                      +
                        +
                      1. +

                        Use a container orchestration toolkit like Kubernetes.

                        +
                      2. +
                      3. +

                        Use a serverless option like AWS Lambda.

                        +
                      4. +
                      +
                      Container Orchestration
                      +

                      +

                      In this paradigm, your Docker containers are coordinated by Kubernetes. +K8s provides a single service for you to send requests to. Then it +divides up traffic that gets sent to that service to virtual copies of +your containers (that are running on your infrastructure).

                      +

                      You can build a system like this yourself on top of K8s if you want to. +But there are emerging frameworks that can handle all such +infrastructure out of the box if you have a K8s cluster running. +KFServing +is a part of the Kubeflow package, a +popular K8s-native ML infrastructure solution. +Seldon provides a model serving stack on top +of K8s.

                      +
                      Deploying Code As Serverless Functions
                      +

                      The idea here is that the app code and dependencies are packaged into +.zip files (or Docker containers) with a single entry point function. +All the major cloud providers such as AWS Lambda, Google Cloud +Functions, or Azure Functions will manage everything else: instant +scaling to 10,000+ requests per second, load balancing, etc.

                      +

                      +

                      The good thing is that you only pay for compute-time. Furthermore, +this approach lowers your DevOps load, as you do not own any servers.

                      +

                      The tradeoff is that you have to work with severe constraints:

                      +
                        +
                      1. +

                        Your entire deployment package is quite limited.

                        +
                      2. +
                      3. +

                        You can only do CPU execution.

                        +
                      4. +
                      5. +

                        It can be challenging to build model pipelines.

                        +
                      6. +
                      7. +

                        There are limited state management and deployment tooling.

                        +
                      8. +
                      +
                      Model Deployment
                      +

                      If serving is how you turn a model into something that can respond to +requests, deployment is how you roll out, manage, and update these +services. You probably want to be able to roll out gradually, roll +back instantly, and deploy pipelines of models. Many challenging +infrastructure considerations go into this, but hopefully, your +deployment library will take care of this for you.

                      +
                      Managed Options
                      +

                      +

                      If you do not want to deal with any of the things mentioned thus far, +there are managed options in the market. All major cloud providers have +ones that enable you to package your model in a predefined way and turn +it into an API. Startups like +Algorithmia and +Cortex are some alternatives. The big +drawback is that pricing tends to be high, so you pay a premium fee in +exchange for convenience.

                      +
                      Takeaways
                      +
                        +
                      • +

                        If you are making CPU inference, you can get away with scaling by +launching more servers or going serverless.

                        +
                      • +
                      • +

                        Serverless makes sense if you can get away with CPUs, and traffic is +spiky or low-volume.

                        +
                      • +
                      • +

                        If you are using GPU inference, serving tools will save you time.

                        +
                      • +
                      • +

                        It’s worth keeping an eye on startups in this space for GPU +inference.

                        +
                      • +
                      +

                      3 - Edge Deployment

                      +

                      +

                      Edge prediction means that you first send the model weights to the +client edge device. Then, the client loads the model and interacts with +it directly.

                      +

                      The pros of edge prediction:

                      +
                        +
                      • +

                        It has low latency.

                        +
                      • +
                      • +

                        It does not require an Internet connection.

                        +
                      • +
                      • +

                        It satisfies data security requirements, as data does not need to +leave the user’s device.

                        +
                      • +
                      +

                      The cons of edge prediction:

                      +
                        +
                      • +

                        The client often has limited hardware resources available.

                        +
                      • +
                      • +

                        Embedded and mobile frameworks are less full-featured than +TensorFlow and PyTorch.

                        +
                      • +
                      • +

                        It is challenging to update models.

                        +
                      • +
                      • +

                        It is difficult to monitor and debug when things go wrong.

                        +
                      • +
                      +
                      Tools For Edge Deployment
                      +

                      TensorRT is NVIDIA’s +framework meant to help you optimize models for inference on NVIDIA +devices in data centers and embedded/automotive environments. TensorRT +is also integrated with application-specific SDKs to provide developers +a unified path to deploy conversational AI, recommender, video +conference, and streaming apps in production.

                      +

                      +

                      ApacheTVM is an open-source machine +learning compiler framework for CPUs, GPUs, and ML accelerators. It aims +to enable ML engineers to optimize and run computations efficiently on +any hardware backend. In particular, it compiles ML models into minimum +deployable modules and provides the infrastructure to automatically +optimize models on more backends with better performance.

                      +

                      Tensorflow Lite provides a +trained TensorFlow model framework to be compressed and deployed to a +mobile or embedded application. TensorFlow’s computationally expensive +training process can still be performed in the environment that best +suits it (personal server, cloud, overclocked computer). TensorFlow Lite +then takes the resulting model (frozen graph, SavedModel, or HDF5 model) +as input, packages, deploys, and then interprets it in the client +application, handling the resource-conserving optimizations along the +way.

                      +

                      +

                      PyTorch Mobile is a framework +for helping mobile developers and machine learning engineers embed +PyTorch models on-device. Currently, it allows any TorchScript +model +to run directly inside iOS and Android applications. PyTorch Mobile’s +initial release supports many different quantization techniques, which +shrink model sizes without significantly affecting performance. PyTorch +Mobile also allows developers to directly convert a PyTorch model to a +mobile-ready format without needing to work through other +tools/frameworks.

                      +

                      JavaScript is a portable way of running code on different devices. +Tensorflow.js enables you to run +TensorFlow code in JavaScript. You can use off-the-shelf JavaScript +models or convert Python TensorFlow models to run in the browser or +under Node.js, retrain pre-existing ML models using your data, and +build/train models directly in JavaScript using flexible and intuitive +APIs.

                      +

                      Core ML was +released by Apple back in 2017. It is optimized for on-device +performance, which minimizes a model’s memory footprint and power +consumption. Running strictly on the device also ensures that user data +is kept secure. The app runs even in the absence of a network +connection. Generally speaking, it is straightforward to use with just a +few lines of code needed to integrate a complete ML model into your +device. The downside is that you can only make the model inference, as +no model training is possible.

                      +

                      +

                      ML Kit was announced +by Google Firebase in 2018. It enables developers to utilize ML in +mobile apps either with (1) inference in the cloud via API or (2) +inference on-device (like Core ML). For the former option, ML Kit offers +six base APIs with pertained models such as Image Labeling, Text +Recognition, and Barcode Scanning. For the latter option, ML Kit offers +lower accuracy but more security to user data, compared to the cloud +version.

                      +

                      If you are interested in either of the above options, check out this +comparison +by the FritzAI team. Additionally, +FritzAI is an ML platform for +mobile developers that provide pre-trained models, developer tools, and +SDKs for iOS, Android, and Unity.

                      +
                      More Efficient Models
                      +

                      +

                      Another thing to consider for edge deployment is to make the models more +efficient. One way to do this is to use the same quantization and +distillation techniques discussed above. Another way is to pick +mobile-friendly model architectures. The first successful example is +MobileNet, +which performs various downsampling techniques to a traditional ConvNet +architecture to maximize accuracy while being mindful of the restricted +resources for a mobile or an embedded device. This +analysis +by Yusuke Uchida explains why MobileNet and its variants are fast.

                      +

                      A well-known case study of applying knowledge distillation in practice +is Hugging Face’s +DistilBERT, +a smaller language model derived from the supervision of the popular +BERT language model. +DistilBERT removes the toke-type embeddings and the pooler (used for the +next sentence classification task) from BERT while keeping the rest of +the architecture identical and reducing the number of layers by a factor +of two. Overall, DistilBERT has about half the total number of +parameters of the BERT base and retains 95% of BERT’s performances on +the language understanding benchmark GLUE.

                      +
                      Mindset For Edge Deployment
                      +
                        +
                      • +

                        It is crucial to choose your architecture with your target +hardware in mind. Specifically, you can make up a factor of 2-10 +through distillation, quantization, and other tricks (but not more +than that).

                        +
                      • +
                      • +

                        Once you have a model that works on your edge device, you can +iterate locally as long as you add model size and latency to +your metrics and avoid regression.

                        +
                      • +
                      • +

                        You should treat tuning the model for your device as an additional +risk in the deployment cycle and test it accordingly. In other +words, you should always test your models on production hardware +before deploying them for real.

                        +
                      • +
                      • +

                        Since models can be finicky, it’s a good idea to build fallback +mechanisms into the application if the model fails or is too +slow.

                        +
                      • +
                      +
                      Takeaways
                      +
                        +
                      • +

                        Web deployment is easier, so only perform edge deployment if you +need to.

                        +
                      • +
                      • +

                        You should choose your framework to match the available hardware and +corresponding mobile frameworks. Else, you can try Apache TVM to +be more flexible.

                        +
                      • +
                      • +

                        You should start considering hardware constraints at the beginning +of the project and choose the architectures accordingly.

                        +
                      • +
                      +

                      II - Model Monitoring

                      +

                      Once you deploy models, how do you make sure they are staying healthy +and working well? Enter model monitoring.

                      +

                      Many things can go wrong with a model once it’s been trained. This can +happen even if your model has been trained properly, with a reasonable +validation and test loss, as well as robust performance across various +slices and quality predictions. Even after you’ve troubleshot and tested +a model, things can still go wrong!

                      +

                      1 - Why Model Degrades Post-Deployment?

                      +

                      Model performance tends to degrade after you’ve deployed a model. Why +does this occur? In supervised learning, we seek to fit a function f to +approximate a posterior using the data available to us. If any component +of this process changes (i.e., the data x), the deployed model can see +an unexpectedly degraded performance. See the below chart for examples +of how such post-deployment degradations can occur theoretically and in +practice:

                      +

                      +

                      In summary, there are three core ways that the model’s performance can +degrade: data drift, concept drift, and domain shift.

                      +
                        +
                      1. +

                        In data drift, the underlying data expectation that your model is +built can unexpectedly change, perhaps through a bug in the +upstream data pipeline or even due to malicious users feeding the +model bad data.

                        +
                      2. +
                      3. +

                        In concept drift, the actual outcome you seek to model, or the +relationship between the data and the outcome, may fray. For +example, users may start to pick movies in a different manner +based on the output of your model, thereby changing the +fundamental “concept” the model needs to approximate.

                        +
                      4. +
                      5. +

                        Finally, in domain shift, if your dataset does not appropriately +sample the production, post-deployment setting, the model’s +performance may suffer; this could be considered a “long tail” +scenario, where many rare examples that are not present in the +development data occur.

                        +
                      6. +
                      +

                      2 - Data Drift

                      +

                      There are a few different types of data drift:

                      +
                        +
                      • +

                        Instantaneous drift: In this situation, the paradigm of the +draft dramatically shifts. Examples are deploying the model in a +new domain (e.g., self-driving car model in a new city), a bug in +the preprocessing pipeline, or even major external shifts like +COVID.

                        +
                      • +
                      • +

                        Gradual drift: In this situation, the value of data gradually +changes with time. For example, users’ preferences may change over +time, or new concepts can get introduced to the domain.

                        +
                      • +
                      • +

                        Periodic drift: Data can have fluctuating value due to +underlying patterns like seasonality or time zones.

                        +
                      • +
                      • +

                        Temporary drift: The most difficult to detect, drift can occur +through a short-term change in the data that shifts back to +normal. This could be via a short-lived malicious attack, or even +simply because a user with different demographics or behaviors +uses your product in a way that it’s not designed to be used.

                        +
                      • +
                      +

                      While these categories may seem like purely academic categories, the +consequences of data shift are very +real. +This is a real problem that affects many companies and is only now +starting to get the attention it merits.

                      +

                      3 - What Should You Monitor?

                      +

                      There are four core types of signals to monitor for machine learning +models.

                      +

                      +

                      These metrics trade off with another in terms of how informative they +are and how easy they are to access. Put simply, the harder a metric +may be to monitor, the more useful it likely is.

                      +
                        +
                      • +

                        The hardest and best metrics to monitor are model performance +metrics, though these can be difficult to acquire in real-time +(labels are hard to come by).

                        +
                      • +
                      • +

                        Business metrics can be helpful signals of model degradation in +monitoring but can easily be confounded by other impactful +considerations.

                        +
                      • +
                      • +

                        Model inputs and predictions are a simple way to identify +high-level drift and are very easy to gather. Still, they can be +difficult to assess in terms of actual performance impact, leaving +it more of an art than science.

                        +
                      • +
                      • +

                        Finally, system performance (e.g., GPU usage) can be a coarse +method of catching serious bugs.

                        +
                      • +
                      +

                      In considering which metrics to focus on, prioritize ground-truth +metrics (model and business metrics), then approximate performance +metrics (business and input/outputs), and finally, system health +metrics.

                      +

                      4 - How Do You Measure Distribution Changes?

                      +
                      Select A Reference Window
                      +

                      To measure distribution changes in metrics you’re monitoring, start by +picking a reference set of production data to compare new data to. There +are a few different ways of picking this reference data (e.g., +sliding window or fixed +window of production data), but the most practical thing to do is to +use your training or evaluation data as the reference. Data coming in +looking different from what you developed your model using is an +important signal to act on.

                      +
                      Select A Measurement Window
                      +

                      After picking a reference window, the next step is to choose a +measurement window to compare, measure distance, and evaluate for drift. +The challenge is that selecting a measurement window is highly +problem-dependent. One solution is to pick one or several window sizes +and slide them over the data. To avoid recomputing metrics over and +over again, when you slide the window, it’s worth looking into the +literature on mergeable (quantile) sketching +algorithms.

                      +
                      Compare Windows Using A Distance Metric
                      +

                      What distance metrics should we use to compare the reference window to +the measurement window? Some 1-D metric categories are:

                      +
                        +
                      1. +

                        Rule-based distance metrics (e.g., data quality): Summary +statistics, the volume of data points, number of missing values, +or more complex tests like overall comparisons are common data +quality checks that can be applied. Great +Expectations is a valuable +library for this. Definitely invest in simple rule-based +metrics. They catch a large number of bugs, as publications from +Amazon and Google detail.

                        +
                      2. +
                      3. +

                        Statistical distance metrics (e.g., KS statistics, KL +divergence, D_1 distance, etc.)

                        +
                          +
                        1. +

                          KL +Divergence: +Defined as the expectation of a ratio of logs of two different +distributions, this commonly known metric is very sensitive to +what happens in the tails of the distribution. It’s not +well-suited to data shift testing since it’s easily disturbed, +is not interpretable, and struggles with data in different +ranges.

                          +
                        2. +
                        3. +

                          KS +Statistic: +This metric is defined as the max distance between CDFs, which +is easy to interpret and is thus used widely in practice. Say +yes to the KS statistic!

                          +
                        4. +
                        5. +

                          D1 +Distance: +Defined as the sum of distances between PDFs, this is a metric +used at Google. Despite seeming less principled, it’s easily +interpretable and has the added benefit of knowing Google uses +it (so why not you?).

                          +
                        6. +
                        +
                      4. +
                      +

                      +

                      An open area of research is understanding the impact of differing drift +patterns on distance metrics and model performance. Another open area of +research is high-dimensional distance metrics. Some options here are:

                      +
                        +
                      1. +

                        Maximum mean +discrepancy

                        +
                      2. +
                      3. +

                        Performing multiple 1D comparisons across the data: While suffering +from the multiple hypothesis testing +problem, +this is a practical approach.

                        +
                      4. +
                      5. +

                        Prioritize some features for 1D comparisons: Another option is to +avoid testing all the features and only focus on those that merit +comparison; for example, those features you know may have shifted +in the data.

                        +
                      6. +
                      7. +

                        Projections: In this +approach, large data points are put through a dimensionality +reduction process and then subject to a two-sample statistical +test. Reducing the dimensionality with a domain-specific approach +(e.g., mean pixel value for images, length of sentence) is +recommended.

                        +
                      8. +
                      +

                      At a high level, this entire distance metric work aims to identify not +just a score for any data shift but also understand its impact on the +model. While choosing a metric can be complicated with all the possible +options, you should focus on understanding your model’s robustness in a +post-deployment scenario.

                      +

                      5 - How Do You Tell If A Change Is Bad?

                      +

                      There’s no hard and fast rule for finding if a change in the data is +bad. An easy option is to set thresholds on the test values. Don’t use a +statistical test like the KS test, as they are too sensitive to small +shifts. Other +options +include setting manual ranges, comparing values over time, or even +applying an unsupervised model to detect outliers. In practice, fixed +rules and specified ranges of test values are used most in practice.

                      +

                      +

                      6 - Tools For Monitoring

                      +

                      There are three categories of tools useful for monitoring:

                      +
                        +
                      1. +

                        System monitoring tools like AWS +CloudWatch, +Datadog, New +Relic, and +honeycomb test traditional +performance metrics

                        +
                      2. +
                      3. +

                        Data quality tools like Great +Expectations, +Anomalo, and Monte +Carlo test if specific +windows of data violate rules or assumptions.

                        +
                      4. +
                      5. +

                        ML monitoring tools like Arize, +Fiddler, and +Arthur can also be useful, as +they specifically test models.

                        +
                      6. +
                      +

                      7 - Evaluation Store

                      +

                      Monitoring is more central to ML than for traditional software.

                      +
                        +
                      • +

                        In traditional SWE, most bugs cause loud failures, and the data that +is monitored is most valuable to detect and diagnose problems. If +the system is working well, the data from these metrics and +monitoring systems may not be useful.

                        +
                      • +
                      • +

                        In machine learning, however, monitoring plays a different role. +First off, bugs in ML systems often lead to silent degradations in +performance. Furthermore, the data that is monitored in ML is +literally the code used to train the next iteration of models.

                        +
                      • +
                      +

                      Because monitoring is so essential to ML systems, tightly integrating it +into the ML system architecture brings major benefits. In particular, +better integrating and monitoring practices, or creating an evaluation +store, can close the data flywheel loop, a concept we talked about +earlier in the class.

                      +

                      +

                      As we build models, we create a mapping between data and model. As the +data changes and we retrain models, monitoring these changes doesn’t +become an endpoint--it becomes a part of the entire model development +process. Monitoring, via an evaluation store, should touch all parts of +your stack. One challenge that this process helps solve is effectively +choosing which data points to collect, store, and label. Evaluation +stores can help identify which data to collect more points for based on +uncertain performance. As more data is collected and labeled, efficient +retraining can be performed using evaluation store guidance.

                      +

                      Conclusion

                      +

                      In summary, make sure to monitor your models!

                      +
                        +
                      • +

                        Something will always go wrong, and you should have a system to +catch errors.

                        +
                      • +
                      • +

                        Start by looking at data quality metrics and system metrics, as they +are easiest.

                        +
                      • +
                      • +

                        In a perfect world, the testing and monitoring should be linked, and +they should help you close the data flywheel.

                        +
                      • +
                      • +

                        There will be a lot of tooling and research that will hopefully come +soon!

                        +
                      • +
                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-12-notes-media/image1.png b/spring2021/lecture-12-notes-media/image1.png new file mode 100644 index 00000000..c49ed9be Binary files /dev/null and b/spring2021/lecture-12-notes-media/image1.png differ diff --git a/spring2021/lecture-12-notes-media/image10.png b/spring2021/lecture-12-notes-media/image10.png new file mode 100644 index 00000000..061e811d Binary files /dev/null and b/spring2021/lecture-12-notes-media/image10.png differ diff --git a/spring2021/lecture-12-notes-media/image11.png b/spring2021/lecture-12-notes-media/image11.png new file mode 100644 index 00000000..eb3c955e Binary files /dev/null and b/spring2021/lecture-12-notes-media/image11.png differ diff --git a/spring2021/lecture-12-notes-media/image12.png b/spring2021/lecture-12-notes-media/image12.png new file mode 100644 index 00000000..6c41ba43 Binary files /dev/null and b/spring2021/lecture-12-notes-media/image12.png differ diff --git a/spring2021/lecture-12-notes-media/image13.png b/spring2021/lecture-12-notes-media/image13.png new file mode 100644 index 00000000..d23452cf Binary files /dev/null and b/spring2021/lecture-12-notes-media/image13.png differ diff --git a/spring2021/lecture-12-notes-media/image14.png b/spring2021/lecture-12-notes-media/image14.png new file mode 100644 index 00000000..64ecd2d0 Binary files /dev/null and b/spring2021/lecture-12-notes-media/image14.png differ diff --git a/spring2021/lecture-12-notes-media/image15.png b/spring2021/lecture-12-notes-media/image15.png new file mode 100644 index 00000000..8f199a23 Binary files /dev/null and b/spring2021/lecture-12-notes-media/image15.png differ diff --git a/spring2021/lecture-12-notes-media/image16.png b/spring2021/lecture-12-notes-media/image16.png new file mode 100644 index 00000000..cfa1cf8c Binary files /dev/null and b/spring2021/lecture-12-notes-media/image16.png differ diff --git a/spring2021/lecture-12-notes-media/image17.png b/spring2021/lecture-12-notes-media/image17.png new file mode 100644 index 00000000..aeb2f9d5 Binary files /dev/null and b/spring2021/lecture-12-notes-media/image17.png differ diff --git a/spring2021/lecture-12-notes-media/image18.png b/spring2021/lecture-12-notes-media/image18.png new file mode 100644 index 00000000..283ef508 Binary files /dev/null and b/spring2021/lecture-12-notes-media/image18.png differ diff --git a/spring2021/lecture-12-notes-media/image19.png b/spring2021/lecture-12-notes-media/image19.png new file mode 100644 index 00000000..35770986 Binary files /dev/null and b/spring2021/lecture-12-notes-media/image19.png differ diff --git a/spring2021/lecture-12-notes-media/image2.png b/spring2021/lecture-12-notes-media/image2.png new file mode 100644 index 00000000..3c1001cf Binary files /dev/null and b/spring2021/lecture-12-notes-media/image2.png differ diff --git a/spring2021/lecture-12-notes-media/image20.png b/spring2021/lecture-12-notes-media/image20.png new file mode 100644 index 00000000..fb1050d6 Binary files /dev/null and b/spring2021/lecture-12-notes-media/image20.png differ diff --git a/spring2021/lecture-12-notes-media/image21.png b/spring2021/lecture-12-notes-media/image21.png new file mode 100644 index 00000000..4d9f2d02 Binary files /dev/null and b/spring2021/lecture-12-notes-media/image21.png differ diff --git a/spring2021/lecture-12-notes-media/image22.png b/spring2021/lecture-12-notes-media/image22.png new file mode 100644 index 00000000..93556858 Binary files /dev/null and b/spring2021/lecture-12-notes-media/image22.png differ diff --git a/spring2021/lecture-12-notes-media/image3.png b/spring2021/lecture-12-notes-media/image3.png new file mode 100644 index 00000000..5f04978b Binary files /dev/null and b/spring2021/lecture-12-notes-media/image3.png differ diff --git a/spring2021/lecture-12-notes-media/image4.png b/spring2021/lecture-12-notes-media/image4.png new file mode 100644 index 00000000..0e3029e6 Binary files /dev/null and b/spring2021/lecture-12-notes-media/image4.png differ diff --git a/spring2021/lecture-12-notes-media/image5.png b/spring2021/lecture-12-notes-media/image5.png new file mode 100644 index 00000000..feb51973 Binary files /dev/null and b/spring2021/lecture-12-notes-media/image5.png differ diff --git a/spring2021/lecture-12-notes-media/image6.png b/spring2021/lecture-12-notes-media/image6.png new file mode 100644 index 00000000..73c3b1ec Binary files /dev/null and b/spring2021/lecture-12-notes-media/image6.png differ diff --git a/spring2021/lecture-12-notes-media/image7.png b/spring2021/lecture-12-notes-media/image7.png new file mode 100644 index 00000000..01b37c78 Binary files /dev/null and b/spring2021/lecture-12-notes-media/image7.png differ diff --git a/spring2021/lecture-12-notes-media/image8.png b/spring2021/lecture-12-notes-media/image8.png new file mode 100644 index 00000000..81c7bcfa Binary files /dev/null and b/spring2021/lecture-12-notes-media/image8.png differ diff --git a/spring2021/lecture-12-notes-media/image9.jpg b/spring2021/lecture-12-notes-media/image9.jpg new file mode 100644 index 00000000..0db26bec Binary files /dev/null and b/spring2021/lecture-12-notes-media/image9.jpg differ diff --git a/spring2021/lecture-12/index.html b/spring2021/lecture-12/index.html new file mode 100644 index 00000000..de63ca92 --- /dev/null +++ b/spring2021/lecture-12/index.html @@ -0,0 +1,3513 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 12: Research Directions - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      + + + + + + + + +

                      Lecture 12: Research Directions

                      +

                      Video

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Download notes as PDF

                      +

                      Lecture by Pieter Abbeel. +Notes transcribed by James Le +and Vishnu Rachakonda.

                      +

                      Of all disciplines, deep learning is probably the one where research +and practice are closest together. Often, something gets invented in +research and is put into production in less than a year. Therefore, it’s +good to be aware of research trends that you might want to incorporate +in projects you are working on.

                      +

                      Because the number of ML and AI papers increases exponentially, there’s +no way that you can read every paper. Thus, you need other methods to +keep up with research. This lecture provides a sampling of research +directions, the overall research theme running across these samples, and +advice on keeping up with the relentless flood of new research.

                      +

                      1 - Unsupervised Learning

                      +

                      Deep supervised learning, the default way of doing ML, works! But it +requires so much annotated data. Can we get around it by learning with +fewer labels? The answer is yes! And there are two major approaches: +deep semi-supervised learning and deep unsupervised learning.

                      +

                      Deep Semi-Supervised Learning

                      +

                      Semi-supervised means half supervised, half unsupervised. Assuming a +classification problem where each data point belongs to one of the +classes, we attempt to come up with an intuition to complete the +labeling for the unlabeled data points. One way to formalize this is: +If anything is close to a labeled example, then it will assume that +label. Thus, we can propagate the labels out from where they are given +to the neighboring data points.

                      +

                      How can we generalize the approach above to image classification?

                      +

                      +

                      Xie et al. (2020) proposes +Noisy Student Training:

                      +
                        +
                      • +

                        First, they train a teacher model with labeled data.

                        +
                      • +
                      • +

                        Then, they infer pseudo-labels on the unlabeled data. These are not + real labels, but those that they get from using the trained + teacher model.

                        +
                      • +
                      • +

                        Even though these labels are not perfect (because they train on a + small amount of labeled data), they can still see where they are + more confident about those pseudo labels and inject those into + their training set as additional labeled data.

                        +
                      • +
                      • +

                        When they retrain, they use dropout, data augmentation, and + stochastic depth to inject noise into the training process. + This enables the student model to be more robust and + generalizable.

                        +
                      • +
                      +

                      Deep Unsupervised Learning

                      +

                      Deep semi-supervised learning assumes that the labels in the supervised +dataset are still valid for the unsupervised dataset. There’s a limit to +the applicability because we assume that the unlabeled data is roughly +from the same distribution as the labeled data.

                      +

                      +

                      With deep unsupervised learning, we can transfer the learning with +multi-headed networks.

                      +
                        +
                      • +

                        First, we train a neural network. Then, we have two tasks and give + the network two heads - one for task 1 and another for task 2.

                        +
                      • +
                      • +

                        Most parameters live in the shared trunk of the network’s body. + Thus, when you train for task 1 and task 2, most of the learnings + are shared. Only a little bit gets specialized to task 1 versus + task 2.

                        +
                      • +
                      +

                      The key hypothesis here is that: For task 1 (which is unsupervised), +if the neural network is smart enough to do things like predicting the +next word in a sentence, generating realistic images, or translating +images from one scale to another; then that same neural network is ready +to do deep supervised learning from a very small dataset for task 2 +(what we care about).

                      +
                      GPT-2
                      +

                      For instance, task 1 could be predicting the next word in a sentence, +while task 2 could be predicting the sentiment in a corpus. OpenAI’s +GPT-2 is the +landmark result for next-word prediction where deep unsupervised +learning could work. The results were so realistic, and there was a lot +of press coverage. OpenAI deemed it to be too dangerous to be released +at the time.

                      +

                      +

                      Furthermore, GPT-2 can tackle complex common sense reasoning and +question answering tasks for various benchmarks. The table below +displays those benchmarks where GPT-2 was evaluated on. The details of +the tasks do not really matter. What’s more interesting is that: This is +the first time a model, trained unsupervised on a lot of text to predict +the next token and fine-tuned to specific supervised tasks, beats +prior methods that might have been more specialized to each of these +supervised tasks.

                      +

                      +

                      Another fascinating insight is that as we grow the number of model +parameters, the performance goes up consistently. This means with +unsupervised learning, we can incorporate much more data for larger +models. This research funding inspired OpenAI to fundraise $1B for +future projects to essentially have more compute available to train +larger models because it seems like doing that will lead to better +results. So far, that has been true +(GPT-3 performs better +than GPT-2).

                      +
                      BERT
                      +

                      BERT is Google’s approach +that came out around the same time as GPT-2. While GPT-2 predicts the +next word or token, BERT predicts a word or token that was removed. In +this task, the neural network looks at the entire corpus as it fills +things back in, which often helps in later tasks (as the neural network +has already been unsupervised-train on the entire text).

                      +

                      +

                      The table below displays BERT’s performance on the GLUE +benchmark. The takeaway message is not +so much in the details of these supervised tasks; but the fact that +these tasks have a relatively small amount of labeled data compared to +the unsupervised training that happens ahead of time. As BERT +outperformed all SOTA methods, it revolutionized how natural language +processing should be done.

                      +

                      +

                      BERT is one of the biggest updates that Google has made since RankBrain +in 2015 and has proven successful in comprehending the intent of the +searcher behind a search query.

                      +

                      Unsupervised Learning In Vision

                      +

                      Can we do the same thing for vision tasks? Let’s explore a few of them.

                      +
                        +
                      • +

                        Predict A Missing Patch: A patch is high-dimensional, so the + number of possibilities in that patch is very high (much larger + than the number of words in English, for instance). Therefore, + it’s challenging to predict precisely and make that work as well + as in languages.

                        +
                      • +
                      • +

                        Solve Jigsaw Puzzles: If the network can do this, it understands + something about images of the world. The trunk of the network + should hopefully be reusable.

                        +
                      • +
                      • +

                        Predict Rotation: Here, you collect random images and predict + what degree has been rotated. Existing methods work immensely well + for such a task.

                        +
                      • +
                      +

                      +

                      A technique that stood out in recent times is contrastive learning, +which includes two variants - +SimCLR (Chen et al., 2020) +and MoCo (He et al., 2019). +Here’s how you train your model with contrastive learning:

                      +
                        +
                      • +

                        Imagine that you download two images of a dog and a cat from the + Internet, and you don’t have labels yet.

                        +
                      • +
                      • +

                        You duplicate the dog image and make two versions of it (a greyscale + version and a cropped version).

                        +
                      • +
                      • +

                        For these two dog versions, the neural network should bring them + together while pushing the cat image far away.

                        +
                      • +
                      +

                      You then fine-tune with a simple linear classifier on top of training +completely unsupervised. This means that you must get the right features +extracted from the images during training. The results of contrastive +learning methods confirm that the higher the number of model parameters, +the better the accuracy.

                      +

                      2 - Reinforcement Learning

                      +

                      Reinforcement +learning +(RL) has not been practical yet but nevertheless has shown promising +results. In RL, the AI is an agent, more so than just a pattern +recognizer. The agent acts in an environment where it is goal-oriented. +It wants to achieve something during the process, which is represented +by a reward function.

                      +

                      +

                      Challenges

                      +

                      Compared to unsupervised learning, RL brings about a host of additional +challenges:

                      +
                        +
                      • +

                        Credit assignment: When the RL agent sees something, it has to + take action. But it is not told whether the action was good or bad + right away.

                        +
                      • +
                      • +

                        Stability: Because the RL agent learns by trial and error, it + can destabilize and make big mistakes. Thus, it needs to be clever + in updating itself not to destroy things along the way.

                        +
                      • +
                      • +

                        Exploration: The RL agent has to try things that have not been + done before.

                        +
                      • +
                      +

                      Despite these challenges, some great RL successes have happened.

                      +

                      Successes

                      +

                      DeepMind has shown that neural networks can learn to play the Atari +game back in 2013. Under the hood is the Deep +Q-Network +architecture, which was trained from its own trial-and-error, looking at +the score in the game to internalize what actions might be good or bad.

                      +

                      The game of Go was cracked by DeepMind - showing that the computer +can play better than the best human player +(AlphaGo, +AlphaGoZero, and +AlphaZero).

                      +

                      RL also works for the robot locomotion task. You don’t have to +design the controller yourself. You just implement the RL algorithm +(TRPO, +GAE, +DDPG, +PPO, and more) and let the +agent train itself, which is a general approach to have AI systems +acquire new skills. In fact, the robot can acquire such a variety of +skills, as demonstrated in this +DeepMimic +work.

                      +

                      +

                      You can also accomplish the above for non-human-like characters in +dynamic animation tasks. This is going to change how you can design +video games or animated movies. Instead of designing the keyframes for +every step along the way in your video or your game, you can train an +agent to go from point A to point B directly.

                      +

                      RL has been shown to work on real robots.

                      +
                        +
                      • +

                        BRETT (Berkeley + Robot for the Elimination of Tedious Tasks) could learn to put + blocks into matching openings in under an hour using a neural + network trained from scratch. This technique has been used for + NASA SuperBall + robots for space exploration ideas.

                        +
                      • +
                      • +

                        A similar idea was applied to robotic manipulation solving + Rubik’s cube, + done at OpenAI in 2019. The in-hand manipulation is a very + difficult robotic control problem that was mastered with RL.

                        +
                      • +
                      +

                      CovariantAI

                      +

                      +

                      The fact that RL worked so well actually inspired Pieter and his former +students (Tianhao Zhang, Rocky Duan, and Peter Chen) to start a company +called Covariant in 2017. Their goal is +to bring these advances from the lab into the real world. An example is +autonomous order +picking.

                      +

                      3 - Unsupervised Reinforcement Learning

                      +

                      RL achieved mastery on many simulated domains. But we must ask the +question: How fast is the learning itself? Tsividis et al., +2017 +shows that a human can learn in about 15 minutes to perform better than +Double DQN (a SOTA approach at the time of the study) learned after 115 +hours.

                      +

                      How can we bridge this learning gap?

                      +

                      Based on the 2018 DeepMind Control +Suite, pixel-based learning needs +50M more training steps than state-based learning to solve the same +tasks. Maybe we can develop an unsupervised learning approach to turn +pixel-level representations (which are not that informative) into a new +representation that is much more similar to the underlying state.

                      +

                      +

                      CURL brings together +contrastive learning and RL.

                      +
                        +
                      • +

                        In RL, there’s typically a replay buffer where we store the past + experiences. We load observations from there and feed them into an + encoder neural network. The network has two heads: an actor to + estimate the best action to take next and a critic to estimate how + good that action would be.

                        +
                      • +
                      • +

                        CURL adds an extra head at the bottom, which includes augmented + observations, and does contrastive learning on that. Similar + configurations of the robot are brought closer together, while + different ones are separated.

                        +
                      • +
                      +

                      The results confirm that CURL can match existing SOTA approaches that +learn from states and from pixels. However, it struggles in hard +environments, with insufficient labeled images being the root cause.

                      +

                      4 - Meta Reinforcement Learning

                      +

                      The majority of fully general RL algorithms work well for any +environments that can be mathematically defined. However, environments +encountered in the real world are a tiny subset of all environments that +could be defined. Maybe the learning takes such a long time because the +algorithms are too general. If they are a bit more specialized in +things they will encounter, perhaps the learning is faster.

                      +

                      Can we develop a fast RL algorithm to take advantage of this?

                      +

                      In traditional RL research, human experts develop the RL algorithm. +However, there are still no RL algorithms nearly as good as humans after +many years. Can we learn a better RL algorithm? Or even learn a better +entire agent?

                      +

                      RL^2

                      +

                      +

                      RL^2 (Duan et al., 2016) +is a meta-RL framework proposed to tackle this issue:

                      +
                        +
                      • +

                        Imagine that we have multiple meta-training environments (A, B, and + so on).

                        +
                      • +
                      • +

                        We also have a meta-RL algorithm that learns the RL algorithm and + outputs a “fast” RL agent (from having interacted with these + environments).

                        +
                      • +
                      • +

                        In the future, our agent will be in an environment F that is related + to A, B, and so on.

                        +
                      • +
                      +

                      Formally speaking, RL^2 maximizes the expected reward on the training +Markov Decision Process (MDP) but can generalize to testing MDP. The RL +agent is represented as a Recurrent Neural Network (RNN), a generic +computation architecture where:

                      +
                        +
                      • +

                        Different weights in the RNN mean different RL algorithms and + priors.

                        +
                      • +
                      • +

                        Different activations in the RNN mean different current policies.

                        +
                      • +
                      • +

                        The meta-trained objective can be optimized with an existing “slow” + RL algorithm.

                        +
                      • +
                      • +

                        The resulting RNN is ready to be dropped in a new environment.

                        +
                      • +
                      +

                      RL^2 was evaluated on a classic Multi-Armed Bandit setting and +performed better than provably (asymptotically) optimal RL algorithms +invented by humans like Gittings Index, UCB1, and Thompson Sampling. +Another task that RL^2 was evaluated on is visual navigation, where +the agent explores a maze and finds a specified target as quickly as +possible. Although this setting is maze-specific, we can scale up RL^2 +to other large-scale games and robotic environments and use it to learn +in a new environment quickly.

                      +

                      Learn More

                      + +

                      5 - Few-Shot Imitation Learning

                      +

                      People often complement RL with imitation learning, which is +basically supervised learning where the output is an action for an +agent. This gives you more signal than traditional RL since for every +input, you consistently have a corresponding output. As the diagram +below shows, the imitation learning algorithm learns a policy in a +supervised manner from many demonstrations and outputs the correct +action based on the environment.

                      +

                      +

                      The challenge for imitation learning is to collect enough +demonstrations to train an algorithm, which is time-consuming. To make +the collection of demonstrations more efficient, we can apply multi-task +meta-learning. Many demonstrations for different tasks can be learned by +an algorithm, whose output is fed to a one-shot imitator that picks the +correct action based on a single demonstration. This process is referred +to as one-shot imitation learning (Duan et al., +2017), as displayed below.

                      +

                      +

                      Conveniently, one-shot imitators are trained using traditional network +architectures. A combination of CNNs, RNNs, and MLPs perform the heavy +visual processing to understand the relevant actions in training demos +and recommend the right action for the current frame of an inference +demo. One example of this in action is block +stacking.

                      +

                      +

                      Learn More

                      + +

                      6 - Domain Randomization

                      +

                      Simulated data collection is a logical substitute for expensive real +data collection. It is less expensive, more scalable, and less dangerous +(e.g., in the case of robots) to capture at scale. Given this logic, +how can we make sure simulated data best matches real-world +conditions?

                      +

                      Use Realistic Simulated Data

                      +

                      +

                      One approach is to make the simulator you use for training models as +realistic as possible. Two variants of doing this are to carefully +match the simulation to the world (James and John, +2016; Johns, Leutenegger, and +Division, 2016; Mahler et +al., 2017; Koenemann et al., +2015) and +augment simulated data with real data (Richter et al., +2016; Bousmalis et al., +2017). While this option is +logically appealing, it can be hard and slow to do in practice.

                      +

                      Domain Confusion

                      +

                      +

                      Another option is domain confusion (Tzeng et al., +2014; Rusu et al., +2016).

                      +
                        +
                      • +

                        In this approach, suppose you train a model on real and simulated + data at the same time.

                        +
                      • +
                      • +

                        After completing training, a discriminator network examines the + original network at some layer to understand if the original + network is learning something about the real world.

                        +
                      • +
                      • +

                        If you can fool the discriminator with the output of the layer, the + original network has completely integrated its understanding of + real and simulated data.

                        +
                      • +
                      • +

                        In effect, there is no difference between simulated and real data to + the original network, and the layers following the examined layer + can be trained fully on simulated data.

                        +
                      • +
                      +

                      Domain Randomization

                      +

                      +

                      Finally, a simpler approach called domain randomization (Tobin +et al., 2017; Sadeghi and +Levine, 2016) has taken off of +late. In this approach, rather than making simulated data fully +realistic, the priority is to generate as much variation in the +simulated data as possible. For example, in the below tabletop scenes, +the dramatic variety of the scenes (e.g., background colors of green and +purple) can help the model generalize well to the real world, even +though the real world looks nothing like these scenes. This approach has +shown promise in drone flight +and pose estimation. The +simple logic of more data leading to better performance in real-world +settings is powerfully illustrated by domain randomization and obviates +the need for existing variation methods like pre-training on ImageNet.

                      +

                      7 - Deep Learning For Science and Engineering

                      +

                      AlphaFold

                      +

                      In other areas of this lecture, we’ve been focusing on research areas of +machine learning where humans already perform well (i.e., pose +estimation or grasping). In science and engineering applications, we +enter the realm of machine learning performing tasks humans cannot. The +most famous result is +AlphaFold, +a Deepmind-created system that solved protein folding, an important +biological challenge. In the CASP challenge, AlphaFold 2 far outpaced +all other results in performance. AlphaFold is quite complicated, as it +maps an input protein sequence to similar protein sequences and +subsequently decides the folding structure based on the evolutionary +history of complementary amino acids.

                      +

                      +

                      Other examples of DL systems solving science and engineering challenges +are in circuit design, +high-energy +physics, +and symbolic mathematics.

                      +

                      Learn More

                      + +

                      8 - Overarching Research Theme

                      +

                      As compute scales to support incredible numbers of FLOPs, more science +and engineering challenges will be solved with deep learning systems. +There has been exponential growth in the amount of compute used to +generate the most impressive research results like GPT-3.

                      +

                      +

                      As compute and data become more available, we open a new problem +territory that we can refer to as deep learning to learn. More +specifically, throughout history, the constraint on solving problems has +been human ingenuity. This is a particularly challenging realm to +contribute novel results to because we’re competing against the combined +intellectual might available throughout history. Is our present +ingenuity truly greater than that of others 20-30 years ago, let alone +200-300? Probably not. However, our ability to bring new tools like +compute and data most certainly is. Therefore, spending as much time in +this new problem territory, where data and compute help solve +problems, is likely to generate exciting and novel results more +frequently in the long run.

                      +

                      +

                      9 - How To Keep Up

                      +

                      Give a man a fish and you feed him for a day, teach a man to fish and +you feed him for a lifetime” (Lao Tzu)

                      +

                      Here are some tips on how to keep up with ML research:

                      +
                        +
                      • +

                        (Mostly) don’t read (most) papers. There are just too many!

                        +
                      • +
                      • +

                        When you do want to keep up, use the following:

                        + +
                      • +
                      • +

                        If you DO decide to read papers,

                        +
                          +
                        • +

                          Follow a principled process for reading papers

                          +
                        • +
                        • +

                          Use Arxiv Sanity

                          +
                        • +
                        • +

                          Twitter

                          +
                        • +
                        • +

                          AI/DL Facebook Group

                          +
                        • +
                        • +

                          ML Subreddit

                          +
                        • +
                        • +

                          Start a reading group: read papers together with friends - + either everyone reads then discusses, or one or two people + read and give tutorials to others.

                          +
                        • +
                        +
                      • +
                      +

                      +

                      Finally, should you do a Ph.D. or not?

                      +
                        +
                      • +

                        You don’t have to do a Ph.D. to work in AI!

                        +
                      • +
                      • +

                        However, if you REALLY want to become one of the world’s experts in + a topic you care about, then a Ph.D. is a technically deep and + demanding path to get there. Crudely speaking, a Ph.D. enables you + to develop new tools and techniques rather than using existing + tools and techniques.

                        +
                      • +
                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-13-notes-media/image1.png b/spring2021/lecture-13-notes-media/image1.png new file mode 100644 index 00000000..b869e42b Binary files /dev/null and b/spring2021/lecture-13-notes-media/image1.png differ diff --git a/spring2021/lecture-13-notes-media/image2.png b/spring2021/lecture-13-notes-media/image2.png new file mode 100644 index 00000000..a3098788 Binary files /dev/null and b/spring2021/lecture-13-notes-media/image2.png differ diff --git a/spring2021/lecture-13-notes-media/image3.png b/spring2021/lecture-13-notes-media/image3.png new file mode 100644 index 00000000..3e38f57e Binary files /dev/null and b/spring2021/lecture-13-notes-media/image3.png differ diff --git a/spring2021/lecture-13-notes-media/image4.png b/spring2021/lecture-13-notes-media/image4.png new file mode 100644 index 00000000..b784cecd Binary files /dev/null and b/spring2021/lecture-13-notes-media/image4.png differ diff --git a/spring2021/lecture-13-notes-media/image5.png b/spring2021/lecture-13-notes-media/image5.png new file mode 100644 index 00000000..464139d2 Binary files /dev/null and b/spring2021/lecture-13-notes-media/image5.png differ diff --git a/spring2021/lecture-13-notes-media/image6.png b/spring2021/lecture-13-notes-media/image6.png new file mode 100644 index 00000000..514e1056 Binary files /dev/null and b/spring2021/lecture-13-notes-media/image6.png differ diff --git a/spring2021/lecture-13-notes-media/image7.png b/spring2021/lecture-13-notes-media/image7.png new file mode 100644 index 00000000..cf997c36 Binary files /dev/null and b/spring2021/lecture-13-notes-media/image7.png differ diff --git a/spring2021/lecture-13/index.html b/spring2021/lecture-13/index.html new file mode 100644 index 00000000..e608dc0a --- /dev/null +++ b/spring2021/lecture-13/index.html @@ -0,0 +1,3444 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 13: ML Teams and Startups - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      + + + + + + + + +

                      Lecture 13: ML Teams and Startups

                      +

                      Video

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Download notes as PDF

                      +

                      Lecture by Josh Tobin. +Notes transcribed by James Le +and Vishnu Rachakonda.

                      +

                      Over the past few years, machine learning (ML) has grown tremendously. +But as young as ML is as a discipline, the craft of managing an ML team +is even younger. Many of today’s ML managers were thrust into management +roles out of necessity or because they were the best individual +contributors, and many come from purely academic backgrounds. At some +companies, engineering or product leaders are tasked with building new +ML functions without real ML experience.

                      +

                      Running any technical team is hard:

                      +
                        +
                      • +

                        You have to hire great people.

                        +
                      • +
                      • +

                        You need to manage and develop them.

                        +
                      • +
                      • +

                        You need to manage your team’s output and make sure your vectors are +aligned.

                        +
                      • +
                      • +

                        You would want to make good long-term technical choices and manage +technical debt.

                        +
                      • +
                      • +

                        You also must manage expectations from leadership.

                        +
                      • +
                      +

                      Running an ML team is even harder:

                      +
                        +
                      • +

                        ML talents are expensive and scarce.

                        +
                      • +
                      • +

                        ML teams have a diverse set of roles.

                        +
                      • +
                      • +

                        ML projects have unclear timelines and high uncertainty.

                        +
                      • +
                      • +

                        ML is also the “high-interest credit card of technical +debt."

                        +
                      • +
                      • +

                        Leadership often doesn’t understand ML.

                        +
                      • +
                      +

                      The goals of this lecture are two-fold: (1) to give you insight into how +to think about building and managing ML teams (as a leader); and (2) to +help you get a job in ML (as a newcomer).

                      +

                      +

                      1 - ML Roles

                      +

                      Common Roles

                      +

                      Let’s take a look at the most common ML roles and the skills they +require:

                      +
                        +
                      • +

                        The ML Product Manager works with the ML team, other business +functions, the end-users, and the data owners. This person designs +documentation, creates wireframes, and comes up with the plan to +prioritize and execute ML projects.

                        +
                      • +
                      • +

                        The DevOps Engineer deploys and monitors production systems. +This person handles the infrastructure that runs the deployed ML +product using platforms like AWS or GCP.

                        +
                      • +
                      • +

                        The Data Engineer builds data pipelines, aggregates and collects +data from storage, and monitors data behavior. This person works +with distributed systems using tools such as Hadoop, Kafka, +Airflow.

                        +
                      • +
                      • +

                        The ML Engineer trains and deploys prediction models. This +person uses tools like TensorFlow and Docker to work with +prediction systems running on real data in production.

                        +
                      • +
                      • +

                        The ML Researcher trains prediction models, often those that are +forward-looking or not production-critical. This person uses +libraries like TensorFlow and PyTorch on notebook environments to +build models and reports describing their experiments.

                        +
                      • +
                      • +

                        The Data Scientist is a blanket term used to describe all of the +roles above. In some organizations, this role entails answering +business questions via analytics. He/she can work with +wide-ranging tools from SQL and Excel to Pandas and Scikit-Learn.

                        +
                      • +
                      +

                      Skills Required

                      +

                      +

                      So what skills are needed for these roles? The chart above displays a +nice visual, where the horizontal axis is the level of ML expertise and +the size of the bubble is the level of communication and technical +writing (the bigger, the better).

                      +
                        +
                      • +

                        The ML DevOps is primarily a software engineering role, which +often comes from a standard software engineering pipeline.

                        +
                      • +
                      • +

                        The Data Engineer belongs to the software engineering team that +works actively with ML teams.

                        +
                      • +
                      • +

                        The ML Engineer requires a rare mix of ML and Software +Engineering skills. This person is either an engineer with +significant self-teaching OR a science/engineering Ph.D. who works +as a traditional software engineer after graduate school.

                        +
                      • +
                      • +

                        The ML Researcher is an ML expert who usually has an MS or Ph.D. +degree in Computer Science or Statistics or finishes an industrial +fellowship program.

                        +
                      • +
                      • +

                        The ML Product Manager is just like a traditional Product +Manager, but with a deep knowledge of the ML development process +and mindset.

                        +
                      • +
                      • +

                        The Data Scientist role constitutes a wide range of backgrounds +from undergraduate to Ph.D. students.

                        +
                      • +
                      +

                      2 - ML Organizations

                      +

                      Organization Archetypes

                      +

                      There exists not yet a consensus on the right way to structure an ML +team. Still, a few best practices are contingent upon different +organization archetypes and their ML maturity level. First, let’s see +what the different ML organization archetypes are.

                      +
                      Archetype 1 - Nascent and Ad-hoc ML
                      +
                        +
                      • +

                        These are organizations where no one is doing ML, or ML is done on +an ad-hoc basis. Obviously, there is little ML expertise in-house.

                        +
                      • +
                      • +

                        They are either small-to-medium businesses or less +technology-forward large companies in industries like education or +logistics.

                        +
                      • +
                      • +

                        There is often low-hanging fruit for ML.

                        +
                      • +
                      • +

                        But there is little support for ML projects, and it’s challenging to +hire and retain good talent.

                        +
                      • +
                      +
                      Archetype 2 - Research and Development ML
                      +
                        +
                      • +

                        These are organizations in which ML efforts are centered in the R&D +arm of the organization. They often hire ML researchers and +doctorate students with experience publishing papers.

                        +
                      • +
                      • +

                        They are larger companies in sectors such as oil and gas, +manufacturing, or telecommunications.

                        +
                      • +
                      • +

                        They can hire experienced researchers and work on long-term business +priorities to get big wins.

                        +
                      • +
                      • +

                        However, it is very difficult to get quality data. Most often, this +type of research work rarely translates into actual business +value, so usually, the amount of investment remains small.

                        +
                      • +
                      +
                      Archetype 3 - Product-Embedded ML
                      +
                        +
                      • +

                        These are organizations where certain product teams or business +units have ML expertise alongside their software or analytics +talent. These ML individuals report up to the team’s +engineering/tech lead.

                        +
                      • +
                      • +

                        They are either software companies or financial services companies.

                        +
                      • +
                      • +

                        ML improvements are likely to lead to business value. Furthermore, +there is a tight feedback cycle between idea iteration and product +improvement.

                        +
                      • +
                      • +

                        Unfortunately, it is still very hard to hire and develop top talent, +and access to data and compute resources can lag. There are also +potential conflicts between ML project cycles and engineering +management, so long-term ML projects can be hard to justify.

                        +
                      • +
                      +
                      Archetype 4 - Independent ML Division
                      +
                        +
                      • +

                        These are organizations in which the ML division reports directly to +senior leadership. The ML Product Managers work with Researchers +and Engineers to build ML into client-facing products. They can +sometimes publish long-term research.

                        +
                      • +
                      • +

                        They are often large financial services companies.

                        +
                      • +
                      • +

                        Talent density allows them to hire and train top practitioners. +Senior leaders can marshal data and compute resources. This gives +the organizations to invest in tooling, practices, and culture +around ML development.

                        +
                      • +
                      • +

                        A disadvantage is that model handoffs to different business lines +can be challenging since users need the buy-in to ML benefits and +get educated on the model use. Also, feedback cycles can be slow.

                        +
                      • +
                      +
                      Archetype 5 - ML-First
                      +
                        +
                      • +

                        These are organizations in which the CEO invests in ML, and there +are experts across the business focusing on quick wins. The ML +division works on challenging and long-term projects.

                        +
                      • +
                      • +

                        They are large tech companies and ML-focused startups.

                        +
                      • +
                      • +

                        They have the best data access (data thinking permeates the +organization), the most attractive recruiting funnel (challenging +ML problems tends to attract top talent), and the easiest +deployment procedure (product teams understand ML well enough).

                        +
                      • +
                      • +

                        This type of organization archetype is hard to implement in practice +since it is culturally difficult to embed ML thinking everywhere.

                        +
                      • +
                      +

                      Team Structure Design Choices

                      +

                      Depending on the above archetype that your organization resembles, you +can make the appropriate design choices, which broadly speaking follow +these three categories:

                      +
                        +
                      1. +

                        Software Engineer vs. Research: To what extent is the ML team +responsible for building or integrating with software? How +important are Software Engineering skills on the team?

                        +
                      2. +
                      3. +

                        Data Ownership: How much control does the ML team have over data +collection, warehousing, labeling, and pipelining?

                        +
                      4. +
                      5. +

                        Model Ownership: Is the ML team responsible for deploying models +into production? Who maintains the deployed models?

                        +
                      6. +
                      +

                      Below are our design suggestions:

                      +

                      If your organization focuses on ML R&D:

                      +
                        +
                      • +

                        Research is most definitely prioritized over Software Engineering +skills. Because of this, there would potentially be a lack of +collaboration between these two groups.

                        +
                      • +
                      • +

                        ML team has no control over the data and typically will not have +data engineers to support them.

                        +
                      • +
                      • +

                        ML models are rarely deployed into production.

                        +
                      • +
                      +

                      If your organization has ML embedded into the product:

                      +
                        +
                      • +

                        Software Engineering skills will be prioritized over Research +skills. Often, the researchers would need strong engineering +skills since everyone would be expected to product-ionize his/her +models.

                        +
                      • +
                      • +

                        ML teams generally do not own data production and data management. +They will need to work with data engineers to build data +pipelines.

                        +
                      • +
                      • +

                        ML engineers totally own the models that they deploy into +production.

                        +
                      • +
                      +

                      If your organization has an independent ML division:

                      +
                        +
                      • +

                        Each team has a potent mix of engineering and research skills; +therefore, they work closely together within teams.

                        +
                      • +
                      • +

                        ML team has a voice in data governance discussions, as well as a +robust data engineering function.

                        +
                      • +
                      • +

                        ML team hands-off models to users but is still responsible for +maintaining them.

                        +
                      • +
                      +

                      If your organization is ML-First:

                      +
                        +
                      • +

                        Different teams are more or less research-oriented, but in general, +research teams collaborate closely with engineering teams.

                        +
                      • +
                      • +

                        ML team often owns the company-wide data infrastructure.

                        +
                      • +
                      • +

                        ML team hands the models to users, who are responsible for operating +and maintaining them.

                        +
                      • +
                      +

                      The picture below neatly sums up these suggestions:

                      +

                      +

                      3 - Managing ML Teams

                      +

                      Managing ML Teams Is Challenging

                      +

                      The process of actually managing an ML team is quite challenging for +four reasons:

                      +
                        +
                      1. +

                        Engineering Estimation: It’s hard to know how easy or hard an ML +project is in advance. As you explore the data and experiment with +different models, there is enormous scope for new learnings about +the problem that materially impact the timeline. Furthermore, +knowing what methods will work is often impossible. This makes it +hard to say upfront how long or how much work may go into an ML +project.

                        +
                      2. +
                      3. +

                        Nonlinear Progress: As the chart below from a blog +post +by Lukas Biewald (CEO of Weights and Biases) shows, progress on ML +projects is unpredictable over time, even when the effort expended +grows considerably. It’s very common for projects to stall for +extended periods of time.

                        +
                      4. +
                      +

                      +
                        +
                      1. +

                        Cultural gaps: The relative culture of engineering and research +professionals is very different. Research tends to favor novel, +creative ideas, while engineering prefers tried and true methods +that work. As a result, ML teams often experience a clash of +cultures, which can turn toxic if not appropriately managed. A +core challenge of running ML teams is addressing the cultural +barriers between ML and software engineering so that teams can +harmoniously experiment and deliver ML products.

                        +
                      2. +
                      3. +

                        Leadership Deficits: It’s common to see a lack of detailed +understanding of ML at senior levels of management in many +companies. As a result, expressing feasibility and setting the +right expectations for ML projects, especially high-priority ones, +can be hard.

                        +
                      4. +
                      +

                      How To Manage ML Teams Better

                      +

                      Managing ML teams is hardly a solved problem, but you can take steps to +improve the process.

                      +
                      Plan Probabilistically
                      +

                      Many engineering projects are managed in a waterfall fashion, with the +sequential tasks defined up front clearly. Instead of forcing this +method of engineering management on difficult ML projects, try assigning +a likelihood of success to different tasks to better capture the +experimental process inherent to ML engineering. As these tasks progress +or stall, rapidly re-evaluate your task ordering to better match what is +working. Having this sense of both (1) how likely a task is to +succeed and (2) how important it is makes project planning +considerably more realistic.

                      +
                      +

                      +
                      +
                      Have A Portfolio Of Approaches
                      +

                      Embrace multiple ideas and approaches to solve crucial research +challenges that gate production ML. Don’t make your plan dependent on +one approach working!

                      +
                      Measure Inputs, Not Results
                      +

                      As you work through several approaches in your portfolio, do not overly +emphasize whose ideas ultimately work as a reflection of contribution +quality. This can negatively impact team members’ creativity, as they +focus more on trying to find only what they currently think could work, +rather than experimenting in a high-quality fashion (which is ultimately +what leads to ML success).

                      +
                      Have Researchers and Engineers Work Together
                      +

                      The collaboration between engineering and research is essential for +quality ML products to get into production. Emphasize collaboration +across the groups and professionals!

                      +
                      Get End-to-end Pipelines Together Quickly to Demonstrate Quick Wins
                      +

                      Taking this approach makes it more likely that your ML project will +succeed in the long term. It allows you to demonstrate progress to your +leadership more effectively and clearly.

                      +
                      Educate Leadership on ML Timeline Uncertainty
                      +

                      This can be hard, as leadership is ultimately accountable for addressing +blind spots and understanding timeline risk. There are things you can +do, however, to help improve leadership’s knowledge about ML timelines. +Avoid building hype around narrow progress metrics material only to the +ML team (e.g., “We improved F1 score by 0.2 and have achieved awesome +performance!”). Instead, be realistic, communicate risk, and emphasize +real product impact (e.g., “Our model improvements should increase the +number of conversions by 10%, though we must continue to validate its +performance on additional demographic factors.) Sharing resources like +this a16z +primer +and this class from Prof. Pieter +Abbeel +can increase awareness of your company’s leadership.

                      +

                      4 - Hiring/Getting Hired

                      +

                      The AI Talent Gap

                      +

                      With the novelty of ML systems, it’s fair to say that not many people +have built real ML systems. Estimates vary from as few as 10,000 +(Element AI) to as many as 200-300,000 people (Tencent). Whatever way +you slice the numbers (contained in this blog +post), +the reality is that there is not much-experienced talent in the AI/ML +field, especially compared to the number of trained software developers +in the US (3.6M) or in the world (18.2M).

                      +

                      Sourcing

                      +

                      Because of this shallow talent pool and the skyrocketing demand, hiring +for ML positions is pretty hard. Typical ML roles come in the following +structure:

                      +
                        +
                      • +

                        ML Adjacent roles: ML product manager, DevOps, Data Engineer

                        +
                      • +
                      • +

                        Core ML Roles: ML Engineer, ML Research/ML Scientist

                        +
                      • +
                      • +

                        Business analytics roles: Data Scientist

                        +
                      • +
                      +

                      For ML adjacent roles, traditional ML knowledge is less important, as +demonstrated interest, conversational understanding, and experience can +help these professionals play an impactful role on ML teams. Let’s focus +on how to hire for the core ML roles.

                      +

                      While there’s no perfect way to hire ML engineers, there’s +definitely a wrong way to hire them, with extensive job descriptions +that demand only the best qualifications. Certainly, there are many good +examples of this bad practice floating around.

                      +
                        +
                      • +

                        Rather than this unrealistic process, consider hiring for software +engineering skills, an interest in ML, and a desire to learn. +You can always train people in the art and science of ML, +especially when they come with strong software engineering +fundamentals.

                        +
                      • +
                      • +

                        Another option is to consider adding junior talent, as many +recent grads come out with good ML knowledge nowadays.

                        +
                      • +
                      • +

                        Finally, and most importantly, be more specific about what you +need the position and professional to do. It’s impossible to find +one person that can do everything from full-fledged DevOps to +algorithm development.

                        +
                      • +
                      +

                      To hire ML researchers, here are our tips:

                      +
                        +
                      • +

                        Evaluate the quality of publications, over the quantity, with an eye +towards the originality of the ideas, the execution, etc.

                        +
                      • +
                      • +

                        Prioritize researchers that focus on important problems instead of +trendy problems.

                        +
                      • +
                      • +

                        Experience outside academia is also a positive, as these researchers +may be able to transition to industry more effectively.

                        +
                      • +
                      • +

                        Finally, keep an open mind about research talent and consider +talented people without PhDs or from adjacent fields like physics, +statistics, etc.

                        +
                      • +
                      +

                      To find quality candidates for these roles, some ideas for sourcing +are:

                      +
                        +
                      • +

                        To experiment with standard job recruiting avenues like LinkedIn, +Hired, recruiters, on-campus-recruiting, etc.

                        +
                      • +
                      • +

                        To monitor arXiv and top conferences and flag first authors of +papers you like.

                        +
                      • +
                      • +

                        To look for good implementations of papers you like.

                        +
                      • +
                      • +

                        To attend ML research conferences (NeurIPS, ICML, etc.)

                        +
                      • +
                      +

                      As you seek to recruit, stay on top of what professionals want and make +an effort to position your company accordingly. ML practitioners want to +be empowered to do great work with interesting data. Building a culture +of learning and impact can help recruit the best talent to your team. +Additionally, sell sell sell! Talent needs to know how good your team is +and how meaningful the mission can be.

                      +

                      +

                      Interviewing

                      +

                      As you interview candidates for ML roles, try to validate your +hypotheses of their strengths while testing a minimum bar on weaker +aspects. For example, make sure ML researchers can think creatively +about new ML problems while ensuring they meet a baseline for code +quality. It’s essential to test both ML knowledge and software +engineering skill for all industry professionals, though the relative +strengths can vary.

                      +

                      The actual ML interview process is much less well-defined than software +engineering interviews, though it is modeled off of it. Some helpful +inclusions are projects or exercises that test the ability to work with +ML-specific code, like take-home ML projects.

                      +

                      +

                      Finding A Job

                      +

                      To find an ML job, you can take a look at the following sources:

                      +
                        +
                      • +

                        Standard sources such as LinkedIn, recruiters, on-campus recruiting, +etc.

                        +
                      • +
                      • +

                        ML research conferences (NeurIPS, ICLR, ICML).

                        +
                      • +
                      • +

                        Apply directly (remember, there’s a talent gap!).

                        +
                      • +
                      +

                      Standing out for competitive roles can be tricky! Here are some tips in +increasing order of impressiveness that you can apply to differentiate +yourself:

                      +
                        +
                      1. +

                        Build software engineering skills (e.g., at a well-known software +company).

                        +
                      2. +
                      3. +

                        Exhibit ML interest (e.g., conference attendance, online courses +certificates, etc.).

                        +
                      4. +
                      5. +

                        Show you have a broad knowledge of ML (e.g., write blog posts +synthesizing a research area).

                        +
                      6. +
                      7. +

                        Demonstrate ability to get ML projects done (e.g., create side +projects, re-implement papers).

                        +
                      8. +
                      9. +

                        Prove you can think creatively in ML (e.g., win Kaggle competitions, +publish papers).

                        +
                      10. +
                      +

                      As you prepare for interviews, prepare for both the traditional ML +theoretical topics and the general software engineering interview (e.g., +read Cracking the Coding +Interview).

                      +

                      5 - Conclusion

                      +

                      Being a new and evolving discipline for most traditional organizations, +forming ML teams is full of known and unknown challenges. Here are the +final few take-homes:

                      +
                        +
                      • +

                        There are many different skills involved in production ML, so +there are opportunities for many people to contribute.

                        +
                      • +
                      • +

                        ML teams are becoming more standalone and more +interdisciplinary.

                        +
                      • +
                      • +

                        Managing ML teams is complex. There is no silver bullet, but +shifting toward probabilistic planning can help.

                        +
                      • +
                      • +

                        ML talent is scarce. As a manager, be specific about what skills +are must-have in the ML job descriptions. As a job seeker, it can +be brutally challenging to break in as an outsider, so use +projects as a signal to build awareness.

                        +
                      • +
                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-2a/index.html b/spring2021/lecture-2a/index.html new file mode 100644 index 00000000..ae5fb1b2 --- /dev/null +++ b/spring2021/lecture-2a/index.html @@ -0,0 +1,2296 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 2A: CNNs - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lecture 2A: CNNs

                      +

                      Video

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Lecture by Sergey Karayev.

                      +

                      In this video, we first review convolution operation, the most basic property of Convolutional Neural Networks. Then, we look at other important operations for ConvNets. Finally, we transition to looking at a classic ConvNet architecture called LeNet.

                      +
                        +
                      • 00:00 - Introduction
                      • +
                      • 01:08 - Convolutional Filters
                      • +
                      • 07:10 - Filter Stacks and ConvNets
                      • +
                      • 11:25 - Strides and Padding
                      • +
                      • 14:35 - Filter Math
                      • +
                      • 21:44 - Convolution Implementation Notes
                      • +
                      • 24:04 - Increasing the Receptive Field with Dilated Convolutions
                      • +
                      • 27:30 - Decreasing the Tensor Size with Pooling and 1x1-Convolutions
                      • +
                      • 30:54 - LeNet Architecture
                      • +
                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-2b/index.html b/spring2021/lecture-2b/index.html new file mode 100644 index 00000000..ec61a8be --- /dev/null +++ b/spring2021/lecture-2b/index.html @@ -0,0 +1,2302 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 2B: Computer Vision - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lecture 2B: Computer Vision

                      +

                      Video

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Lecture by Sergey Karayev.

                      +

                      In this video, we will review notable applications of deep learning in computer vision. First, we will tour some ConvNet architectures. Then, we will talk about localization, detection, and segmentation problems. We will conclude with more advanced methods.

                      +

                      Learn more at this website: https://paperswithcode.com/area/computer-vision

                      +
                        +
                      • 00:00 - Introduction
                      • +
                      • 02:51 - AlexNet
                      • +
                      • 05:09 - ZFNet
                      • +
                      • 06:54 - VGGNet
                      • +
                      • 09:06 - GoogLeNet
                      • +
                      • 11:57 - ResNet
                      • +
                      • 15:15 - SqueezeNet
                      • +
                      • 17:05 - Architecture Comparisons
                      • +
                      • 20:00 - Localization, Detection, and Segmentation Tasks
                      • +
                      • 24:00 - Overfeat, YOLO, and SSD Methods
                      • +
                      • 28:01 - Region Proposal Methods (R-CNN, Faster R-CNN, Mask R-CNN, U-Net)
                      • +
                      • 34:33 - Advanced Tasks (3D Shape Inference, Face Landmark Recognition, and Pose Estimation)
                      • +
                      • 37:00 - Adversarial Attacks
                      • +
                      • 40:56 - Style Transfer
                      • +
                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-3/index.html b/spring2021/lecture-3/index.html new file mode 100644 index 00000000..9a46b908 --- /dev/null +++ b/spring2021/lecture-3/index.html @@ -0,0 +1,2295 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 3: RNNs - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lecture 3: RNNs

                      +

                      Video

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Lecture by Josh Tobin.

                      +
                        +
                      • 00:00 - Introduction
                      • +
                      • 01:34 - Sequence Problems
                      • +
                      • 06:28 - Review of RNNs
                      • +
                      • 22:00 - Vanishing Gradient Issue
                      • +
                      • 27:52 - LSTMs and Its Variants
                      • +
                      • 34:10 - Bidirectionality and Attention from Google's Neural Machine Translation
                      • +
                      • 46:38 - CTC Loss
                      • +
                      • 52:12 - Pros and Cons of Encoder-Decoder LSTM Architectures
                      • +
                      • 54:55 - WaveNet
                      • +
                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-4/index.html b/spring2021/lecture-4/index.html new file mode 100644 index 00000000..c9089994 --- /dev/null +++ b/spring2021/lecture-4/index.html @@ -0,0 +1,2296 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 4: Transformers - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Lecture 4: Transformers

                      +

                      Video

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Lecture by Sergey Karayev.

                      +

                      In this video, you will learn about the origin of transfer learning in computer vision, its application in NLP in the form of embedding, NLP's ImageNet moment, and the Transformers model families.

                      +
                        +
                      • 00:00 - Introduction
                      • +
                      • 00:42 - Transfer Learning in Computer Vision
                      • +
                      • 04:00 - Embeddings and Language Models
                      • +
                      • 10:09 - NLP's ImageNet moment: ELMO and ULMFit on datasets like SQuAD, SNLI, and GLUE
                      • +
                      • 16:49 - Rise of Transformers
                      • +
                      • 18:20 - Attention in Detail: (Masked) Self-Attention, Positional Encoding, and Layer Normalization
                      • +
                      • 27:33 - Transformers Variants: BERT, GPT/GPT-2/GPT-3, DistillBERT, T5, etc.
                      • +
                      • 36:20 - GPT3 Demos
                      • +
                      • 42:53 - Future Directions
                      • +
                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-5-notes-media/image1.png b/spring2021/lecture-5-notes-media/image1.png new file mode 100644 index 00000000..03ed93e6 Binary files /dev/null and b/spring2021/lecture-5-notes-media/image1.png differ diff --git a/spring2021/lecture-5-notes-media/image2.png b/spring2021/lecture-5-notes-media/image2.png new file mode 100644 index 00000000..c3163f1b Binary files /dev/null and b/spring2021/lecture-5-notes-media/image2.png differ diff --git a/spring2021/lecture-5-notes-media/image3.png b/spring2021/lecture-5-notes-media/image3.png new file mode 100644 index 00000000..bd9b9bca Binary files /dev/null and b/spring2021/lecture-5-notes-media/image3.png differ diff --git a/spring2021/lecture-5-notes-media/image4.png b/spring2021/lecture-5-notes-media/image4.png new file mode 100644 index 00000000..1bd60c15 Binary files /dev/null and b/spring2021/lecture-5-notes-media/image4.png differ diff --git a/spring2021/lecture-5-notes-media/image5.png b/spring2021/lecture-5-notes-media/image5.png new file mode 100644 index 00000000..af208b21 Binary files /dev/null and b/spring2021/lecture-5-notes-media/image5.png differ diff --git a/spring2021/lecture-5-notes-media/image6.png b/spring2021/lecture-5-notes-media/image6.png new file mode 100644 index 00000000..01a22a48 Binary files /dev/null and b/spring2021/lecture-5-notes-media/image6.png differ diff --git a/spring2021/lecture-5-notes-media/image7.png b/spring2021/lecture-5-notes-media/image7.png new file mode 100644 index 00000000..b9f6f647 Binary files /dev/null and b/spring2021/lecture-5-notes-media/image7.png differ diff --git a/spring2021/lecture-5-notes-media/image8.png b/spring2021/lecture-5-notes-media/image8.png new file mode 100644 index 00000000..cc1d7032 Binary files /dev/null and b/spring2021/lecture-5-notes-media/image8.png differ diff --git a/spring2021/lecture-5/index.html b/spring2021/lecture-5/index.html new file mode 100644 index 00000000..cdca1722 --- /dev/null +++ b/spring2021/lecture-5/index.html @@ -0,0 +1,3313 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 5: ML Projects - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      + + + + + + + + +

                      Lecture 5: ML Projects

                      +

                      Learn how to set up Machine Learning projects like a pro. This includes an understanding of the ML lifecycle, an acute mind of the feasibility and impact, an awareness of the project archetypes, and an obsession with metrics and baselines.

                      +

                      Video

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Lecture by Josh Tobin. +Notes transcribed by James Le and Vishnu Rachakonda.

                      +

                      1 - Why Do ML Projects Fail?

                      +

                      Based on a report from +TechRepublic +a few years back, despite increased interest in adopting machine +learning (ML) in the enterprise, 85% of machine learning projects +ultimately fail to deliver on their intended promises to business. +Failure can happen for many reasons; however, a few glaring dangers will +cause any AI project to crash and burn.

                      +
                        +
                      • +

                        ML is still very much a research endeavor. Therefore it is very + challenging to aim for a 100% success rate.

                        +
                      • +
                      • +

                        Many ML projects are technically infeasible or poorly scoped.

                        +
                      • +
                      • +

                        Many ML projects never leap production, thus getting stuck at the + prototype phase.

                        +
                      • +
                      • +

                        Many ML projects have unclear success criteria because of a lack of + understanding of the value proposition.

                        +
                      • +
                      • +

                        Many ML projects are poorly managed because of a lack of interest + from leadership.

                        +
                      • +
                      +

                      2 - Lifecycle

                      +

                      It’s essential to understand what constitutes all of the activities in a +machine learning project. Typically speaking, there are four major +phases:

                      +
                        +
                      1. +

                        Planning and Project Setup: At this phase, we want to decide the problem to work on, determine the requirements and goals, figure out how to allocate resources properly, consider the ethical implications, etc.

                        +
                      2. +
                      3. +

                        Data Collection and Labeling: At this phase, we want to collect training data and potentially annotate them with ground truth, depending on the specific sources where they come from. We may find that it’s too hard to get the data, or it might be easier to label for a different task. If that’s the case, go back to phase 1.

                        +
                      4. +
                      5. +

                        Model Training and Model Debugging: At this phase, we want to implement baseline models quickly, find and reproduce state-of-the-art methods for the problem domain, debug our implementation, and improve the model performance for specific tasks. We may realize that we need to collect more data or that labeling is unreliable (thus, go back to phase 2). Or we may recognize that the task is too challenging and there is a tradeoff between project requirements (thus, go back to phase 1).

                        +
                      6. +
                      7. +

                        Model Deploying and Model Testing: At this phase, we want to pilot the model in a constrained environment (i.e., in the lab), write tests to prevent regressions, and roll the model into production. We may see that the model doesn’t work well in the lab, so we want to keep improving the model’s accuracy (thus, go back to phase 3). Or we may want to fix the mismatch between training data and production data by collecting more data and mining hard cases (thus go back to phase 2). Or we may find out that the metric picked doesn’t actually drive downstream user behavior, and/or performance in the real world isn’t great. In such situations, we want to revisit the projects’ metrics and requirements (thus, go back to phase 1).

                        +
                      8. +
                      +

                      +

                      Besides the per-project activities mentioned above, there are two other +things that any ML team will need to solve across any projects they get +involved with: (1) building the team and hiring people; and (2) setting +up infrastructure and tooling to build ML systems repeatedly and at +scale.

                      +

                      Additionally, it might be useful to understand state-of-the-art results +in your application domain so that you know what’s possible and what to +try next.

                      +

                      3 - Prioritizing Projects

                      +

                      To prioritize projects to work on, you want to find high-impact problems +and assess the potential costs associated with them. The picture below +shows a general framework that encourages us to target projects with +high impact and high feasibility.

                      +

                      +

                      High Impact

                      +

                      There are no silver bullets to find high-impact ML problems to work on, +but here are a few useful mental models:

                      +
                        +
                      • +

                        Where can you take advantage of cheap prediction?

                        +
                      • +
                      • +

                        Where is there friction in your product?

                        +
                      • +
                      • +

                        Where can you automate complicated manual processes?

                        +
                      • +
                      • +

                        What are other people doing?

                        +
                      • +
                      +

                      Cheap Prediction

                      +

                      In the book “Prediction +Machines,” +the authors (Ajay Agrawal, Joshua Gans, and Avi Goldfarb) come up with +an excellent mental model on the economics of Artificial Intelligence: +As AI reduces the cost of prediction and prediction is central for +decision making, cheap predictions would be universal for problems +across business domains. Therefore, you should look for projects where +cheap predictions will have a huge business impact.

                      +

                      Product Needs

                      +

                      Another lens is to think about what your product needs. In the article +“Three Principles for Designing ML-Powered +Products,” +the Spotify Design team emphasizes the importance of building ML from a +product perspective and looking for parts of the product experience +with high friction. Automating those parts is exactly where there is a +lot of impact for ML to make your business better.

                      +

                      ML Strength

                      +

                      In his popular blog post “Software +2.0,” +Andrej Karpathy contrasts software 1.0 (which are traditional programs +with explicit instructions) and software 2.0 (where humans specify +goals, while the algorithm searches for a program that works). Software +2.0 programmers work with datasets, which get compiled via +optimization — which works better, more general, and less +computationally expensive. Therefore, you should look for complicated +rule-based software where we can learn the rules instead of programming +them.

                      +

                      Inspiration From Others

                      +

                      Instead of reinventing the wheel, you can look at what other companies +are doing. In particular, check out papers from large frontier +organizations (Google, Facebook, Nvidia, Netflix, etc.) and blog posts +from top earlier-stage companies (Uber, Lyft, Spotify, Stripe, etc.).

                      +

                      Here is a list of excellent ML use cases to check out (credit to Chip +Huyen’s ML Systems Design Lecture 2 +Note):

                      + +

                      High Feasibility

                      +

                      The three primary cost drivers of ML projects in order of importance are +data availability, accuracy requirement, and problem difficulty.

                      +

                      +

                      Data Availability

                      +

                      Here are the questions you need to ask concerning the data availability:

                      +
                        +
                      • +

                        How hard is it to acquire data?

                        +
                      • +
                      • +

                        How expensive is data labeling?

                        +
                      • +
                      • +

                        How much data will be needed?

                        +
                      • +
                      • +

                        How stable is the data?

                        +
                      • +
                      • +

                        What are the data security requirements?

                        +
                      • +
                      +

                      Accuracy Requirement

                      +

                      Here are the questions you need to ask concerning the accuracy +requirement:

                      +
                        +
                      • +

                        How costly are wrong predictions?

                        +
                      • +
                      • +

                        How frequently does the system need to be right to be useful?

                        +
                      • +
                      • +

                        What are the ethical implications?

                        +
                      • +
                      +

                      It is worth noting that ML project costs tend to scale +super-linearly in the accuracy requirement. The fundamental reason is +that you typically need a lot more data and more high-quality labels to +achieve high accuracy numbers.

                      +

                      Problem Difficulty

                      +

                      Here are the questions you need to ask concerning the problem +difficulty:

                      +
                        +
                      • +

                        Is the problem well-defined?

                        +
                      • +
                      • +

                        Is there good published work on similar problems?

                        +
                      • +
                      • +

                        What are the computing requirements?

                        +
                      • +
                      • +

                        Can a human do it?

                        +
                      • +
                      +

                      So what’s still hard in machine learning? As a caveat, it’s historically +very challenging to predict what types of problems will be difficult for +ML to solve in the future. But generally speaking, both unsupervised +learning and reinforcement learning are still hard, even though +they show promise in limited domains where tons of data and compute are +available.

                      +

                      Zooming into supervised learning, here are three types of hard +problems:

                      +
                        +
                      • +

                        Output is complex: These are problems where the output is + high-dimensional or ambiguous. Examples include 3D reconstruction, + video prediction, dialog systems, open-ended recommendation + systems, etc.

                        +
                      • +
                      • +

                        Reliability is required: These are problems where high precision + and robustness are required. Examples include systems that can + fail safely in out-of-distribution scenarios, is robust to + adversarial attacks, or needs to tackle highly precise tasks.

                        +
                      • +
                      • +

                        Generalization is required: These are problems with + out-of-distribution data or in the domains of reasoning, planning, + and causality. Examples include any systems for self-driving + vehicles or any systems that deal with small data.

                        +
                      • +
                      +

                      Finally, this is a nice checklist for you to run an ML feasibility +assessment:

                      +
                        +
                      • +

                        Are you sure that you need ML at all?

                        +
                      • +
                      • +

                        Put in the work upfront to define success criteria with all of the + stakeholders.

                        +
                      • +
                      • +

                        Consider the ethics of using ML.

                        +
                      • +
                      • +

                        Do a literature review.

                        +
                      • +
                      • +

                        Try to build a labeled benchmark dataset rapidly.

                        +
                      • +
                      • +

                        Build a minimal viable product with manual rules

                        +
                      • +
                      • +

                        Are you “really sure” that you need ML at all?

                        +
                      • +
                      +

                      4 - Archetypes

                      +

                      So far, we’ve talked about the lifecycle and the impact of all machine +learning projects. Ultimately, we generally want these projects, or +applications of machine learning, to be useful for products. As we +consider how ML can be applied in products, it’s helpful to note that +there are common machine learning product archetypes or recurrent +patterns through which machine learning is applied to products. You can +think of these as “mental models” you can use to assess your project and +easily prioritize the needed resources.

                      +

                      There are three common archetypes in machine learning projects: +Software 2.0, Human-in-the-loop, and autonomous systems. +They are shown in the table below, along with common examples and +questions. We’ll dive deeper into each.

                      + + + + + + + + + + + + + + + + + + + + + + + + + +
                      ArchetypeExamplesQuestions
                      Software 2.0

                      - Improve code completion in IDE

                      +

                      - Build customized recommendation system

                      +

                      - Build a better video game AI

                      - Do your models truly improve performance?

                      +

                      - Does performance improvement generate business value?

                      +

                      - Do performance improvements lead to a data flywheel?

                      Human-in-the-loop

                      - Turn sketches into slides

                      +

                      - Email auto-completion

                      +

                      - Help radiologists do job faster

                      - How good does the system need to be to be useful?

                      +

                      - How can you collect enough data to make it good?

                      Autonomous Systems

                      - Full self-driving

                      +

                      - Automated customer support

                      +

                      - Automated website design

                      - What is an acceptable failure rate for the system?

                      +

                      - How can you guarantee that it won’t exceed the failure rate?

                      +

                      - How inexpensively can you label data from the system?

                      + +

                      Software 2.0

                      +

                      Software 2.0, which we previously alluded to from the Karpathy +article, is +defined as “augmenting existing rules-based or deterministic software +with machine learning, a probabilistic approach.” Examples of this are +taking a code completer in an IDE and improving the experience for the +user by adding an ML component. Rather than suggesting a command based +solely on the leading characters the programmer has written, you might +add a model that suggests commands based on previous commands the +programmer has written.

                      +

                      As you build a software 2.0 project, strongly consider the concept of +the data flywheel. For certain ML projects, as you improve your +model, your product will get better and more users will engage with the +product, thereby generating more data for the model to get even better. +It’s a classic virtuous cycle and truly the gold standard for ML +projects.

                      +

                      +

                      In embarking on creating a data flywheel, critically consider where the +model could fail in relation to your product. For example, do more users +lead to collecting more data that is useful for improving your model? An +actual system needs to be set up to capture this data and ensure that +it's meaningful for the ML lifecycle. Furthermore, consider whether more +data will lead to a better model (your job as an ML practitioner) or +whether a better model and better predictions will actually lead to +making the product better. Ideally, you should have a quantitative +assessment of what makes your product “better” and map model improvement +to it.

                      +

                      Human-in-the-Loop (HIL)

                      +

                      HIL systems are defined as machine learning systems where the output +of your model will be reviewed by a human before being executed in the +real world. For example, consider translating sketches into slides. An +ML algorithm can take a sketch’s input and suggest to a user a +particular slide design. Every output of the ML model is considered and +executed upon by a human, who ultimately has to decide on the slide’s +design.

                      +

                      Autonomous Systems

                      +

                      Autonomous systems are defined as machine learning systems where the +system itself makes decisions or engages in outputs that are almost +never reviewed by a human. Canonically, consider the self-driving car!

                      +

                      Feasibility

                      +

                      +

                      Let’s discuss how the product archetypes relate back to project +priority. In terms of feasibility and impact, the two axes on which +we consider priority, software 2.0 tends to have high feasibility but +potentially low impact. The existing system is often being optimized +rather than wholly replaced. However, this status with respect to +priority is not static by any means. Building a data flywheel into your +software 2.0 project can improve your product’s impact by improving the +model’s performance on the task and future ones.

                      +

                      In the case of human-in-the-loop systems, their feasibility and impact +sit squarely in between autonomous systems and software 2.0. HIL +systems, in particular, can benefit disproportionately in their +feasibility and impact from effective product design, which naturally +takes into account how humans interact with technology and can mitigate +risks for machine learning model behavior. Consider how the Facebook +photo tagging algorithm is implemented. Rather than tagging the user +itself, the algorithm frequently asks the user to tag themselves. This +effective product design allows the model to perform more effectively in +the user’s eye and reduces the impact of false classifications. +Grammarly similarly solicits user +input as part of its product design through offering explanations. +Finally, recommender systems also implement this idea. In general, +good product design can smooth the rough edges of ML (check out the +concept of designing collaborative +AI).

                      +

                      There are industry-leading resources that can help you merge product +design and ML. Apple’s ML product design +guidelines +suggest three key questions to anyone seeking to put ML into a product:

                      +
                        +
                      1. +

                        What role does ML play in your product?

                        +
                      2. +
                      3. +

                        How can you learn from your users?

                        +
                      4. +
                      5. +

                        How should your app handle mistakes?

                        +
                      6. +
                      +

                      Associated with each question is a set of design paradigms that help +address the answers to each question. There are similarly great +resources from +Microsoft +and +Spotify.

                      +

                      Finally, autonomous systems can see their priority improved by improving +their feasibility. Specifically, you can add humans in the loop or +reduce the system’s natural autonomy to improve its feasibility. In the +case of self-driving cars, many companies add safety drivers as +guardrails to improve autonomous systems. In +Voyage’s case, they take a more dramatic +approach of constraining the problem for the autonomous system: they +only run self-driving cars in senior living communities, a narrow subset +of the broader self-driving problem.

                      +

                      5 - Metrics

                      +

                      So far, we’ve talked about the overall ideas around picking projects and +structuring them based on their archetypes and the specific +considerations that go into them. Now, we’ll shift gears and be a little +more tactical to focus on metrics and baselines, which will help you +execute projects more effectively.

                      +

                      Choosing a Metric

                      +

                      Metrics help us evaluate models. There’s a delicate balance between +the real world (which is always messy and multifaceted) and the machine +learning paradigm (which optimizes a single metric) in choosing a +metric. In practical production settings, we often care about multiple +dimensions of performance (i.e., accuracy, speed, cost, etc.). The +challenge is to reconcile all the possible evaluation methods with the +reality that ML systems work best at optimizing a single number. How can +we balance these competing needs in building an ML product?

                      +

                      As you start evaluating models, choose a single metric to focus on +first, such as precision, accuracy, recall, etc. This can serve as an +effective first filter of performance. Subsequently, you can put +together a formula that combines all the metrics you care about. Note +that it’s important to be flexible and regularly update this formula as +your models or the requirements for the product change.

                      +

                      Combining Metrics

                      +

                      Two simple ways of combining metrics into a formula are averaging +and thresholding.

                      +

                      Averaging is less common but easy and intuitive; you can just take a +simple average or a weighted average of the model’s metrics and pick the +highest average.

                      +

                      More practically, you can apply a threshold evaluation to the model’s +metrics. In this method, out of n evaluation metrics, you threshold n-1 +and optimize the nth metric. For example, if we look at a model’s +precision, memory requirement, and cost to train, we might threshold the +memory requirement (no more than X MB) and the cost (no more than $X) +and optimize precision (as high as possible). As you choose which +metrics to threshold and what to set their threshold values to, make +sure to consider domain-specific needs and the actual values of the +metrics (how good/bad they might be).

                      +

                      +

                      6 - Baselines

                      +

                      In any product development process, setting expectations properly is +vital. For machine learning products, baselines help us set +expectations for how well our model will perform. In particular, +baselines set a useful lower bound for our model’s performance. What’s +the minimum expectation we should have for a model’s performance? The +better defined and clear the baseline is, the more useful it is for +setting the right expectations. Examples of baselines are human +performance on a similar task, state-of-the-art models, or even simple +heuristics.

                      +

                      Baselines are especially important for helping decide the next steps. +Consider the example below of two models with the same loss curve but +differing performance with respect to the baseline. Clearly, they +require different action items! As seen below, on the left, where we are +starting to approach or exceed the baseline, we need to be mindful of +overfitting and perhaps incorporate regularization of some sort. On the +right, where the baseline hugely exceeds our model’s performance, we +clearly have a lot of work to do to improve the model and address its +underfitting.

                      +

                      +

                      There are a number of sources to help us define useful baselines. +Broadly speaking, there are external baselines (baselines defined by +others) or internal baselines you can define yourself. With internal +baselines, in particular, you don’t need anything too complicated, or +even something with ML! Simple tests like averaging across your dataset +can help you understand if your model is achieving meaningful +performance. If your model can’t exceed a simple baseline like this, you +might need to really re-evaluate the model.

                      +

                      Human baselines are a particularly powerful form of baseline since we +often seek to replace or augment human actions. In creating these +baselines, note that there’s usually an inverse relationship between the +quality of the baseline and the ease of data collection. In a nutshell, +the harder it is to get a human baseline, the better and more useful +it probably is.

                      +

                      +

                      For example, a Mechanical Turk-created baseline is easy to generate +nowadays, but the quality might be hit or miss because of the variance +in annotators. However, trained, specialized annotators can be hard to +acquire, but the specificity of their knowledge translates into a great +baseline. Choosing where to situate your baseline on this range, from +low quality/easy to high quality/hard, depends on the domain. +Concentrating data collection strategically, ideally in classes where +the model is least performant, is a simple way of improving the quality +of the baseline.

                      +

                      TLDR

                      +
                        +
                      1. +

                        Machine learning projects are iterative. Deploy something fast to + begin the cycle.

                        +
                      2. +
                      3. +

                        Choose projects with high impact and low cost of wrong predictions.

                        +
                      4. +
                      5. +

                        The secret sauce to make projects work well is to build automated + data flywheels.

                        +
                      6. +
                      7. +

                        In the real world, you care about many things, but you should always + have just one to work on.

                        +
                      8. +
                      9. +

                        Good baselines help you invest your effort the right way.

                        +
                      10. +
                      +

                      Further Resources

                      + + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-6-notes-media/Infra-Tooling1.png b/spring2021/lecture-6-notes-media/Infra-Tooling1.png new file mode 100644 index 00000000..5b99e591 Binary files /dev/null and b/spring2021/lecture-6-notes-media/Infra-Tooling1.png differ diff --git a/spring2021/lecture-6-notes-media/Infra-Tooling2.png b/spring2021/lecture-6-notes-media/Infra-Tooling2.png new file mode 100644 index 00000000..28b7c97d Binary files /dev/null and b/spring2021/lecture-6-notes-media/Infra-Tooling2.png differ diff --git a/spring2021/lecture-6-notes-media/Infra-Tooling3.png b/spring2021/lecture-6-notes-media/Infra-Tooling3.png new file mode 100644 index 00000000..e7cd953f Binary files /dev/null and b/spring2021/lecture-6-notes-media/Infra-Tooling3.png differ diff --git a/spring2021/lecture-6-notes-media/Infra-Tooling4.png b/spring2021/lecture-6-notes-media/Infra-Tooling4.png new file mode 100644 index 00000000..24a8140d Binary files /dev/null and b/spring2021/lecture-6-notes-media/Infra-Tooling4.png differ diff --git a/spring2021/lecture-6-notes-media/Infra-Tooling5.png b/spring2021/lecture-6-notes-media/Infra-Tooling5.png new file mode 100644 index 00000000..07083dcb Binary files /dev/null and b/spring2021/lecture-6-notes-media/Infra-Tooling5.png differ diff --git a/spring2021/lecture-6-notes-media/Infra-Tooling6.png b/spring2021/lecture-6-notes-media/Infra-Tooling6.png new file mode 100644 index 00000000..43f7699d Binary files /dev/null and b/spring2021/lecture-6-notes-media/Infra-Tooling6.png differ diff --git a/spring2021/lecture-6-notes-media/Infra-Tooling7.png b/spring2021/lecture-6-notes-media/Infra-Tooling7.png new file mode 100644 index 00000000..4424b9bb Binary files /dev/null and b/spring2021/lecture-6-notes-media/Infra-Tooling7.png differ diff --git a/spring2021/lecture-6/index.html b/spring2021/lecture-6/index.html new file mode 100644 index 00000000..608722e9 --- /dev/null +++ b/spring2021/lecture-6/index.html @@ -0,0 +1,2772 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 6: MLOps Infrastructure & Tooling - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      + + + + + + + + +

                      Lecture 6: MLOps Infrastructure & Tooling

                      +

                      Video

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Lecture by Sergey Karayev. +Notes transcribed by James Le +and Vishnu Rachakonda.

                      +

                      1 - Dream vs. Reality for ML Practitioners

                      +

                      The dream of ML practitioners is that we are provided the data, and somehow we build an optimal machine learning prediction system available as a scalable API or an edge deployment. That deployment then generates more data for us, which can be used to improve our system.

                      +

                      The reality is that you will have to:

                      +
                        +
                      • Aggregate, process, clean, label, and version the data
                      • +
                      • Write and debug model code
                      • +
                      • Provision compute
                      • +
                      • Run many experiments and review the results
                      • +
                      • Discover that you did something wrong or maybe try a different architecture -> Write more code and provision more compute
                      • +
                      • Deploy the model when you are happy
                      • +
                      • Monitor the predictions that the model makes on production data so that you can gather some good examples and feed them back to the initial data flywheel loop
                      • +
                      +

                      +

                      For example, the slide above is from Andrej Karpathy’s talk at PyTorch Devcon 2019 discussing Tesla’s self-driving system. Their dream is to build a system that goes from the data gathered through their training, evaluation, and inference processes and gets deployed on the cars. As people drive, more data will be collected and added back to the training set. As this process repeats, Tesla’s ML engineers can all go on vacation :)

                      +

                      +

                      The picture above (from the famous Google paper “Machine Learning: The High-Interest Credit Card of Technical Debt”) shows that the ML code portion in a real-world ML system is a lot smaller than the infrastructure needed for its support. As ML projects move from small-scale research experiments to large-scale industry deployments, your organization most likely will require a massive amount of infrastructure to support large inferences, distributed training, data processing pipelines, reproducible experiments, model monitoring, etc.

                      +

                      2 - Three Buckets of Tooling Landscape

                      +

                      +

                      We can break down the landscape of all this necessary infrastructure into three buckets: data, training/evaluation, and deployment.

                      +
                        +
                      • The data bucket includes the data sources, data lakes/warehouses, data processing, data exploration, data versioning, and data labeling.
                      • +
                      • The training/evaluation bucket includes compute sources, resource management, software engineering, frameworks and distributed training libraries, experiment management, and hyper-parameter tuning.
                      • +
                      • The deployment bucket includes continuous integration and testing, edge deployment, web deployment, monitoring, and feature store.
                      • +
                      +

                      There are also several vendors offering “all-in-one” MLOps solutions that cover all three buckets. This lecture focuses on the training/evaluation bucket.

                      +

                      3 - Software Engineering

                      +

                      When it comes to writing deep learning code, Python is the clear programming language of choice. As a general-purpose language, Python is easy to learn and easily accessible, enabling you to find skilled developers on a faster basis. It has various scientific libraries for data wrangling and machine learning (Pandas, NumPy, Scikit-Learn, etc.). Regardless of whether your engineering colleagues write code in a lower-level language like C, C++, or Java, it is generally neat to join different components with a Python wrapper.

                      +

                      When choosing your IDEs, there are many options out there (Vim, Emacs, Sublime Text, Jupyter, VS Code, PyCharm, Atom, etc.). Each of these has its uses in any application, and you’re better to switch between them to remain agile without relying heavily on shortcuts and packages. It also helps teams work better if they can jump into different IDEs and comment/collaborate with other colleagues. In particular, Visual Studio Code makes for a very nice Python experience, where you have access to built-in git staging and diffing, peek at documentation, linter code as you write, and open projects remotely.

                      +

                      Jupyter Notebooks have rapidly grown in popularity among data scientists to become the standard for quick prototyping and exploratory analysis. For example, Netflix based all of their machine learning workflows on them, effectively building a whole notebook infrastructure to leverage them as a unifying layer for scheduling workflows. Jeremy Howard develops his fast.ai codebase entirely with notebooks and introduces a project called nbdev that shows people how to develop well-tested code in a notebook environment.

                      +

                      However, there are many problems with using notebooks as a last resort when working in teams that aim to build machine/deep learning products. Alexander Mueller's blog post outlines the five reasons why they suck:

                      +
                        +
                      • It is challenging to enable good code versioning because notebooks are big JSON files that cannot be merged automatically.
                      • +
                      • Notebook “IDE” is primitive, as they have no integration, no lifting, and no code-style correction. Data scientists are not software engineers, and thus, tools that govern their code quality and help improve it are very important.
                      • +
                      • It is very hard to structure code reasonably, put code into functions, and develop tests while working in notebooks. You better develop Python scripts based on test-driven development principles as soon as you want to reproduce some experiments and run notebooks frequently.
                      • +
                      • Notebooks have out-of-order execution artifacts, meaning that you can easily destroy your current working state when jumping between cells of notebooks.
                      • +
                      • It is also difficult to run long or distributed tasks. If you want to handle big datasets, better pull your code out of notebooks, start a Python folder, create fixtures, write tests, and then deploy your application to a cluster.
                      • +
                      +

                      +

                      Recently, a new application framework called Streamlit was introduced. The creators of the framework wanted machine learning engineers to be able to create beautiful apps without needing a tools team; in other words, these internal tools should arise as a natural byproduct of the machine learning workflow. According to the launch blog post, here are the core principles of Streamlit:

                      +
                        +
                      • Embrace Python scripting: Streamlit apps are just scripts that run from top to bottom. There’s no hidden state. You can factor your code with function calls. If you know how to write Python scripts, you can write Streamlit apps.
                      • +
                      • Treat widgets as variables: There are no callbacks in Streamlit. Every interaction simply reruns the script from top to bottom. This approach leads to a clean codebase.
                      • +
                      • Reuse data and computation: Streamlit introduces a cache primitive that behaves like a persistent, immutable-by-default data store that lets Streamlit apps safely and effortlessly reuse information.
                      • +
                      +

                      Right now, Streamlit is building features that enable sharing machine learning projects to be as easy as pushing a web app to Heroku.

                      +

                      We recommend using conda to set up your Python and CUDA environments and pip-tools to separate mutually compatible versions of all requirements for our lab.

                      +

                      4 - Compute Hardware

                      +

                      We can break down the compute needs into an early-stage development step and a late-stage training/evaluation step.

                      +
                        +
                      • During the development stage, we write code, debug models, and look at the results. It’d be nice to be able to compile and train models via an intuitive GUI quickly.
                      • +
                      • During the training/evaluation stage, we design model architecture, search for hyper-parameters, and train large models. It’d be nice to launch experiments and review results easily.
                      • +
                      +

                      Compute matters with each passing year due to the fact that the results came out of deep learning are using more and more compute (check out this 2018 report from OpenAI). Looking at recent Transformer models, while OpenAI’s GPT-3 has not been fully commercialized yet, Google already released the Switch Transformer with orders of magnitude larger in the number of parameters.

                      +

                      So should you get your own hardware, go straight to the cloud, or use on-premise options?

                      +

                      GPU Basics

                      +

                      This is basically an NVIDIA game, as they are the only provider of good deep learning GPUs. However, Google’s TPUs are the fastest, which is available only on GCP.

                      +

                      There is a new NVIDIA architecture every year: Kepler -> Pascal -> Volta -> Turing -> Ampere. NVIDIA often released the server version of the cards first, then the “enthusiast” version, and finally the consumer version. If you use these cards for business purposes, then you suppose to use the server version.

                      +

                      GPUs have a different amount of RAM. You can only compute on the data that is on the GPU memory. The more data you can fit on the GPU, the larger your batches are, the faster your training goes.

                      +

                      For deep learning, you use 32-bit precision. In fact, starting with the Volta architecture, NVIDIA developed tensor cores that are specifically designed for deep learning operations (mixed-precision between 32 and 16 bit). Tensor Cores reduce the used cycles needed for calculating multiply and addition operations and the reliance on repetitive shared memory access, thus saving additional cycles for memory access. This is very useful for the convolutional/Transformer models that are prevalent nowadays.

                      +

                      +

                      Let’s go through different GPU architectures:

                      +
                        +
                      • Kepler/Maxwell: They are 2-4x slower than the Pascal/Volta ones below. You should not buy these old guards (K80).
                      • +
                      • Pascal: They are in the 1080 Ti cards from 2017, which are still useful if bought used (especially for recurrent neural networks). P100 is the equivalent cloud offering.
                      • +
                      • Volta/Turing: These are the preferred choices over the Kepler and Pascal because of their support for 16-bit mixed-precision via tensor cores. Hardware options are 2080 Ti and Titan RTX, while the cloud option is V100.
                      • +
                      • Ampere: This architecture is available in the latest hardware (3090) and cloud (A100) offerings. They have the most tensor cores, leading to at least 30% speedup over Turing.
                      • +
                      +

                      You can check out this recent GPU benchmark from Lambda Labs and consult Tim Dettmers’ advice on which GPUs to get.

                      +

                      Cloud Options

                      +

                      Amazon Web Services, Google Cloud Platform, and Microsoft Azure are the cloud heavyweights with largely similar functions and prices. There are also startups like Lambda Labs and Corewave that provide cloud GPUs.

                      +

                      On-Prem Options

                      +

                      You can either build your own or buy pre-built devices from vendors like Lambda Labs, NVIDIA, Supermicro, Cirrascale, etc.

                      +

                      Recommendations

                      +

                      Even though the cloud is expensive, it’s hard to make on-prem devices scale past a certain point. Furthermore, dev-ops things are easier to be done in the cloud than to be set up by yourself. And if your machine dies or requires maintenance, that will be a constant headache if you are responsible for managing it.

                      +

                      Here are our recommendations for three profiles:

                      +
                        +
                      • Hobbyists: Build your own machine (maybe a 4x Turing or a 2x Ampere PC) during development. Either use the same PC or use cloud instances during training/evaluation.
                      • +
                      • Startups: Buy a sizeable Lambda Labs machine for every ML scientist during development. Buy more shared server machines or use cloud instances during training/evaluation.
                      • +
                      • Larger companies: Buy an even more powerful machine for every ML scientist during development. Use cloud with fast instances with proper provisioning and handling of failures during training/evaluation.
                      • +
                      +

                      5 - Resource Management

                      +

                      With all the resources we have discussed (compute, dependencies, etc.), our challenge turns to manage them across the specific use cases we may have. Across all the resources, our goal is always to be able to easily experiment with the necessary resources to achieve the desired application of ML for our product.

                      +

                      For this challenge of allocating resources to experimenting users, there are some common solutions:

                      +
                        +
                      1. Script a solution ourselves: In theory, this is the simplest solution. We can check if a resource is free and then lock it if a particular user is using it or wants to.
                      2. +
                      3. SLURM: If we don't want to write the script entirely ourselves, standard cluster job schedulers like SLURM can help us. The workflow is as follows: First, a script defines a job’s requirements. Then, the SLURM queue runner analyzes this and then executes the jobs on the correct resource.
                      4. +
                      5. Docker/Kubernetes: The above approach might still be too manual for your needs, in which case you can turn to Docker/Kubernetes. Docker packages the dependency stack into a lighter-than-VM package called a container (that excludes the OS). Kubernetes lets us run these Docker containers on a cluster. In particular, Kubeflow is an OSS project started by Google that allows you to spawn/manage Jupyter notebooks and manage multi-step workflows. It also has lots of plug-ins for extra processes like hyperparameter tuning and model deployment. However, Kubeflow can be a challenge to setup.
                      6. +
                      7. Custom ML software: There’s a lot of novel work and all-in-one solutions being developed to provision compute resources for ML development efficiently. Platforms like AWS Sagemaker, Paperspace Gradient, and Determined AI are advancing. Newer startups like Anyscale and Grid.AI (creators of PyTorch Lightning) are also tackling this. Their vision is around allowing you to seamlessly go from training models on your computer to running lots of training jobs in the cloud with a simple set of SDK commands.
                      8. +
                      +

                      6 - Frameworks and Distributed Training

                      +

                      Deep Learning Frameworks

                      +

                      If you’ve built a deep learning model in the last few years, you’ve probably used a deep learning framework. Frameworks like TensorFlow have crucially shaped the development of the deep learning revolution. The reality is that deep learning frameworks have existed for a while. Projects like Theano and Torch have been around for 10+ years. In contemporary use, there are three main frameworks we’ll focus on - TensorFlow, Keras, and PyTorch. We evaluate frameworks based on their utility for production and development.

                      +

                      +

                      When TensorFlow came out in 2015, it was billed heavily as a production-optimized DL framework with an underlying static optimized graph that could be deployed across compute environments. However, TF 1.0 had a pretty unpleasant development experience; in addition to developing your models, you had to consider the underlying execution graph you were describing. This kind of “meta-development” posed a challenge for newcomers. The Keras project solved many of these issues by offering a simpler way to define models, and eventually became a part of TensorFlow. PyTorch, when it was introduced in 2017, offered a polar opposite to TensorFlow. It made development super easy by consisting almost exclusively of simple Python commands, but was not designed to be fast at scale.

                      +

                      Using TF/Keras or PyTorch is the current recommended way to build deep learning models unless you have a powerful reason not to. Essentially, both have converged to pretty similar points that balance development and production. TensorFlow adopted eager execution by default and became a lot easier to develop quickly in. PyTorch subsumed Caffe2 and became much faster as a result, specifically by adding the ability to compile speedier model artifacts. Nowadays, PyTorch has a lot of momentum, likely due to its ease of development. Newer projects like fast.ai and PyTorch Lighting add best practices and additional functionality to PyTorch, making it even more popular. According to this 2018 article on The Gradient, more than 80% of submissions are in PyTorch in academic projects.

                      +

                      All these frameworks may seem like excessive quibbling, especially since PyTorch and TensorFlow have converged in important ways. Why do we even require such extensive frameworks?

                      +

                      It’s theoretically possible to define entire models and their required matrix math (e.g., a CNN) in NumPy, the classic Python numerical computing library. However, we quickly run into two challenges: back-propagating errors through our model and running the code on GPUs, which are powerful computation accelerators. For these issues to be addressed, we need frameworks to help us with auto-differentiation, an efficient way of computing the gradients, and software compatibility with GPUs, specifically interfacing with CUDA. Frameworks allow us to abstract the work required to achieve both features, while also layering in valuable abstractions for all the latest layer designs, optimizers, losses, and much more. As you can imagine, the abstractions offered by frameworks save us valuable time on getting our model to run and allow us to focus on optimizing our model.

                      +

                      New projects like JAX and HuggingFace offer different or simpler abstractions. JAX focuses primarily on fast numerical computation with autodiff and GPUs across machine learning use cases (not just deep learning). HuggingFace abstracts entire model architectures in the NLP realm. Instead of loading individual layers, HuggingFace lets you load the entirety of a contemporary mode (along with weights)l like BERT, tremendously speeding up development time. HuggingFace works on both PyTorch and TensorFlow.

                      +

                      Distributed Training

                      +

                      Distributed training is a hot topic as the datasets and the models we train become too large to work on a single GPU. It’s increasingly a must-do. The important thing to note is that distributed training is a process to conduct a single model training process; don’t confuse it with training multiple models on different GPUs. There are two approaches to distributed training: data parallelism and model parallelism.

                      +
                      Data Parallelism
                      +

                      Data parallelism is quite simple but powerful. If we have a batch size of X samples, which is too large for one GPU, we can split the X samples evenly across N GPUs. Each GPU calculates the gradients and passes them to a central node (either a GPU or a CPU), where the gradients are averaged and backpropagated through the distributed GPUs. This paradigm generally results in a linear speed-up time (e.g., two distributed GPUs results in a ~2X speed-up in training time). In modern frameworks like PyTorch, PyTorch Lightning, and even in schedulers like SLURM, data-parallel training can be achieved simply by specifying the number of GPUs or calling a data parallelism-enabling object (e.g., torch.nn.DataParallel). Other tools like Horovod (from Uber) use non-framework-specific ways of enabling data parallelism (e.g., MPI, a standard multiprocessing framework). Ray, the original open-source project from the Anyscale team, was designed to enable general distributed computing applications in Python and can be similarly applied to data-parallel distributed training.

                      +
                      Model Parallelism
                      +

                      Model parallelism is a lot more complicated. If you can’t fit your entire model’s weights on a single GPU, you can split the weights across GPUs and pass data through each to train the weights. This usually adds a lot of complexity and should be avoided unless absolutely necessary. A better solution is to pony up for the best GPU available, either locally or in the cloud. You can also use gradient checkpointing, a clever trick wherein you write some gradients to disk as you compute them and load them only as you need them for updates.  New work is coming out to make this easier (e.g., research and framework maturity).

                      +

                      7 - Experiment Management

                      +

                      As you run numerous experiments to refine your model, it’s easy to lose track of code, hyperparameters, and artifacts. Model iteration can lead to lots of complexity and messiness. For example, you could be monitoring the learning rate’s impact on your model’s performance metric. With multiple model runs, how will you monitor the impact of the hyperparameter?

                      +

                      A low-tech way would be to manually track the results of all model runs in a spreadsheet. Without great attention to detail, this can quickly spiral into a messy or incomplete artifact. Dedicated experiment management platforms are a remedy to this issue. Let’s cover a few of the most common ones:

                      +
                        +
                      • TensorBoard: This is the default experiment tracking platform that comes with TensorFlow. As a pro, it’s easy to get started with. On the flip side, it’s not very good for tracking and comparing multiple experiments. It’s also not the best solution to store past work.
                      • +
                      • MLFlow: An OSS project from Databricks, MLFlow is a complete platform for the ML lifecycle. They have great experiment and model run management at the core of their platform. Another open-source project, Keepsake, recently came out focused solely on experiment tracking.
                      • +
                      • Paid platforms (Comet.ml, Weights and Biases, Neptune): Finally, outside vendors offer deep, thorough experiment management platforms, with tools like code diffs, report writing, data visualization, and model registering features. In our labs, we will use Weights and Biases.
                      • +
                      +

                      8 - Hyperparameter Tuning

                      +

                      To finalize models, we need to ensure that we have the optimal hyperparameters. Since hyperparameter optimization (as this process is called) can be a particularly compute-intensive process, it’s useful to have software that can help. Using specific software can help us kill underperforming model runs with bad hyperparameters early (to save on cost) or help us intelligently sweep ranges of hyperparameter values. Luckily, there’s an increasing number of software providers that do precisely this:

                      +
                        +
                      • SigOpt offers an API focused exclusively on efficient, iterative hyperparameter optimization. Specify a range of values, get SigOpt’s recommended hyperparameter settings, run the model and return the results to SigOpt, and repeat the process until you’ve found the best parameters for your model.
                      • +
                      • Rather than an API, Ray Tune offers a local software (part of the broader Ray ecosystem) that integrates hyperparameter optimization with compute resource allocation. Jobs are scheduled with specific hyperparameters according to state-of-the-art methods, and underperforming jobs are automatically killed.
                      • +
                      • Weights and Biases also has this feature! With a YAML file specification, we can specify a hyperparameter optimization job and perform a “sweep,” during which W&B sends parameter settings to individual “agents” (our machines) and compares performance.
                      • +
                      +

                      9 - “All-In-One” Solutions

                      +

                      Some platforms integrate all the aspects of the applied ML stack we’ve discussed (experiment tracking, optimization, training, etc.) and wrap them into a single experience. To support the “lifecycle,” these platforms typically include:

                      +
                        +
                      • Labeling and data querying services
                      • +
                      • Model training, especially though job scaling and scheduling
                      • +
                      • Experiment tracking and model versioning
                      • +
                      • Development environments, typically through notebook-style interfaces
                      • +
                      • Model deployment (e.g., via REST APIs) and monitoring
                      • +
                      +

                      One of the earliest examples of such a system is Facebook’s FBLearner (2016), which encompassed data and feature storage, training, inference, and continuous learning based on user interactions with the model’s outputs. You can imagine how powerful having one hub for all this activity can be for ML application and development speed. As a result, cloud vendors (Google, AWS, Azure) have developed similar all-in-one platforms, like Google Cloud AI Platform and AWS SageMaker. Startups like Paperspace Gradient, Neptune, and FloydHub also offer all-in-one platforms focused on deep learning. Determined AI, which focuses exclusively on the model development and training part of the lifecycle, is the rare open-source platform in this space. Domino Data Lab is a traditional ML-focused startup with an extensive feature set worth looking at. It’s natural to expect more MLOps (as this kind of tooling and infra is referred to) companies and vendors to build out their feature set and become platform-oriented; Weights and Biases is a good example of this.

                      +

                      In conclusion, take a look at the below table to compare a select number of MLOps platform vendors. Pricing is quite variable.

                      +

                      +

                      Staying up to date across all the tooling can be a real challenge, but check out FSDL’s Tooling Tuesdays on Twitter as a starting point!

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-7-notes-media/image1.png b/spring2021/lecture-7-notes-media/image1.png new file mode 100644 index 00000000..dbfcfc2e Binary files /dev/null and b/spring2021/lecture-7-notes-media/image1.png differ diff --git a/spring2021/lecture-7-notes-media/image10.png b/spring2021/lecture-7-notes-media/image10.png new file mode 100644 index 00000000..63d0eafd Binary files /dev/null and b/spring2021/lecture-7-notes-media/image10.png differ diff --git a/spring2021/lecture-7-notes-media/image11.png b/spring2021/lecture-7-notes-media/image11.png new file mode 100644 index 00000000..1a093da7 Binary files /dev/null and b/spring2021/lecture-7-notes-media/image11.png differ diff --git a/spring2021/lecture-7-notes-media/image12.png b/spring2021/lecture-7-notes-media/image12.png new file mode 100644 index 00000000..f71cf09d Binary files /dev/null and b/spring2021/lecture-7-notes-media/image12.png differ diff --git a/spring2021/lecture-7-notes-media/image13.png b/spring2021/lecture-7-notes-media/image13.png new file mode 100644 index 00000000..84982042 Binary files /dev/null and b/spring2021/lecture-7-notes-media/image13.png differ diff --git a/spring2021/lecture-7-notes-media/image14.png b/spring2021/lecture-7-notes-media/image14.png new file mode 100644 index 00000000..beb9053b Binary files /dev/null and b/spring2021/lecture-7-notes-media/image14.png differ diff --git a/spring2021/lecture-7-notes-media/image15.png b/spring2021/lecture-7-notes-media/image15.png new file mode 100644 index 00000000..f73a3c8a Binary files /dev/null and b/spring2021/lecture-7-notes-media/image15.png differ diff --git a/spring2021/lecture-7-notes-media/image2.png b/spring2021/lecture-7-notes-media/image2.png new file mode 100644 index 00000000..cb0028e9 Binary files /dev/null and b/spring2021/lecture-7-notes-media/image2.png differ diff --git a/spring2021/lecture-7-notes-media/image3.png b/spring2021/lecture-7-notes-media/image3.png new file mode 100644 index 00000000..f610b36b Binary files /dev/null and b/spring2021/lecture-7-notes-media/image3.png differ diff --git a/spring2021/lecture-7-notes-media/image4.png b/spring2021/lecture-7-notes-media/image4.png new file mode 100644 index 00000000..1c38deb2 Binary files /dev/null and b/spring2021/lecture-7-notes-media/image4.png differ diff --git a/spring2021/lecture-7-notes-media/image5.png b/spring2021/lecture-7-notes-media/image5.png new file mode 100644 index 00000000..4b830f47 Binary files /dev/null and b/spring2021/lecture-7-notes-media/image5.png differ diff --git a/spring2021/lecture-7-notes-media/image6.png b/spring2021/lecture-7-notes-media/image6.png new file mode 100644 index 00000000..80cd67dd Binary files /dev/null and b/spring2021/lecture-7-notes-media/image6.png differ diff --git a/spring2021/lecture-7-notes-media/image7.png b/spring2021/lecture-7-notes-media/image7.png new file mode 100644 index 00000000..f9420eb8 Binary files /dev/null and b/spring2021/lecture-7-notes-media/image7.png differ diff --git a/spring2021/lecture-7-notes-media/image8.png b/spring2021/lecture-7-notes-media/image8.png new file mode 100644 index 00000000..c2c3e2b5 Binary files /dev/null and b/spring2021/lecture-7-notes-media/image8.png differ diff --git a/spring2021/lecture-7-notes-media/image9.png b/spring2021/lecture-7-notes-media/image9.png new file mode 100644 index 00000000..e1a61ab6 Binary files /dev/null and b/spring2021/lecture-7-notes-media/image9.png differ diff --git a/spring2021/lecture-7/index.html b/spring2021/lecture-7/index.html new file mode 100644 index 00000000..9d83b797 --- /dev/null +++ b/spring2021/lecture-7/index.html @@ -0,0 +1,3422 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 7: Troubleshooting Deep Neural Networks - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      + + + + + + + + +

                      Lecture 7: Troubleshooting Deep Neural Networks

                      +

                      Video

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Lecture by Josh Tobin. +Notes transcribed by James Le and Vishnu Rachakonda.

                      +

                      In traditional software engineering, a bug usually leads to the program +crashing. While this is annoying for the user, it is critical for the +developer to inspect the errors to understand why. With deep learning, +we sometimes encounter errors, but all too often, the program crashes +without a clear reason why. While these issues can be debugged manually, +deep learning models most often fail because of poor output predictions. +What’s worse is that when the model performance is low, there is usually +no signal about why or when the models failed.

                      +

                      A common sentiment among practitioners is that they spend 80–90% of +time debugging and tuning the models and only 10–20% of time deriving +math equations and implementing things. This is confirmed by Andrej +Kaparthy, as seen in this +tweet.

                      +

                      1 - Why Is Deep Learning Troubleshooting Hard?

                      +

                      Suppose you are trying to reproduce a research paper result for your +work, but your results are worse. You might wonder why your model’s +performance is significantly worse than the paper that you’re trying to +reproduce?

                      +

                      +

                      Many different things can cause this:

                      +
                        +
                      • +

                        It can be implementation bugs. Most bugs in deep learning are + actually invisible.

                        +
                      • +
                      • +

                        Hyper-parameter choices can also cause your performance to + degrade. Deep learning models are very sensitive to + hyper-parameters. Even very subtle choices of learning rate and + weight initialization can make a big difference.

                        +
                      • +
                      • +

                        Performance can also be worse just because of data/model fit. + For example, you pre-train your model on ImageNet data and fit it + on self-driving car images, which are harder to learn.

                        +
                      • +
                      • +

                        Finally, poor model performance could be caused not by your model + but your dataset construction. Typical issues here include not + having enough examples, dealing with noisy labels and imbalanced + classes, splitting train and test set with different + distributions.

                        +
                      • +
                      +

                      2 - Strategy to Debug Neural Networks

                      +

                      The key idea of deep learning troubleshooting is: Since it is hard to +disambiguate errors, it’s best to start simple and gradually ramp up +complexity.

                      +

                      This lecture provides a decision tree for debugging deep learning +models and improving performance. This guide assumes that you already +have an initial test dataset, a single metric to improve, and target +performance based on human-level performance, published results, +previous baselines, etc.

                      +

                      +

                      3 - Start Simple

                      +

                      The first step is the troubleshooting workflow is starting simple.

                      +

                      Choose A Simple Architecture

                      +

                      There are a few things to consider when you want to start simple. The +first is how to choose a simple architecture. These are +architectures that are easy to implement and are likely to get you part +of the way towards solving your problem without introducing as many +bugs.

                      +

                      Architecture selection is one of the many intimidating parts of getting +into deep learning because there are tons of papers coming out +all-the-time and claiming to be state-of-the-art on some problems. They +get very complicated fast. In the limit, if you’re trying to get to +maximal performance, then architecture selection is challenging. But +when starting on a new problem, you can just solve a simple set of rules +that will allow you to pick an architecture that enables you to do a +decent job on the problem you’re working on.

                      +
                        +
                      • +

                        If your data looks like images, start with a LeNet-like + architecture and consider using something like ResNet as your + codebase gets more mature.

                        +
                      • +
                      • +

                        If your data looks like sequences, start with an LSTM with one + hidden layer and/or temporal/classical convolutions. Then, when + your problem gets more mature, you can move to an Attention-based + model or a WaveNet-like model.

                        +
                      • +
                      • +

                        For all other tasks, start with a fully-connected neural network + with one hidden layer and use more advanced networks later, + depending on the problem.

                        +
                      • +
                      +

                      +

                      In reality, many times, the input data contains multiple of those things +above. So how to deal with multiple input modalities into a neural +network? Here is the 3-step strategy that we recommend:

                      +
                        +
                      • +

                        First, map each of these modalities into a lower-dimensional feature + space. In the example above, the images are passed through a + ConvNet, and the words are passed through an LSTM.

                        +
                      • +
                      • +

                        Then we flatten the outputs of those networks to get a single vector + for each of the inputs that will go into the model. Then we + concatenate those inputs.

                        +
                      • +
                      • +

                        Finally, we pass them through some fully-connected layers to an + output.

                        +
                      • +
                      +

                      Use Sensible Defaults

                      +

                      After choosing a simple architecture, the next thing to do is to +select sensible hyper-parameter defaults to start with. Here are the +defaults that we recommend:

                      + +

                      Normalize Inputs

                      +

                      The next step is to normalize the input data, subtracting the mean +and dividing by the variance. Note that for images, it’s fine to scale +values to [0, 1] or [-0.5, 0.5] (for example, by dividing by 255).

                      +

                      Simplify The Problem

                      +

                      The final thing you should do is consider simplifying the problem +itself. If you have a complicated problem with massive data and tons of +classes to deal with, then you should consider:

                      +
                        +
                      • +

                        Working with a small training set around 10,000 examples.

                        +
                      • +
                      • +

                        Using a fixed number of objects, classes, input size, etc.

                        +
                      • +
                      • +

                        Creating a simpler synthetic training set like in research labs.

                        +
                      • +
                      +

                      This is important because (1) you will have reasonable confidence that +your model should be able to solve, and (2) your iteration speed will +increase.

                      +

                      The diagram below neatly summarizes how to start simple:

                      +

                      +

                      4 - Implement and Debug

                      +

                      To give you a preview, below are the five most common bugs in deep +learning models that we recognize:

                      +
                        +
                      • +

                        Incorrect shapes for the network tensors: This bug is a common + one and can fail silently. This happens many times because the + automatic differentiation systems in the deep learning framework + do silent broadcasting. Tensors become different shapes in the + network and can cause a lot of problems.

                        +
                      • +
                      • +

                        Pre-processing inputs incorrectly: For example, you forget to + normalize your inputs or apply too much input pre-processing + (over-normalization and excessive data augmentation).

                        +
                      • +
                      • +

                        Incorrect input to the model’s loss function: For example, you + use softmax outputs to a loss that expects logits.

                        +
                      • +
                      • +

                        Forgot to set up train mode for the network correctly: For + example, toggling train/evaluation mode or controlling batch norm + dependencies.

                        +
                      • +
                      • +

                        Numerical instability: For example, you get `inf` or `NaN` + as outputs. This bug often stems from using an exponent, a log, or + a division operation somewhere in the code.

                        +
                      • +
                      +

                      Here are three pieces of general advice for implementing your model:

                      +
                        +
                      • +

                        Start with a lightweight implementation. You want minimum + possible new lines of code for the 1st version of your model. The + rule of thumb is less than 200 lines. This doesn’t count tested + infrastructure components or TensorFlow/PyTorch code.

                        +
                      • +
                      • +

                        Use off-the-shelf components such as Keras if possible, since + most of the stuff in Keras works well out-of-the-box. If you have + to use TensorFlow, use the built-in functions, don’t do the math + yourself. This would help you avoid a lot of numerical instability + issues.

                        +
                      • +
                      • +

                        Build complicated data pipelines later. These are important for + large-scale ML systems, but you should not start with them because + data pipelines themselves can be a big source of bugs. Just start + with a dataset that you can load into memory.

                        +
                      • +
                      +

                      +

                      Get Your Model To Run

                      +

                      The first step of implementing bug-free deep learning models is +getting your model to run at all. There are a few things that can +prevent this from happening:

                      +
                        +
                      • +

                        Shape mismatch/casting issue: To address this type of problem, + you should step through your model creation and inference + step-by-step in a debugger, checking for correct shapes and data + types of your tensors.

                        +
                      • +
                      • +

                        Out-of-memory issues: This can be very difficult to debug. You + can scale back your memory-intensive operations one-by-one. For + example, if you create large matrices anywhere in your code, you + can reduce the size of their dimensions or cut your batch size in + half.

                        +
                      • +
                      • +

                        Other issues: You can simply Google it. Stack Overflow would be + great most of the time.

                        +
                      • +
                      +

                      Let’s zoom in on the process of stepping through model creation in a +debugger and talk about debuggers for deep learning code:

                      +
                        +
                      • +

                        In PyTorch, you can use + ipdb — which exports + functions to access the interactive + IPython debugger.

                        +
                      • +
                      • +

                        In TensorFlow, it’s trickier. TensorFlow separates the process of + creating the graph and executing operations in the graph. There + are three options you can try: (1) step through the graph creation + itself and inspect each tensor layer, (2) step into the training + loop and evaluate the tensor layers, or (3) use TensorFlow + Debugger + (tfdb), which does option 1 and 2 automatically.

                        +
                      • +
                      +

                      +

                      Overfit A Single Batch

                      +

                      After getting your model to run, the next thing you need to do is to +overfit a single batch of data. This is a heuristic that can catch +an absurd number of bugs. This really means that you want to drive your +training error arbitrarily close to 0.

                      +

                      There are a few things that can happen when you try to overfit a single +batch and it fails:

                      +
                        +
                      • +

                        Error goes up: Commonly, this is due to a flip sign somewhere in + the loss function/gradient.

                        +
                      • +
                      • +

                        Error explodes: This is usually a numerical issue but can also + be caused by a high learning rate.

                        +
                      • +
                      • +

                        Error oscillates: You can lower the learning rate and inspect + the data for shuffled labels or incorrect data augmentation.

                        +
                      • +
                      • +

                        Error plateaus: You can increase the learning rate and get rid + of regulation. Then you can inspect the loss function and the data + pipeline for correctness.

                        +
                      • +
                      +

                      +

                      Compare To A Known Result

                      +

                      Once your model overfits in a single batch, there can still be some +other issues that cause bugs. The last step here is to compare your +results to a known result. So what sort of known results are useful?

                      +
                        +
                      • +

                        The most useful results come from an official model implementation + evaluated on a similar dataset to yours. You can step through + the code in both models line-by-line and ensure your model has the + same output. You want to ensure that your model performance is up + to par with expectations.

                        +
                      • +
                      • +

                        If you can’t find an official implementation on a similar dataset, + you can compare your approach to results from an official model + implementation evaluated on a benchmark dataset. You most + definitely want to walk through the code line-by-line and ensure + you have the same output.

                        +
                      • +
                      • +

                        If there is no official implementation of your approach, you can + compare it to results from an unofficial model implementation. + You can review the code the same as before but with lower + confidence (because almost all the unofficial implementations on + GitHub have bugs).

                        +
                      • +
                      • +

                        Then, you can compare to results from a paper with no code (to + ensure that your performance is up to par with expectations), + results from your model on a benchmark dataset (to make sure + your model performs well in a simpler setting), and results from + a similar model on a similar dataset (to help you get a + general sense of what kind of performance can be expected).

                        +
                      • +
                      • +

                        An under-rated source of results comes from simple baselines + (for example, the average of outputs or linear regression), which + can help make sure that your model is learning anything at all.

                        +
                      • +
                      +

                      The diagram below neatly summarizes how to implement and debug deep +neural networks:

                      +

                      +

                      5 - Evaluate

                      +

                      Bias-Variance Decomposition

                      +

                      To evaluate models and prioritize the next steps in model development, +we will apply the bias-variance decomposition. The bias-variance +decomposition +is the fundamental model fitting tradeoff. In our application, let’s +talk more specifically about the formula for bias-variance tradeoff with +respect to the test error; this will help us apply the concept more +directly to our model’s performance. There are four terms in the formula +for test error:

                      +

                      Test error = irreducible error + bias + variance + validation +overfitting

                      +
                        +
                      1. +

                        Irreducible error is the baseline error you don’t expect your + model to do better. It can be estimated through strong baselines, + like human performance.

                        +
                      2. +
                      3. +

                        Avoidable bias, a measure of underfitting, is the difference + between our train error and irreducible error.

                        +
                      4. +
                      5. +

                        Variance, a measure of overfitting, is the difference between + validation error and training error.

                        +
                      6. +
                      7. +

                        Validation set overfitting is the difference between test error + and validation error.

                        +
                      8. +
                      +

                      Consider the chart of learning curves and errors below. Using the test +error formula for bias and variance, we can calculate each component of +test error and make decisions based on the value. For example, our +avoidable bias is rather low (only 2 points), while the variance is much +higher (5 points). With this knowledge, we should prioritize methods of +preventing overfitting, like regularization.

                      +

                      +

                      Distribution Shift

                      +

                      Clearly, the application of the bias-variance decomposition to the test +error has already helped prioritize our next steps for model +development. However, until now, we’ve assumed that the samples +(training, validation, testing) all come from the same distribution. +What if this isn’t the case? In practical ML situations, this +distribution shift often cars. In building self-driving cars, a +frequent occurrence might be training with samples from one distribution +(e.g., daytime driving video) but testing or inferring on samples from a +totally different distribution (e.g., night time driving).

                      +

                      A simple way of handling this wrinkle in our assumption is to create two +validation sets: one from the training distribution and one from the +test distribution. This can be helpful even with a very small testing +set. If we apply this, we can actually estimate our distribution shift, +which is the difference between testing validation error and testing +error. This is really useful for practical applications of ML! With this +new term, let’s update our test error formula of bias and variance:

                      +

                      Test error = irreducible error + bias + variance + distribution shift + +validation overfitting

                      +

                      6 - Improve Model and Data

                      +

                      Using the updated formula from the last section, we’ll be able to decide +on and prioritize the right next steps for each iteration of a model. In +particular, we’ll follow a specific process (shown below).

                      +

                      +

                      Step 1: Address Underfitting

                      +

                      We’ll start by addressing underfitting (i.e., reducing bias). The first +thing to try in this case is to make your model bigger (e.g., add +layers, more units per layer). Next, consider regularization, which can +prevent a tight fit to your data. Other options are error analysis, +choosing a different model architecture (e.g., something more state of +the art), tuning hyperparameters, or adding features. Some notes:

                      +
                        +
                      • +

                        Choosing different architectures, especially a SOTA one, can be very + helpful but is also risky. Bugs are easily introduced in the + implementation process.

                        +
                      • +
                      • +

                        Adding features is uncommon in the deep learning paradigm (vs. + traditional machine learning). We usually want the network to + learn features of its own accord. If all else fails, it can be + beneficial in a practical setting.

                        +
                      • +
                      +

                      +

                      Step 2: Address Overfitting

                      +

                      After addressing underfitting, move on to solving overfitting. +Similarly, there’s a recommended series of methods to try in order. +Starting with collecting training data (if possible) is the soundest way +to address overfitting, though it can be challenging in certain +applications. Next, tactical improvements like normalization, data +augmentation, and regularization can help. Following these steps, +traditional defaults like tuning hyperparameters, choosing a different +architecture, or error analysis are useful. Finally, if overfitting is +rather intractable, there’s a series of less recommended steps, such as +early stopping, removing features, and reducing model size. Early +stopping is a personal choice; the fast.ai community is a strong +proponent.

                      +

                      +

                      Step 3: Address Distribution Shift

                      +

                      After addressing underfitting and overfitting, If there’s a difference +between the error on our training validation set vs. our test validation +set, we need to address the error caused by the distribution shift. This +is a harder problem to solve, so there’s less in our toolkit to apply.

                      +

                      Start by looking manually at the errors in the test-validation set. +Compare the potential logic behind these errors to the performance in +the train-validation set, and use the errors to guide further data +collection. Essentially, reason about why your model may be suffering +from distribution shift error. This is the most principled way to deal +with distribution shift, though it’s the most challenging way +practically. If collecting more data to address these errors isn’t +possible, try synthesizing data. Additionally, you can try domain +adaptation.

                      +

                      +
                      Error Analysis
                      +

                      Manually evaluating errors to understand model performance is generally +a high-yield way of figuring out how to improve the model. +Systematically performing this error analysis process and +decomposing the error from different error types can help prioritize +model improvements. For example, in a self-driving car use case with +error types like hard-to-see pedestrians, reflections, and nighttime +scenes, decomposing the error contribution of each and where it occurs +(train-val vs. test-val) can give rise to a clear set of prioritized +action items. See the table for an example of how this error analysis +can be effectively structured.

                      +

                      +
                      Domain Adaptation
                      +

                      Domain adaptation is a class of techniques that train on a “source” +distribution and generalize to another “target” using only unlabeled +data or limited labeled data. You should use domain adaptation when +access to labeled data from the test distribution is limited, but access +to relatively similar data is plentiful.

                      +

                      There are a few different types of domain adaptation:

                      +
                        +
                      1. +

                        Supervised domain adaptation: In this case, we have limited data + from the target domain to adapt to. Some example applications of + the concept include fine-tuning a pre-trained model or adding + target data to a training set.

                        +
                      2. +
                      3. +

                        Unsupervised domain adaptation: In this case, we have lots of + unlabeled data from the target domain. Some techniques you might + see are CORAL, domain confusion, and CycleGAN.

                        +
                      4. +
                      +

                      Practically speaking, supervised domain adaptation can work really well! +Unsupervised domain adaptation has a little bit further to go.

                      +

                      Step 4: Rebalance datasets

                      +

                      If the test-validation set performance starts to look considerably +better than the test performance, you may have overfit the validation +set. This commonly occurs with small validation sets or lots of +hyperparameter training. If this occurs, resample the validation set +from the test distribution and get a fresh estimate of the performance.

                      +

                      7 - Tune Hyperparameters

                      +

                      One of the core challenges in hyperparameter optimization is very basic: +which hyperparameters should you tune? As we consider this +fundamental question, let’s keep the following in mind:

                      +
                        +
                      • +

                        Models are more sensitive to some hyperparameters than others. This + means we should focus our efforts on the more impactful + hyperparameters.

                        +
                      • +
                      • +

                        However, which hyperparameters are most important depends heavily on + our choice of model.

                        +
                      • +
                      • +

                        Certain rules of thumbs can help guide our initial thinking.

                        +
                      • +
                      • +

                        Sensitivity is always relative to default values; if you use good + defaults, you might start in a good place!

                        +
                      • +
                      +

                      See the following table for a ranked list of hyperparameters and their +impact on the model:

                      +

                      +

                      Techniques for Tuning Hyperparameter Optimization

                      +

                      Now that we know which hyperparameters make the most sense to tune +(using rules of thumb), let’s consider the various methods of actually +tuning them:

                      +
                        +
                      1. +

                        Manual Hyperparameter Optimization. Colloquially referred to as + Graduate Student Descent, this method works by taking a manual, + detailed look at your algorithm, building intuition, and + considering which hyperparameters would make the most difference. + After figuring out these parameters, you train, evaluate, and + guess a better hyperparameter value using your intuition for the + algorithm and intelligence. While it may seem archaic, this method + combines well with other methods (e.g., setting a range of values + for hyperparameters) and has the main benefit of reducing + computation time and cost if used skillfully. It can be + time-consuming and challenging, but it can be a good starting + point.

                        +
                      2. +
                      3. +

                        Grid Search. Imagine each of your parameters plotted against + each other on a grid, from which you uniformly sample values to + test. For each point, you run a training run and evaluate + performance. The advantages are that it’s very simple and can + often produce good results. However, it’s quite inefficient, as + you must run every combination of hyperparameters. It also often + requires prior knowledge about the hyperparameters since we must + manually set the range of values.

                        +
                      4. +
                      5. +

                        Random Search: This method is recommended over grid search. + Rather than sampling from the grid of values for the + hyperparameter evenly, we’ll choose n points sampled randomly + across the grid. Empirically, this method produces better results + than grid search. However, the results can be somewhat + uninterpretable, with unexpected values in certain hyperparameters + returned.

                        +
                      6. +
                      7. +

                        Coarse-to-fine Search: Rather than running entirely random runs, + we can gradually narrow in on the best hyperparameters through + this method. Initially, start by defining a very large range to + run a randomized search on. Within the pool of results, you can + find N best results and hone in on the hyperparameter values used + to generate those samples. As you iteratively perform this method, + you can get excellent performance. This doesn’t remove the manual + component, as you have to select which range to continuously + narrow your search to, but it’s perhaps the most popular method + available.

                        +
                      8. +
                      9. +

                        Bayesian Hyperparameter Optimization: This is a reasonably + sophisticated method, which you can read more about + here + and + here. + At a high level, start with a prior estimate of parameter + distributions. Subsequently, maintain a probabilistic model of the + relationship between hyperparameter values and model performance. + As you maintain this model, you toggle between training with + hyperparameter values that maximize the expected improvement (per + the model) and use training results to update the initial + probabilistic model and its expectations. This is a great, + hands-off, efficient method to choose hyperparameters. However, + these techniques can be quite challenging to implement from + scratch. As libraries and infrastructure mature, the integration + of these methods into training will become easier.

                        +
                      10. +
                      +

                      In summary, you should probably start with coarse-to-fine random +searches and move to Bayesian methods as your codebase matures and +you’re more certain of your model.

                      +

                      8 - Conclusion

                      +

                      To wrap up this lecture, deep learning troubleshooting and debugging is +really hard. It’s difficult to tell if you have a bug because there are +many possible sources for the same degradation in performance. +Furthermore, the results can be sensitive to small changes in +hyper-parameters and dataset makeup.

                      +

                      To train bug-free deep learning models, we need to treat building them +as an iterative process. If you skipped to the end, the following steps +can make this process easier and catch errors as early as possible:

                      +
                        +
                      • +

                        Start Simple: Choose the simplest model and data possible.

                        +
                      • +
                      • +

                        Implement and Debug: Once the model runs, overfit a single batch + and reproduce a known result.

                        +
                      • +
                      • +

                        Evaluate: Apply the bias-variance decomposition to decide what + to do next.

                        +
                      • +
                      • +

                        Tune Hyper-parameters: Use coarse-to-fine random searches to + tune the model’s hyper-parameters.

                        +
                      • +
                      • +

                        Improve Model and Data: Make your model bigger if your model + under-fits and add more data and/or regularization if your model + over-fits.

                        +
                      • +
                      +

                      Here are additional resources that you can go to learn more:

                      + + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-8-notes-media/image1.jpg b/spring2021/lecture-8-notes-media/image1.jpg new file mode 100644 index 00000000..607ab36c Binary files /dev/null and b/spring2021/lecture-8-notes-media/image1.jpg differ diff --git a/spring2021/lecture-8-notes-media/image1.png b/spring2021/lecture-8-notes-media/image1.png new file mode 100644 index 00000000..91c84a9b Binary files /dev/null and b/spring2021/lecture-8-notes-media/image1.png differ diff --git a/spring2021/lecture-8-notes-media/image10.jpg b/spring2021/lecture-8-notes-media/image10.jpg new file mode 100644 index 00000000..d04414ab Binary files /dev/null and b/spring2021/lecture-8-notes-media/image10.jpg differ diff --git a/spring2021/lecture-8-notes-media/image10.png b/spring2021/lecture-8-notes-media/image10.png new file mode 100644 index 00000000..63d76c45 Binary files /dev/null and b/spring2021/lecture-8-notes-media/image10.png differ diff --git a/spring2021/lecture-8-notes-media/image11.jpg b/spring2021/lecture-8-notes-media/image11.jpg new file mode 100644 index 00000000..5db5d7eb Binary files /dev/null and b/spring2021/lecture-8-notes-media/image11.jpg differ diff --git a/spring2021/lecture-8-notes-media/image11.png b/spring2021/lecture-8-notes-media/image11.png new file mode 100644 index 00000000..1a093da7 Binary files /dev/null and b/spring2021/lecture-8-notes-media/image11.png differ diff --git a/spring2021/lecture-8-notes-media/image12.png b/spring2021/lecture-8-notes-media/image12.png new file mode 100644 index 00000000..d239a55d Binary files /dev/null and b/spring2021/lecture-8-notes-media/image12.png differ diff --git a/spring2021/lecture-8-notes-media/image13.png b/spring2021/lecture-8-notes-media/image13.png new file mode 100644 index 00000000..b325f696 Binary files /dev/null and b/spring2021/lecture-8-notes-media/image13.png differ diff --git a/spring2021/lecture-8-notes-media/image14.png b/spring2021/lecture-8-notes-media/image14.png new file mode 100644 index 00000000..aa2b7cf9 Binary files /dev/null and b/spring2021/lecture-8-notes-media/image14.png differ diff --git a/spring2021/lecture-8-notes-media/image15.png b/spring2021/lecture-8-notes-media/image15.png new file mode 100644 index 00000000..b2c766d7 Binary files /dev/null and b/spring2021/lecture-8-notes-media/image15.png differ diff --git a/spring2021/lecture-8-notes-media/image2.png b/spring2021/lecture-8-notes-media/image2.png new file mode 100644 index 00000000..e0e603ef Binary files /dev/null and b/spring2021/lecture-8-notes-media/image2.png differ diff --git a/spring2021/lecture-8-notes-media/image3.png b/spring2021/lecture-8-notes-media/image3.png new file mode 100644 index 00000000..cd4e5103 Binary files /dev/null and b/spring2021/lecture-8-notes-media/image3.png differ diff --git a/spring2021/lecture-8-notes-media/image4.jpg b/spring2021/lecture-8-notes-media/image4.jpg new file mode 100644 index 00000000..fe670b39 Binary files /dev/null and b/spring2021/lecture-8-notes-media/image4.jpg differ diff --git a/spring2021/lecture-8-notes-media/image4.png b/spring2021/lecture-8-notes-media/image4.png new file mode 100644 index 00000000..1c38deb2 Binary files /dev/null and b/spring2021/lecture-8-notes-media/image4.png differ diff --git a/spring2021/lecture-8-notes-media/image5.png b/spring2021/lecture-8-notes-media/image5.png new file mode 100644 index 00000000..beca79bb Binary files /dev/null and b/spring2021/lecture-8-notes-media/image5.png differ diff --git a/spring2021/lecture-8-notes-media/image6.jpg b/spring2021/lecture-8-notes-media/image6.jpg new file mode 100644 index 00000000..083f9abd Binary files /dev/null and b/spring2021/lecture-8-notes-media/image6.jpg differ diff --git a/spring2021/lecture-8-notes-media/image6.png b/spring2021/lecture-8-notes-media/image6.png new file mode 100644 index 00000000..69c6cac9 Binary files /dev/null and b/spring2021/lecture-8-notes-media/image6.png differ diff --git a/spring2021/lecture-8-notes-media/image7.png b/spring2021/lecture-8-notes-media/image7.png new file mode 100644 index 00000000..c8d252b6 Binary files /dev/null and b/spring2021/lecture-8-notes-media/image7.png differ diff --git a/spring2021/lecture-8-notes-media/image8.png b/spring2021/lecture-8-notes-media/image8.png new file mode 100644 index 00000000..a1ae1b69 Binary files /dev/null and b/spring2021/lecture-8-notes-media/image8.png differ diff --git a/spring2021/lecture-8-notes-media/image9.png b/spring2021/lecture-8-notes-media/image9.png new file mode 100644 index 00000000..6a36d9d5 Binary files /dev/null and b/spring2021/lecture-8-notes-media/image9.png differ diff --git a/spring2021/lecture-8/index.html b/spring2021/lecture-8/index.html new file mode 100644 index 00000000..3a67ccbc --- /dev/null +++ b/spring2021/lecture-8/index.html @@ -0,0 +1,3548 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 8: Data Management - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      + + + + + + + + +

                      Lecture 8: Data Management

                      +

                      Video

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Lecture by Sergey Karayev. +Notes transcribed by James Le +and Vishnu Rachakonda.

                      +

                      One of the best data science articles written in 2019 is “Data +science is different +now” +by Vicki Boykis. Part of the +article is a collection of tweets from other data science and machine +learning practitioners.

                      +

                      +

                      1 - Data Management Overview

                      +

                      When we think about what data management for deep learning entails, +there might be many different data sources: images on S3, text files on +a file system, logs spread across different machines, and maybe even +records in a database. At some point, you need to get all of that data +over to a local filesystem next to GPUs. The way you will get data over +to that trainable format is different for every project and every +company. For instance:

                      +
                        +
                      • +

                        Maybe you train your images on ImageNet, and all the images are just +S3 URLs. Then, all you have to do is download them over to the +local filesystem.

                        +
                      • +
                      • +

                        Maybe you have a bunch of text files that you crawled yourself +somewhere. You want to use Spark to process them on a cluster and +Pandas data frame to analyze/select subsets that will be used in +the local filesystem.

                        +
                      • +
                      • +

                        Maybe you collect logs and records from your database into a data +lake/warehouse (like Snowflake). Then, you process that output and +convert them into a trainable format.

                        +
                      • +
                      +

                      There are countless possibilities that we are not going to cover +completely in this lecture, but here are the key points to remember:

                      +
                        +
                      • +

                        Let the data flow through you: You should spend 10x as much time +as you want to on exploring the dataset.

                        +
                      • +
                      • +

                        Data is the best way to improve your overall ML project +performance: Instead of trying new architectures or kicking off +the hyper-parameter search, adding more data and augmenting the +existing dataset will often be the best bang to your buck.

                        +
                      • +
                      • +

                        Keep It Simple Stupid: We will discuss complex pipelines and new +terms, but it’s important to not over-complicate things and make +data management a rocket science.

                        +
                      • +
                      +

                      2 - Data Sources

                      +

                      So, where do the training data come from? Most deep learning +applications require lots of labeled data (with exceptions in +applications of reinforcement learning, GANs, and GPT-3). There are +publicly available datasets that can serve as a starting point, but +there is no competitive advantage of using them. In fact, most companies +usually spend a lot of money and time labeling their own data.

                      +

                      Data Flywheel

                      +

                      +

                      Data flywheel is an exciting concept: if you can get your models in +front of the users, you can build your products in a mechanism that your +users contribute good data back to you and improve the model +predictions. This can enable rapid improvement after you get that v1 +model out into the real world.

                      +

                      Semi-Supervised Learning

                      +

                      Semi-supervised learning is a relatively recent learning technique where +the training data is autonomously (or automatically) labeled. It is +still supervised learning, but the datasets do not need to be manually +labeled by a human; but they can be labeled by finding and exploiting +the relations (or correlations) between different input signals (that +is, input coming from different sensor modalities).

                      +

                      A natural advantage and consequence of semi-supervised learning are that +this technique can be performed in an online fashion (given that data +can be gathered and labeled without human intervention) more easily +(with respect to, e.g., supervised learning), where models can be +updated or trained entirely from scratch. Therefore, semi-supervised +learning should also be well suited for changing environments, changing +data, and, in general, changing requirements.

                      +

                      +

                      For a text example, you can predict the future words from the past +words, predict the beginning of a sentence from the end of a sentence, +or predict the middle word of a sentence from the words surrounding it. +You can even examine whether two sentences occur in the same paragraph +in the same corpus of your training data. These are different ways to +formulate the problem, where you don’t need to label anything and simply +use the data to supervise itself.

                      +

                      This technique also applies to vision. Facebook AI recently released a +model called +SEER +trained on 1 billion random images from the Internet. Yet, SEER achieved +state-of-the-art accuracy on the ImageNet top-1 prediction task.

                      +

                      If you’re interested in learning more about semi-supervised learning, +check out:

                      + +

                      Data Augmentation

                      +

                      Recent advances in deep learning models have been largely attributed to +the quantity and diversity of data gathered in recent years. Data +augmentation is a strategy that enables practitioners to significantly +increase the diversity of data available for training models without +actually collecting new data. Data augmentation techniques such as +cropping, padding, and horizontal flipping are commonly used to train +large neural networks. In fact, they are mostly required for training +computer vision models. Both +Keras +and +fast.ai +provide functions that do this.

                      +

                      +

                      Data augmentation also applies to other types of data.

                      +
                        +
                      • +

                        For tabular data, you can delete some cells to simulate missing +data.

                        +
                      • +
                      • +

                        For text, there are no well-established techniques, but you can +replace words with synonyms and change the order of things.

                        +
                      • +
                      • +

                        For speech and video, you can change speed, insert a pause, mix +different sequences, and more.

                        +
                      • +
                      +

                      If you’re interested in learning more about data augmentation, check +out:

                      + +

                      Synthetic Data

                      +

                      Related to the concept of data augmentation is synthetic data, an +underrated idea that is almost always worth starting with. Synthetic +data is data that’s generated programmatically. For example, +photorealistic images of objects in arbitrary scenes can be rendered +using video game engines or audio generated by a speech synthesis model +from the known text. It’s not unlike traditional data augmentation, +where crops, flips, rotations, and distortions are used to increase the +variety of data that models have to learn from. Synthetically generated +data takes those same concepts even further.

                      +

                      Most of today’s synthetic data is visual. Tools and techniques developed +to create photorealistic graphics in movies and computer games are +repurposed to create the training data needed for machine learning. Not +only can these rendering engines produce arbitrary numbers of images, +but they can also produce annotations too. Bounding boxes, segmentation +masks, depth maps, and any other metadata is output right alongside +pictures, making it simple to build pipelines that produce their own +data.

                      +

                      Because samples are generated programmatically along with annotations, +synthetic datasets are far cheaper to produce than traditional ones. +That means we can create more data and iterate more often to produce +better results. Need to add another class to your model? No problem. +Need to add another key point to the annotation? Done. This is +especially useful for applications in driving and robotics.

                      +

                      +

                      If you’re interested in learning more about synthetic data, check out:

                      + +

                      3 - Data Storage

                      +

                      Data storage requirements for AI vary widely according to the +application and the source material. Datasets in intelligence, defense, +medical, science, and geology frequently combine petabyte-scale storage +volumes with individual file sizes in the gigabyte range. By contrast, +data used in areas such as supply chain analytics and fraud detection +are much smaller.

                      +

                      There are four building blocks in a data storage system:

                      +
                        +
                      1. +

                        The filesystem

                        +
                      2. +
                      3. +

                        The object storage

                        +
                      4. +
                      5. +

                        The database

                        +
                      6. +
                      7. +

                        The data lake or data warehouse

                        +
                      8. +
                      +
                      Filesystem
                      +

                      The filesystem is the foundational layer of storage.

                      +
                        +
                      • +

                        Its fundamental unit is a “file” — which can be text or binary, is +not versioned, and is easily overwritten.

                        +
                      • +
                      • +

                        A file system can be as simple as a locally mounted disk containing +all the files you need.

                        +
                      • +
                      • +

                        More advanced options include networked filesystems +(NFS), +which are accessible over the network by multiple machines, and +distributed file systems +(HDFS) +which are stored and accessed over multiple machines.

                        +
                      • +
                      +

                      +

                      The plots above display hard-drive speeds for SATA hard drive, SATA +SSD, and NVMe SSD.

                      +
                        +
                      • +

                        The left plot shows the sustained throughput in MBps (how much +information to copy a file): The latest iteration of hard drive +technology (NVMe) is 6-10x more powerful than older iterations.

                        +
                      • +
                      • +

                        The right plot shows the seek time in milliseconds (how long it +takes to go to a file on disk): The NVMe is 25-30x faster than the +old-school ones.

                        +
                      • +
                      +

                      What format should we store data in?

                      +
                        +
                      • +

                        For binary data (images, audios, videos), just files are enough. In +Tensorflow, you have the TFRecord format to batch binary files, +which does not seem to be necessary with the NVMe hard drives.

                        +
                      • +
                      • +

                        For large tabular and text data, you have two choices:

                        +
                          +
                        • +

                          HDF5 is powerful but bloated and declining.

                          +
                        • +
                        • +

                          Parquet is widespread and recommended.

                          +
                        • +
                        • +

                          Feather is an up-and-coming open-source option powered by Apache +Arrow.

                          +
                        • +
                        +
                      • +
                      • +

                        Both Tensorflow and PyTorch provide their native dataset class +interfaces +(tf.data and +PyTorch +DataLoader).

                        +
                      • +
                      +

                      Object Storage

                      +

                      Object storage is an API over the filesystem that allows users to use a +command on files (GET, PUT, DELETE) to a service without worrying where +they are actually stored.

                      +
                        +
                      • +

                        Its fundamental unit is an “object,”   which is usually binary +(images, sound files…).

                        +
                      • +
                      • +

                        Object storage can be built with data versioning and data redundancy +into the API.

                        +
                      • +
                      • +

                        It is not as fast as local files but fast enough within the cloud.

                        +
                      • +
                      +

                      Database

                      +

                      A database is a persistent, fast, scalable storage and retrieval of +structured data.

                      +
                        +
                      • +

                        Its fundamental unit is a “row” (unique IDs, references to other +rows, values in columns).

                        +
                      • +
                      • +

                        Databases are also known for online transaction processing +(OLTP). The mental model here is that everything is actually in +memory, but the software ensures that everything is logged to disk +and never lost.

                        +
                      • +
                      • +

                        Databases are not built for binary data, so you must store the +references (i.e., S3 URLs) instead.

                        +
                      • +
                      +

                      Here are our recommendations:

                      +
                        +
                      • +

                        PostgreSQL is the right choice +most of the time, thanks to the support of unstructured JSON.

                        +
                      • +
                      • +

                        SQLite is perfectly good for small +projects.

                        +
                      • +
                      • +

                        “NoSQL” was a big craze in the 2010s (like +MongoDB). However, they are not +as fast as the relational database and also have consistency +issues frequently.

                        +
                      • +
                      • +

                        Redis is handy when you need a simple +key-value store.

                        +
                      • +
                      +

                      Data Warehouse

                      +

                      A data warehouse is a structured aggregation of data for analysis, known +as online analytical processing (OLAP).

                      +

                      +

                      Another acronym that you might have heard of is ETL (Extract, +Transform, Load). The idea here is to extract data from data sources, +transform the data into a common schema, and load the schema into the +data warehouse. You can load the subset from the warehouse that you need +and generate reports or run analytical queries. Well-known enterprise +options in the market are Google +BigQuery, Amazon +Redshift, and +Snowflake.

                      +

                      SQL and DataFrames

                      +

                      Most data solutions use SQL as the interface to the data, except for +some (like Databricks) that use DataFrames. SQL is the standard +interface for structured data. But in the Python ecosystem, Pandas is +the main DataFrame. Our advice is to become fluent in both.

                      +

                      Data Lake

                      +

                      A data lake is the unstructured aggregation of data from multiple +sources (databases, logs, expensive data transformations). It operates +under the concept of ELT (Extract, Load, Transform) by dumping +everything in the lake and transforming the data for specific needs +later.

                      +

                      +

                      Data “Lakehouse”

                      +

                      The current trend in the field is to combine data warehouses and data +lakes in the same suite. The Databricks Lakehouse +Platform is both a +warehouse and a lake, operated as an open-source project called +Delta Lake. You can store both structured +and unstructured data in the platform and use them for analytics +workloads and machine learning engines.

                      +

                      What Goes Where?

                      +
                        +
                      • +

                        Binary data (images, sound files, compressed texts) are stored +as objects.

                        +
                      • +
                      • +

                        Metadata (labels, user activity) is stored in a database.

                        +
                      • +
                      • +

                        If we need features that are not obtainable from the database +(logs), we would want to set up a data lake and a process +to aggregate the data required.

                        +
                      • +
                      • +

                        At training time, we need to copy the necessary data to the +filesystem on a fast drive.

                        +
                      • +
                      +

                      +

                      A lot is going on within the data management tooling and infrastructure. +We recommend looking at a16z’s “Emerging Architectures For Modern +Data +Infrastructure” +article to get a broad look into this ecosystem.

                      +

                      A highly recommended resource is Martin Kleppmann’s book “Designing +Data-Intensive Applications,”   which +provides excellent coverage of tools and approaches to build reliable, +scalable, and maintainable data storage systems.

                      +

                      4 - Data Processing

                      +

                      Data Dependencies

                      +

                      Let’s look at a motivational example of training a photo popularity +predictor every night. For each photo, the training data must include +these components:

                      +
                        +
                      • +

                        Metadata (such as posting time, title, location) that is in the +database.

                        +
                      • +
                      • +

                        Some features of the user (such as how many times they logged in +today) that need to be computed from logs.

                        +
                      • +
                      • +

                        Outputs of photo classifiers (such as content, style) that can +be obtained after running the classifiers.

                        +
                      • +
                      +

                      The idea is that we have different sources of data, and they have +different dependencies. The big hurdle here is that some tasks can’t be +started until other tasks are finished. Finishing a task should +“kick-off” its dependencies.

                      +

                      The simplest thing we can do is a “Makefile” to specify what action(s) +depend on. But here are some limitations to this approach:

                      +
                        +
                      • +

                        What if re-computation needs to depend on content, not on a date?

                        +
                      • +
                      • +

                        What if the dependencies are not files but disparate programs and +databases?

                        +
                      • +
                      • +

                        What if the work needs to be spread over multiple machines?

                        +
                      • +
                      • +

                        What if many dependency graphs are executing all at once, with +shared dependencies?

                        +
                      • +
                      +

                      MapReduce

                      +

                      +

                      The old-school big data solutions to this are +Hadoop and Apache +Spark. These are MapReduce +implementations, where you launch different tasks that each take a bit +of the data (Map) and reduce their outputs into a single output +(Reduce). Both Hadoop and Spark can run data processing operations and +simple ML models on commodity hardware, with tricks to speed things up.

                      +

                      In the modern environment, you can’t run an ML model (in PyTorch or +TensorFlow) as part of running a Spark job (unless that model itself is +programmed in Spark). That’s when you need a workflow management system +like Apache Airflow.

                      +

                      DAG

                      +

                      In Airflow, a workflow is defined as a collection of tasks with +directional dependencies, basically a directed acyclic graph (DAG). +Each node in the graph is a task, and the edges define dependencies +among the tasks. Tasks belong to two categories: (1) operators that +execute some operation and (2) sensors that check for the state of a +process or a data structure.

                      +

                      +

                      The main components of Airflow include: (1) a metadata database that +stores the state of tasks and workflows, (2) a scheduler that uses the +DAGs definitions together with the state of tasks in the metadata +database to decide what needs to be executed, and (3) an executor that +determines which worker will execute each task.

                      +

                      Besides Airflow, here are other notable solutions:

                      +
                        +
                      • +

                        Apache Beam: The TensorFlow team +uses Apache Beam to generate big +datasets +and run those processing steps on Google Cloud Dataflow (a cloud +orchestrator).

                        +
                      • +
                      • +

                        Prefect: A similar idea to +Airflow, Prefect is a Python framework that makes it easy to +combine tasks into workflows, then deploy, schedule, and monitor +their execution through the Prefect UI or API.

                        +
                      • +
                      • +

                        dbt: dbt provides this data +processing ability in SQL (called “analytics engineering.”)

                        +
                      • +
                      • +

                        Dagster: Dagster is another data +orchestrator for ML, analytics, and ETL. You can test locally and +run anywhere with a unified view of data pipelines and assets.

                        +
                      • +
                      +

                      5 - Feature Store

                      +

                      Feature stores were first popularized by the ML team at Uber as part of +their +Michelangelo +platform. Traditionally, ML systems are divided into two portions, +offline processing and online processing.

                      +
                        +
                      • +

                        For the initial work of modeling, data that is generally static, +perhaps stored in a data lake. Using some preprocessing methods +(usually in SQL or Spark), data, which could be logfiles, +requests, etc., are converted into features used to develop and +train the model. The end result of this process is a model trained +on a static sample of the data. This is an offline process.

                        +
                      • +
                      • +

                        In contrast, the process of performing inference (e.g., Uber’s need +to return ride prices in real-time) often works with real-time +data in an online process fashion. From a technology +standpoint, whereas the offline use case might involve a data lake +and Spark/SQL, the online processing use case involves +technologies like Kafka and Cassandra that support speedier +processing of creating or accessing the features required to +perform inference.

                        +
                      • +
                      +

                      +

                      This difference in how features need to be created and accessed is a +natural place for bugs to crop up. Harmonization of the online and +offline processes would reduce bugs, so the Uber team, amongst others, +introduced the concept of features stores to do just that. Members of +the Uber team developed Tecton, a feature +store company, which is one option to implement this system. An +open-source alternative is Feast. To +summarize, Tecton offers a handy definition of what a feature store is: +“an ML-specific data system that runs data pipelines that transform raw +data into feature values, stores and manages the feature data itself, +and serves feature data consistently for training and inference +purposes.”

                      +

                      A word of caution: don’t over-engineer your system according to what +others are doing. It’s easy to wrap yourself up in adopting many tools +and systems that aren’t as optimal as their publicity may make them +seem. Work with the tools you have first! For an interesting example of +this, look at how “command-line tools can be 235x faster than your +Hadoop +cluster”.

                      +

                      6 - Data Exploration

                      +

                      The objective of data exploration is to understand and visualize the +nature of the data you’re modeling.

                      +
                        +
                      • +

                        Pandas is the Python workhorse +of data visualization. It’s highly recommended to be familiar with +it.

                        +
                      • +
                      • +

                        Dask is an alternative that can speed up +data processing for large datasets that Pandas cannot handle +through parallelization.

                        +
                      • +
                      • +

                        Similarly, RAPIDS speeds up large +dataset processing, though it does through the use of GPUs.

                        +
                      • +
                      +

                      7 - Data Labeling

                      +

                      Effective data labeling is a core ingredient of production machine +learning systems. Most data labeling platforms have a standard set of +features: the ability to generate bounding boxes, segmentations, key +points, class assignments, etc. The crucial objective is agreeing on +what makes a good annotation and training annotators accordingly. To +avoid annotator error cropping up, write clear guidelines that clarify +rules for edge cases and high-quality annotations. One way to acquire +the material needed to write such a guide is to start by annotating +yourself. As you generate labels, ensure the quality of the annotations +holds up across the annotator base. Some participants will be more +reliable than others.

                      +

                      +

                      To develop an annotator base, there are a few options.

                      +

                      Sources of Labor

                      +
                        +
                      • +

                        One option is to hire your own annotators, which can help with +the speed and quality of annotations. This, however, can be +expensive and difficult to scale.

                        +
                      • +
                      • +

                        Another option is to crowdsource labels via a platform like +Amazon Mechanical Turk, which is fast and cheap to set up, but for +which the quality can be poorer.

                        +
                      • +
                      • +

                        …or full-service data labeling companies.

                        +
                      • +
                      +

                      Service Companies

                      +

                      There are entire service companies that focus on data labeling that you +can hire. Hiring such a company makes a great deal of sense, considering +the time, labor, and software investment needed to label well at scale. +To figure out the best data labeling company, start by annotating some +gold standard data yourself. Then, contact and evaluate several +companies on their value and a sample labeling task. Some companies in +this space are +FigureEight, +Scale.ai, +Labelbox, and +Supervisely.

                      +

                      Software

                      +

                      If the costs of a full-service data labeling company are prohibitive, +pure-play labeling software can be an option.

                      +
                        +
                      • +

                        Label Studio is a friendly +open-source platform for this. New concepts to make labeling more +strategic and efficient are coming to the fore.

                        +
                      • +
                      • +

                        Aquarium helps you +explore your data extensively and map the appropriate labeling +strategy for classes that may be less prevalent or performant.

                        +
                      • +
                      • +

                        Snorkel.ai offers a platform that +incorporates weak supervision, which automatically labels data +points based on heuristics and human feedback.

                        +
                      • +
                      +

                      In summary, if you can afford not to label, don’t; get a full-service +company to take care of it. Failing that, try to use existing software +and a part-time annotator base work (in lieu of a crowdsourced +workforce).

                      +

                      8 - Data Versioning

                      +

                      Data versioning is important because machine learning models are part +code and part data. If the data isn’t versioned, the system isn’t fully +versioned! There are four levels to data versioning, which is similar to +code versioning:

                      +

                      Level 0: No versioning.

                      +
                        +
                      • +

                        All data lives on a filesystem, in S3, and/or in a database.

                        +
                      • +
                      • +

                        The problem arises most acutely in this paradigm, as deployed ML +systems (whose code may be versioned) can quickly become divorced +from their corresponding data.

                        +
                      • +
                      • +

                        Furthermore, reverting to older versions will be challenging.

                        +
                      • +
                      +

                      Level 1: Storing a snapshot of everything at training time.

                      +
                        +
                      • +

                        This works and can help you revert, but it’s very hacky.

                        +
                      • +
                      • +

                        Rather than doing this entire process manually, let’s try to version +automatically.

                        +
                      • +
                      +

                      Level 2: Versioned as a mix of assets and code.

                      +
                        +
                      • +

                        You store the large files with unique IDs in S3, with corresponding +reference JSON versioned with code.

                        +
                      • +
                      • +

                        You should avoid storing the data directly in the repository, as the +metadata itself can get pretty large. Using git-lfs lets you store +them just as easily as code.

                        +
                      • +
                      • +

                        The git signature + of the raw data file fully defines a model’s +data and code.

                        +
                      • +
                      +

                      Level 3: Specialized solutions for version data.

                      +
                        +
                      • +

                        You should avoid them until you can identify their unique value add +to your project.

                        +
                      • +
                      • +

                        Some options here are DVC are +Pachyderm. DVC has +a Git-like workflow worth taking a closer look at. +Dolt versions databases, if +that’s your need.

                        +
                      • +
                      +

                      +

                      9 - Data Privacy

                      +

                      +

                      Increasingly, unfettered access to data for machine learning is less +desirable and prevalent. This is especially true in regulated industries +like healthcare and finance. To address such challenges, researchers are +developing new data privacy techniques.

                      +
                        +
                      • +

                        Federated +learning +trains a global model on several local devices without ever +acquiring global access to the data. Federated learning is still +research-use only due to these issues: (1) sending model updates +can be expensive, (2) the depth of anonymization is not clear, +and (3) system heterogeneity when it comes to training is +unacceptably high.

                        +
                      • +
                      • +

                        Another research area is differential +privacy, +which tries to aggregate data in ways that prevent identification. +Finally, learning on encrypted data has potential. Most data +privacy efforts are research-focused, as the tooling is not yet +mature.

                        +
                      • +
                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-9-notes-media/image1.png b/spring2021/lecture-9-notes-media/image1.png new file mode 100644 index 00000000..1e86627f Binary files /dev/null and b/spring2021/lecture-9-notes-media/image1.png differ diff --git a/spring2021/lecture-9-notes-media/image2.png b/spring2021/lecture-9-notes-media/image2.png new file mode 100644 index 00000000..034e27d0 Binary files /dev/null and b/spring2021/lecture-9-notes-media/image2.png differ diff --git a/spring2021/lecture-9-notes-media/image3.png b/spring2021/lecture-9-notes-media/image3.png new file mode 100644 index 00000000..1275752a Binary files /dev/null and b/spring2021/lecture-9-notes-media/image3.png differ diff --git a/spring2021/lecture-9-notes-media/image4.png b/spring2021/lecture-9-notes-media/image4.png new file mode 100644 index 00000000..ba1948ac Binary files /dev/null and b/spring2021/lecture-9-notes-media/image4.png differ diff --git a/spring2021/lecture-9-notes-media/image5.png b/spring2021/lecture-9-notes-media/image5.png new file mode 100644 index 00000000..1c6b3fcf Binary files /dev/null and b/spring2021/lecture-9-notes-media/image5.png differ diff --git a/spring2021/lecture-9-notes-media/image6.png b/spring2021/lecture-9-notes-media/image6.png new file mode 100644 index 00000000..1e83651a Binary files /dev/null and b/spring2021/lecture-9-notes-media/image6.png differ diff --git a/spring2021/lecture-9-notes-media/image7.png b/spring2021/lecture-9-notes-media/image7.png new file mode 100644 index 00000000..46fbef0e Binary files /dev/null and b/spring2021/lecture-9-notes-media/image7.png differ diff --git a/spring2021/lecture-9/index.html b/spring2021/lecture-9/index.html new file mode 100644 index 00000000..6af78b7b --- /dev/null +++ b/spring2021/lecture-9/index.html @@ -0,0 +1,3179 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Lecture 9: AI Ethics - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      + + + + + + + + +

                      Lecture 9: AI Ethics

                      +

                      Video

                      + + +

                      Slides

                      + + +

                      Download slides as PDF

                      +

                      Notes

                      +

                      Lecture by Sergey Karayev. +Notes transcribed by James Le +and Vishnu Rachakonda.

                      +

                      A preamble: Ethics is a vast subject spanning many disciplines and +addressing many real different problems. As ML practitioners, we need to +have a student mindset and do not assume we have the answers because +these are not easy problems.

                      +

                      1 - What is Ethics?

                      +

                      Let’s start with the definition of ethics:

                      +
                        +
                      • +

                        Ethics are not feelings because your feelings might mislead you.

                        +
                      • +
                      • +

                        Ethics are not laws because ethics can supersede laws.

                        +
                      • +
                      • +

                        Ethics are not societal beliefs because even an immoral society has +its set of ethics.

                        +
                      • +
                      +

                      Ethical Theories

                      +

                      Kevin Binz put together a tour of ethical +theories, +including:

                      +
                        +
                      • +

                        The divine command theory states that a behavior is moral if the +divine commands it. This theory might be accurate, but philosophy +doesn’t engage with it.

                        +
                      • +
                      • +

                        The virtue ethics theory states that a behavior is moral if it +upholds a person’s virtues (bravery, generosity, love, etc.). This +theory is apparently robust to philosophical inquiry, but there is +increasing evidence that virtues are not persistent across a +person’s life and somewhat illusory.

                        +
                      • +
                      • +

                        The deontology (duty-based) theory states that a behavior is +moral if it satisfies the categorical imperative (i.e., don’t lie, +don’t kill). This theory might lead to counter-intuitive moral +decisions in many situations and has unacceptable inflexibility to +many people.

                        +
                      • +
                      • +

                        The utilitarian theory states that a behavior is moral if it +brings the most good to the most people. But of course, how do we +measure utility?

                        +
                      • +
                      +

                      There does not seem to be a clear winner among professional +philosophers. From this +survey, there appears to +be an even split between virtue, deontology, and utilitarianism.

                      +

                      The Trolley Problem

                      +

                      The “trolley problem” is often used to gain intuition about a person’s +ethics by presenting to him/her a moral dilemma. The classic dilemma is +that: You see a trolley that is about to run over five people. But you +can divert it to run over only one person. Would you do it? It +actually leads to a lot of good memes. 🤣

                      +

                      +

                      Another prominent ethical theory is John Rawl’s theory of +justice. Rawls +argued that equal distribution of resources should be the desirable +state of nature instead of following utilitarian philosophies. A Theory +of Justice holds that every individual has an equal right to basic +liberties. They should have the right to opportunities and an equal +chance as other individuals of similar ability.

                      +

                      When ethics are applied to technology, it’s essential to understand that +they are not static and change with technological progress. Some +examples include the industrial revolution, the right to Internet +access, birth control, surrogate pregnancy, embryo selection, artificial +womb, lab-grown meat, and much more. An excellent book to explore is +Juan Enriquez’s “Right/Wrong: How Technology Transforms Our +Ethics.”

                      +

                      2 - Long-Term AI Ethical Problems

                      +

                      Autonomous Weapons

                      +

                      The first example that came to a lot of people’s minds is autonomous +weapons. It might be tempting to dismiss it as far-fetched and +unrealistic at this time. But as the saying goes, “the future is already +here, just not evenly distributed”:

                      + +

                      Lost Human Labor

                      +

                      Replacing human labor is another concern that has been creeping upon +us. With the pandemic, you probably saw many articles saying that +millions of people have lost jobs and probably will never get them back +(replaced by AI). This could be both good and bad. 🤔

                      +

                      It’s bad if there are no social safety net and no other jobs for the +unemployed. It’s good because there is a megatrend of the +demographic inversion. As the world’s population tops out and baby booms +vary across regions, the economy can’t function as currently designed. +Therefore, we need labor from somewhere. Rodney Brooks, a roboticist +from MIT and the founder of iRobot, advocates for having robots in +order to have a functioning +economy +in the next few decades.

                      +

                      An interesting spin on this worry is that AI is not necessarily +replacing human labor but controlling human labor. This article from +The +Verge +provides more details about working in conditions in warehouses, call +centers, and other sectors.

                      +

                      If you want to go down the rabbit hole, check out this series “Manna +- Two Views of Humanity’s Future” +from Marshall Brain.

                      +

                      Human Extinction

                      +

                      The final worry is that if AI is superintelligent, then it is capable of +replacing humans entirely.

                      +

                      The Alignment Problem

                      +

                      What’s common in all these long-term problems is the alignment +problem. This notion is often expressed by the parable of the +“paperclip +maximizer” - +given the goal of producing paperclips, an AGI will eventually turn +every atom space into paperclips. This is an old lesson about how to +establish and communicate our goals and values to technologies +precisely.

                      +

                      The guiding principle to build safe AI is that the AI systems we build +need to be aligned with our goals and values. This is a deep topic and +active research area in many places (including CHAI at +Berkeley). As a matter of fact, this +alignment lens is useful for near-term problems as well, as discussed in +the rest of the lecture.

                      +

                      +

                      3 - Hiring

                      +

                      Let’s say we are building an ML model to predict hiring decisions given +a resume (inspired by this Reuters +article +about Amazon’s hiring algorithm).

                      +

                      What should the data contain? Should it be the hiring decision that +was made? Or should it be the eventual job performance given the person +that was hired?

                      +

                      The data comes from the world, which is known to be biased in many +ways: the hiring pipeline (not enough women educated for a software +engineering job), the hiring decisions (employers intentionally or +unintentionally select people that match some prejudice), the +performance ratings (people get promoted not because they are good of +their job, but because they match other expectations of the promoter).

                      +

                      Because the world is biased, the data will be biased no matter how we +structure the data. Therefore, the model trained on that data will be +biased.

                      +

                      The model will be used to aid or make an action: sourcing candidates, +double-checking human decisions, or making the actual hiring decisions? +In the last case, that action will amplify existing biases.

                      +

                      Amplifying existing biases is not aligned with our goals and +values!😠

                      +

                      +

                      4 - Fairness

                      +

                      COMPAS

                      +

                      Let’s look at a case study about +COMPAS - +Correctional Offender Management Profiling for Alternative Sanctions +system to discuss fairness.

                      +
                        +
                      • +

                        The goal of this system is to predict recidivism (committing +another crime), such that judges can consult a 1-10 score in +pre-trial sentencing decisions.

                        +
                      • +
                      • +

                        The motivation of this system is to be less biased than humans +because the criminal justice system is notoriously biased against +certain races.

                        +
                      • +
                      • +

                        The solution of this system is to (1) gather relevant data, (2) +exclude protected class attributes (race, gender, age, etc.), (3) +train the model by ensuring that the model’s score corresponds to +the same probability of recidivism across all demographic groups.

                        +
                      • +
                      +

                      And yet, this famous ProPublica +report +exposes the bias of this system against blacks.

                      +

                      Fairness Definitions (From Aravind Narayanan’s Lecture)

                      +

                      There are a bunch of fairness definitions. The first one concerns bias. +We often mean statistical bias in machine learning - the difference +between the model’s expected value and the true value.

                      +
                        +
                      • +

                        In this sense, the COMPAS scores are not biased with respect to +re-arrest. This is an important caveat; because we only have data +for arrests, not crimes committed. There may well be bias in +arrests (the data-generating process).

                        +
                      • +
                      • +

                        Even if COMPAS is free of statistical bias, is it an adequate +fairness criterion? Is this criterion aligned with human +values?

                        +
                      • +
                      +

                      +

                      Taking a step back and look at the classic binary classification problem +setup, we have the confusion matrix as seen above. The interesting +question to ask is what do different stakeholders want from the +classifier?

                      +
                        +
                      • +

                        The decision-maker (the judge or the prosecutor) asks: “Of those +that I have labeled high risk, how many recidivated?” This +corresponds to the model’s predictive value = TP / (TP + FP).

                        +
                      • +
                      • +

                        The defendant asks: “What is the probability I’ll be incorrectly +classified as high risk?” This corresponds to the model’s false +positive rate = FP / (FP + FN).

                        +
                      • +
                      • +

                        The society at large might care about: “Is the selected set +demographically balanced?” This could be demographic parity, which +leads to the definition of group fairness (“Do outcomes differ +between groups, which we have no reason to believe are actually +different?”).

                        +
                      • +
                      +

                      A lot of these group fairness metrics have natural motivations, so +there’s not a single correct fairness definition. They depend on the +politics of the situation.

                      +

                      Let’s forget about demographic parity and only pick the two most +important metrics (false-positive rate and false-negative rate) while +allowing the model to use protected class attributes. We fail the +individual fairness definition, which uses a single threshold for +the sentencing decision or the pre-sentencing release decision.

                      +

                      Even if we pick one metric to optimize for, we still sacrifice some +utility (providing public safety or releasing too few defendants).

                      +

                      To build more intuition, you should play around with this +interactive demo on attacking discrimination with smarter +ML +from Google Research.

                      +

                      Finally, ML can be very good at finding patterns that maybe humans can’t +find. For instance, your ZIP code and age might be highly correlated +with your race. That means the model can always pick up from a protected +class attribute from other attributes. Read this paper on Equality +of Opportunity in Supervised +Learning for more detail.

                      +

                      Tradeoffs

                      +

                      There are tradeoffs between different measures of group fairness, +between the definitions of group fairness and individual fairness, and +between the notions of fairness and utility. In fact, these tradeoffs +are not specific to machine learning. They apply to human decision +making too. There is also a tension between disparate treatment and +disparate impact, which is another deep subject.

                      +

                      Seeing The Water

                      +

                      +

                      In order to see the water, it would be noteworthy to think about the +differences between environmental equity and environmental +justice:

                      +
                        +
                      • +

                        Equality: The assumption is that everyone benefits from the same +supports. This is equal treatment.

                        +
                      • +
                      • +

                        Equity: Everyone gets the support they need (“affirmative +action”), thus producing equity.

                        +
                      • +
                      • +

                        Justice: All parties are supported equally because the cause of +the inequity was addressed. The systematic barrier has been +removed.

                        +
                      • +
                      +

                      The justice mindset is valuable to have. As computer scientists, we have +very literal minds and argue for the rationality of our choices. But +taking a step back and seeing the whole situation would be even more +crucial.

                      +

                      5 - Representation

                      +

                      The Problem

                      +

                      Watch this simple +video: a +hand sanitizer dispenser that doesn’t recognize racially diverse hands. +It’s a small example but illustrates a big problem: a lack of +attention to diverse representation in the development of technology +products. This occurs across fields, such as drug development, +photography, etc. As pointed out by Timnit Gebru in this New York +Times +article, +the exclusion of people from certain backgrounds poses a serious +long-term threat to the viability of ML systems.

                      +

                      One way to address this challenge head-on is to focus on the inclusion +of people from all backgrounds. Groups like Black in +AI, Women in Machine +Learning, and Latinx in +AI play a big role in building +communities of underrepresented people and inviting them into the AI/ML +industry. Another is to deliberately ensure products reflect inclusive +values. For example, Google Images now yields a diverse set of images +for the search term “CEO” whereas it used to return entirely white, +middle-aged men.

                      +

                      Word Embeddings

                      +

                      A particularly relevant example of bias in machine learning is the +underlying bias in the Word2Vec model. +Word2Vec +introduced vector math for word embeddings and is frequently used for +NLP applications. The original model was trained on a large corpus, and +the weights were open-sourced. As these weights were examined, +underlying bias in the word logic was discovered. Terms like “doctor” +and “programmer” were associated with men, while “homemaker” and “nurse” +were associated with women. Translating our existing biases like these +into the ML domain is undesirable, to say the least! 😩

                      +

                      +

                      One potential solution to address this problem is to de-bias at +training time with carefully screened data. With newer models like +GPT-3 that are trained on massive swathes of data, this can be hard to +do in practice. Bender and Gebru advise in a 2021 +paper +to reduce the dependence on large, unfiltered datasets and more +carefully document the data-generating process. Alternatively, you can +alert the user proactively of potential bias. Addressing this problem of +bias in language models is an open problem.

                      +

                      Seeing The Water

                      +

                      Part of the challenge lies in agreeing on whether the model should +learn about the world as it is in the data or learn about the world in a +more idealistic manner. This is application-specific. A model +recognizing hate speech on Facebook should probably learn about the +world as it is, or a model interacting with humans’ conversations should +adhere to proper ideals. Of course, this begs the question of who +decides what ideals are desirable and suitable for a model to follow. +Consider these questions as you build models for various applications.

                      +

                      Ultimately, these challenges in machine learning systems development are +rooted in ethics. Face recognition is a boundary-breaking area +that has been grappling with ethical +concerns. +Importantly, face recognition illustrates how technology can impact +ethics and change standards. Is the loss of privacy associated with face +recognition desirable? Relatedly, are face recognition systems +performing well across groups? The question of performance should +generally follow ethics to avoid distracting from the fundamental +ethical issues (e.g., civil rights, privacy, etc.).

                      +

                      6 - Best Practices

                      +

                      A recent survey of ML +practitioners found these to be +the top challenges in ensuring fairness that they face:

                      +
                        +
                      1. +

                        Receiving support in fairness-aware data collection and curation

                        +
                      2. +
                      3. +

                        Overcoming team’s blind spots

                        +
                      4. +
                      5. +

                        Implementing more proactive fairness auditing processes

                        +
                      6. +
                      7. +

                        Auditing complex ML systems

                        +
                      8. +
                      9. +

                        Deciding how to address particular instances of unfairness

                        +
                      10. +
                      11. +

                        Addressing biases in the humans embedded throughout the ML +development pipeline

                        +
                      12. +
                      +

                      Suggestions

                      +

                      Rachel Thomas, the co-creator of Fast.ai, has some great ideas on +how to confront fairness issues +proactively:

                      +
                        +
                      1. +

                        Perform ethical risk sweeping. Akin to cybersecurity +penetration testing, where engineers intentionally try to find +faults, you can try to engage in regular fairness checks on behalf +of different stakeholders.

                        +
                      2. +
                      3. +

                        Expand the ethical circle. Try to consider different +perspectives than yours regularly, and invite such people into +your decision-making “circle” to ensure that systems do not lead +to unfair outcomes.

                        +
                      4. +
                      5. +

                        Think about worst-case scenarios. What incentives may crop up +for people to engage in unethical behavior? For example, the +upvote-downvote system and recommendations on Reddit can cause +toxic behavior. Think about such incentives and requisite +safeguards in advance.

                        +
                      6. +
                      7. +

                        Close the loop! You have to put in place a process to keep +improving, as fairness is not a static test (just like raw +performance).

                        +
                      8. +
                      +

                      One powerful tool, proposed by Gebru and +Mitchell in 2018, is adopting +“model +cards.” For every +ML model, make a simple page that discusses the expectations (i.e., +input/output), tradeoffs, performance, and known limitations. Engaging +in this documentation exercise allows for teams to confront fairness +issues head-on more effectively. The objective here is to get everyone +on the same page about what the model can and cannot do from a fairness +perspective. We believe everyone should do this, considering how easy +it is. Other methods like bias audits are also useful, as the +Aequitas +team at UChicago shows.

                      +

                      +

                      A Code of Ethics?

                      +

                      AI is a reflection of society. It’s impossible to expect AI to be +completely unbiased when humans still struggle with the problem. +However, we can try our best to ensure that these biases are not +amplified by AI and mitigate any such damage. Making fairness and ethics +a routine part of AI development by professionals and teams is crucial +to addressing the challenge. Perhaps an AI code of ethics (akin to +the Hippocratic Oath) would make sense!

                      +

                      7 - Where To Learn More

                      +

                      Here are some links to learn more:

                      +
                        +
                      1. https://ethics.fast.ai/: a course by the fast.ai team on practical data ethics consisting of 6 lectures.
                      2. +
                      3. CS 294: Fairness in Machine Learning: A graduate course (similar to FSDL) taught at Berkeley in 2017 about AI ethics.
                      4. +
                      5. Fair ML Book: A book being written by the instructor of the aforementioned course on fair ML.
                      6. +
                      7. KDD Tutorial on Fair ML: Taught by folks from CMU, this is a workshop addressing some of the topics in this lecture.
                      8. +
                      9. The Alignment Problem: a book that confronts present-day issues in AI alignment.
                      10. +
                      11. Weapons of Math Destruction: a popular book about current issues like Facebook’s News Feed.
                      12. +
                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/lecture-notes-pdfs/FSDL Spring 2021 - Deploying and Monitoring ML Models.pdf b/spring2021/lecture-notes-pdfs/FSDL Spring 2021 - Deploying and Monitoring ML Models.pdf new file mode 100644 index 00000000..0f57f25c Binary files /dev/null and b/spring2021/lecture-notes-pdfs/FSDL Spring 2021 - Deploying and Monitoring ML Models.pdf differ diff --git a/spring2021/lecture-notes-pdfs/FSDL Spring 2021 - ML Testing and Explainability.pdf b/spring2021/lecture-notes-pdfs/FSDL Spring 2021 - ML Testing and Explainability.pdf new file mode 100644 index 00000000..56e0b094 Binary files /dev/null and b/spring2021/lecture-notes-pdfs/FSDL Spring 2021 - ML Testing and Explainability.pdf differ diff --git a/spring2021/lecture-notes-pdfs/FSDL Spring 2021 - Machine-Learning-Teams.pdf b/spring2021/lecture-notes-pdfs/FSDL Spring 2021 - Machine-Learning-Teams.pdf new file mode 100644 index 00000000..aa1f8b5c Binary files /dev/null and b/spring2021/lecture-notes-pdfs/FSDL Spring 2021 - Machine-Learning-Teams.pdf differ diff --git a/spring2021/lecture-notes-pdfs/FSDL Spring 2021 - Research Directions.pdf b/spring2021/lecture-notes-pdfs/FSDL Spring 2021 - Research Directions.pdf new file mode 100644 index 00000000..c8b12259 Binary files /dev/null and b/spring2021/lecture-notes-pdfs/FSDL Spring 2021 - Research Directions.pdf differ diff --git a/spring2021/notebook-1/index.html b/spring2021/notebook-1/index.html new file mode 100644 index 00000000..b75de723 --- /dev/null +++ b/spring2021/notebook-1/index.html @@ -0,0 +1,2271 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Notebook: Coding a neural net - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Notebook: Coding a neural net

                      +

                      Video

                      + + +

                      Notebook by Sergey Karayev.

                      +

                      In this video, we code a neural network from scratch. You'll get familiar with the Google Colab environment, create a simple linear regression model using only Numpy, and build a multi-layer perception regression model using NumPy, PyTorch, and Keras.

                      +
                        +
                      • 0:30​ - Colab Notebook 101
                      • +
                      • 5:30​ - Numerical computing via NumPy
                      • +
                      • 10:15​ - Plotting via Matplotlib
                      • +
                      • 11:33​ - Basic regression with a linear model
                      • +
                      • 24:30​ - Basic regression with a multi-layer perceptron
                      • +
                      +

                      Follow Along

                      +

                      Google Colab

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/panel/index.html b/spring2021/panel/index.html new file mode 100644 index 00000000..0427c94b --- /dev/null +++ b/spring2021/panel/index.html @@ -0,0 +1,2250 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Panel Discussion: Do I need a PhD to work in ML? - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Panel Discussion: Do I need a PhD to work in ML?

                      +

                      We gathered a few people to offer different perspectives on whether grad school is required to work on interesting ML problems.

                      +

                      Moderated by Sergey Karayev.

                      +

                      The panelists:

                      +
                        +
                      • Pieter Abbeel - Professor at UC Berkeley
                      • +
                      • Georgia Gkioxari - Research Scientist at Facebook, formerly PhD Berkeley
                      • +
                      • Peter Gao - Co-founder and CEO of Aquarium Learning, formerly Cruise Automation
                      • +
                      • Anil Jason - Co-founder and CTO of Quillbot
                      • +
                      +

                      Video

                      + + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/projects/index.html b/spring2021/projects/index.html new file mode 100644 index 00000000..ffa92381 --- /dev/null +++ b/spring2021/projects/index.html @@ -0,0 +1,2574 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Course Projects Showcase - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      + + + + + + + + +

                      Course Projects Showcase

                      +

                      The final project is the most important as well as the most fun part of the course. +Students worked individually or in pairs over the duration of the course to complete a project involving any part of the full stack of deep learning.

                      +

                      The Top 10 projects, as selected by our course TAs, we viewed together with everyone, and posted the video on YouTube.

                      + + +

                      The details of these and other projects are below

                      +

                      Artificial Manga Panel Dataset

                      + +

                      Breast Cancer Detection Assistant

                      + +

                      Human Protein Atlas Single Cell Classification

                      + +

                      A Case Study on Weakly Supervised Learning

                      + +

                      Real-Time Nail Biting Alerter

                      + +

                      YouTube Highlighter for Creators and Their Supporters

                      + +

                      Neural Rock

                      + +

                      Unity Machine Learning Agents

                      + +

                      Sitting Posture Coach

                      + +

                      Intel Scene Classification

                      + +

                      Bird Song Classifier

                      + +

                      Detecting Deforestation from Satellite Images

                      + +

                      Active Learning: A Systematic Investigation

                      + +

                      Drought Watch

                      + + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/spring2021/synchronous/index.html b/spring2021/synchronous/index.html new file mode 100644 index 00000000..aeaa330d --- /dev/null +++ b/spring2021/synchronous/index.html @@ -0,0 +1,2391 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Synchronous Online Course - The Full Stack + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + + + +
                      + + + + +
                      + + +
                      + +
                      + + + + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + + +

                      Synchronous Online Course

                      +

                      We offered a paid cohort experience with the following additions to the lecture and lab materials released publicly:

                      +
                        +
                      • Slack workspace for learners, instructors, and teaching assistants
                      • +
                      • Weekly graded assignment
                      • +
                      • Capstone project reviewed by peers and staff
                      • +
                      • Certificate of completion
                      • +
                      +

                      Check out the original announcement page.

                      +

                      How do I know if I am in this course?

                      +

                      If you registered and received an email receipt from Stripe, you're in, and should have been added to our Slack workspace on February 1.

                      +

                      Please email us if you have a Stripe receipt but aren't in our Slack.

                      +

                      Teaching Assistants

                      +

                      This course is only possible with the support of our amazing TAs:

                      +
                        +
                      • Head TA James Le runs Data Relations for Superb AI and contributes to Data Journalism for Snorkel AI, after getting an MS in Recommendation Systems at RIT.
                      • +
                      • Daniel Cooper is a machine learning engineer at QuantumWork, SaaS for recruiters.
                      • +
                      • Han Lee is a Senior Data Scientist at WalletHub. Prior to that, he worked on various DS, MLE, and quant roles. Previously, he co-managed TEFQX.
                      • +
                      • Nadia Ahmed is a machine learning researcher with The Frontier Development Lab and Trillium Technologies in remote sensing for severe weather and flood events.
                      • +
                      • Andrew Mendez is a Senior Machine Learning Engineer at Clarifai, developing large scale computer vision and machine learning systems for the public sector. Previously he was a ML Engineer at CACI.
                      • +
                      • Vishnu Rachakonda is a Machine Learning Engineer at Tesseract Health, a retinal imaging company, where he builds machine learning models for workflow augmentation and diagnostics in on-device and cloud use cases.
                      • +
                      • Chester Chen is the Director of Data Science Engineering at GoPro. He also founded the SF Big Analytics Meetup.
                      • +
                      +

                      Schedule

                      +

                      While we post lectures once a week starting February 1, the first four weeks are review lectures -- stuff you should already know from other courses.

                      +

                      On March 1, we get to the Full Stack content, and you will begin doing weekly assignments, discussing in Slack, and thinking about their course project.

                      +

                      Logistics

                      +

                      All learners, instructors, and TAs will be part of a Slack workspace. The Slack community is a crucial part of the course: a place to meet each other, post helpful links, share experiences, ask and answer questions.

                      +

                      On Monday, we post the lecture and lab videos for you to watch. Post questions, ideas, articles in Slack as you view the materials.

                      +

                      On Thursday, we go live on Zoom to discuss the posted questions and ideas. +We have two 30-min slots: 9am and 6pm Pacific Time. +We will send everyone a Google Calendar invite with the Zoom meeting information.

                      +

                      You have until Friday night to finish the assignment via Gradescope, which will be graded by next Tuesday, so that you have prompt feedback.

                      +

                      Labs are not graded and can be done on your own.

                      +

                      Projects

                      +

                      The final project is the most important as well as the most fun part of the course. You can pair up or work individually. The project can involve any part of the full stack of deep learning, and should take you roughly 40 hours per person, over 5 weeks.

                      +

                      Projects will be presented as five-minute videos and associated reports, and open sourcing the code is highly encouraged. All projects will be posted for peer and staff review.

                      +

                      The best projects will be awarded and publicized by Full Stack Deep Learning.

                      +

                      If you want to find a partner, please post in the #spring2021-projects Slack channel with your idea or just that you're available to pair up.

                      +

                      Project proposals are due on Gradescope a few weeks into the course.

                      +

                      Please read more information about the projects.

                      +

                      Certificate

                      +

                      Those who complete the assignments and project will receive a certificate that can, for example, be displayed on LinkedIn.

                      +

                      Time Commitment

                      +

                      On average, expect to spend 5-10 hours per week on the course.

                      + + + + + + + + + +
                      +
                      +

                      We are excited to share this course with you for free.

                      +

                      + We have more upcoming great content. + Subscribe to stay up to date as we release it. +

                      +

                      +

                      + +
                      + +
                      +

                      +

                      + + We take your privacy and attention very seriously and will never spam you. + + I am already a subscriber +

                      +
                      +
                      + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + \ No newline at end of file diff --git a/stylesheets/extra.css b/stylesheets/extra.css new file mode 100644 index 00000000..b27a4511 --- /dev/null +++ b/stylesheets/extra.css @@ -0,0 +1,480 @@ +/* + tailwind-ish +*/ + +.row { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(360px, 1fr)); + grid-gap: 1rem; + align-items: flex-start; +} + +.flex { + display: flex; +} + +.flex-col { + flex-direction: column; +} + +.flex-wrap { + flex-wrap: wrap; +} + +.gap-x-4 { + column-gap: 2rem; +} + +.gap-x-8 { + column-gap: 4rem; +} + + +.items-center { + align-items: center; +} + +.justify-center { + justify-content: center; +} + +.m-auto { + margin: auto; +} + +.m-0 { + margin: 0px; +} + +.mb-2 { + margin-bottom: 1rem; +} + +.mb-4 { + margin-bottom: 2rem; +} + +.mb-8 { + margin-bottom: 4rem; +} + +.mt-4 { + margin-top: 2rem; +} + +.mx-2 { + margin-left: 0.8rem; + margin-right: 0.8rem; +} + +.mx-4 { + margin-left: 2rem; + margin-right: 2rem; +} + +.mx-8 { + margin-left: 4rem; + margin-right: 4rem; +} + +.pb-2 { + padding-bottom: 1rem; +} + +.pt-2 { + padding-top: 1rem; +} + +.pt-4 { + padding-top: 2rem; +} + +.px-2 { + padding-left: 0.8rem; + padding-right: 0.8rem; +} + +.px-4 { + padding-left: 2rem; + padding-right: 2rem; +} + +.py-4 { + padding-top: 2rem; + padding-bottom: 2rem; +} + +.font-large { + font-size: large; +} + +.flex { + display: flex; +} + +.justify-between { + justify-content: space-between; +} + +.grid-2 { + display: grid; + grid-template-columns: repeat(1, 1fr); + grid-template-rows: 1fr; + grid-column-gap: 1rem; + grid-row-gap: 1rem; +} + +@media (min-width: 768px) { + .grid-2 { + grid-template-columns: repeat(2, 1fr); + } +} + +/* 3 columns if on desktop, 2 column if on mobile */ +.grid-3 { + display: grid; + grid-template-columns: repeat(2, 1fr); + grid-template-rows: 1fr; + grid-column-gap: 0.5rem; + grid-row-gap: 0.5rem; +} + +@media (min-width: 768px) { + .grid-3 { + grid-template-columns: repeat(3, 1fr); + } +} + +.subtitle { + font-size: 1.25rem; + font-weight: 500; + margin-bottom: 0.5rem; +} + +/* + pricing and tiers +*/ + +.pricing { + display: grid; + grid-template-columns: repeat(1, minmax(0, 1fr)); + grid-gap: 1rem; +} + +@media (min-width: 768px) { + .pricing { + grid-template-columns: repeat(2, minmax(0, 1fr)); + } +} + +.tiers { + display: grid; + grid-template-columns: repeat(1, 1fr); + grid-template-rows: 1fr; + grid-column-gap: 10px; + grid-row-gap: 0px; +} + +@media (min-width: 768px) { + .tiers { + grid-template-columns: repeat(2, 1fr); + } +} + +.tier { + margin-top: 0.5rem; + margin-bottom: 0.5rem; + display: flex; + flex-direction: column; + border-radius: 0.375rem; + border-width: 1px; + border-color: rgb(203 213 225); + border-style: solid; + padding: 1rem; + text-align: center; +} + +.tier--header { + font-weight: bold; + font-size: 1.25rem; + line-height: 1.75rem; + margin-bottom: 0.5rem; +} + +.tier--price { + font-size: 1.125rem; + line-height: 1.75rem; + margin-bottom: 0.5rem; +} + +.tier--priceCaption { + font-size: 0.75rem; + line-height: 1rem; +} + +.tier--priceCaption div { + margin-bottom: 0.25rem; +} + +.tier--features { + text-align: left; + margin-bottom: 0.5rem; +} + +.highlight { + color: rgb(59 130 246); +} + +/* + admonitions +*/ + +.md-typeset .admonition, +.md-typeset details { + font-size: inherit; +} + +.admonition.abstract p, +.admonition.abstract ul { + font-size: large; +} +[dir="ltr"] .md-typeset .admonition.abstract .admonition-title { + padding-left: 0.6rem; + text-align: center; +} +.admonition.abstract .admonition-title::before { + display: none; +} +.md-typeset .admonition.abstract > :last-child { + margin-bottom: 1.25rem; +} + +/* + tables +*/ + +.md-typeset table:not([class]) { + font-size: inherit; + line-height: inherit; +} + +.md-typeset table:not([class]) td { + vertical-align: middle; +} + +.md-typeset table:not([class]) th { + white-space: nowrap; + min-width: 5rem; +} + +/* + circular images +*/ + +.person { + display: grid; + align-items: center; + grid-template-columns: 100px 1fr; + grid-column-gap: 1rem; +} + +.person--image { + border-radius: 50%; + border: 4px solid #4350af; + background-color: white; +} + +.logo--image { + border-radius: 50%; + border: 4px solid #4350af; + background-color: white; +} + +/* + interactive tables +*/ + +.handsontable .htDimmed { + color: #000; +} +.handsontable { + z-index: 0; +} + + +/* +email subscription modal +*/ + +#emailModal { + display: none; + position: fixed; + z-index: 1000; + left: 0; + top: 0; + width: 100%; + height: 100%; + overflow: auto; + background-color: rgba(0, 0, 0, 0.5); +} + +#emailModal div, #emailModal input { + /* Override dark/light theming */ + background-color: #fefefe; + color: #111; +} + +#emailModal > div { + margin: 15% auto; + padding: 30px; + border: 1px solid #888; + width: 67%; + max-width: 960px; +} + +#emailModal p, +#emailModal h2 { + margin-top: 0; +} + +#emailModal p:last-child { + margin-bottom: 0; +} + +#emailInput { + width: 100%; + padding: 12px 20px; + margin-bottom: 1rem; + border: 1px solid #ccc; +} + +#submitEmail { + width: 100%; +} + +/* + card display of images for videos +*/ + +a.video-card > img { + filter: drop-shadow(0 0 0.1rem #1c1c1c); + border-radius: 0.1rem; + aspect-ratio: 16 / 9; + width: 100%; + object-fit: cover; +} + +a.video-card:hover > img { + filter: brightness(0.85) drop-shadow(0 0 0.1rem #1c1c1c); +} + +/* + blog +*/ +.md-typeset .blog-post-description { + margin-bottom: 0; +} + +/* + assorted +*/ + +.whitespace-nowrap { + white-space: nowrap; +} + +h1.h1-with-author { + margin: 0; +} + +.twitter { + color: #1da1f2; +} + +.youtube { + color: #ff0000; +} + +h1.title { + /* color: white; */ + font-weight: bold; +} + +h2 { + font-size: 1.5rem; + font-weight: normal; +} + +.md-typeset a { + font-weight: bold; +} + +.md-banner__inner { + text-align: center; +} + +.md-banner__inner a { + color: white; +} + +#mc_embed_signup input.email { + width: auto !important; +} + +.testimonials { + display: grid; + grid-template-columns: 1fr; + grid-column-gap: 0.5rem; + grid-row-gap: 0.25rem; +} + +@media (min-width: 768px) { + .testimonials { + grid-template-columns: 1fr 1fr; + } +} + +.primary-swapped-bg-fg .md-button--primary { + background-color: #D17A28; + color: var(--md-primary-bg-color); + border-color: #D17A28; + border-radius: 0.5rem; +} + +.primary-swapped-bg-fg .md-button--primary:hover { + color: var(--md-primary-bg-color) +} + +.disabled-button { + pointer-events: none; +} + +.primary-swapped-bg-fg { + background-color: var(--md-primary-fg-color); + color: var(--md-primary-bg-color) +} + +.primary-swapped-bg-fg a { + color: var(--md-primary-bg-color) +} + +/* swap order of elements on desktop */ +.swap-last { + order: 9999; +} + +@media (min-width: 768px) { + .swap-last { + order: -9999; + } +} + +.swap-first { + order: -9999; +} + +@media (min-width: 768px) { + .swap-first { + order: 9999; + } +}

The in-person event will be at the +South San Francisco Conference Center.