% Generated by Paperpile. Check out http://paperpile.com for more information.
% BibTeX export options can be customized via Settings -> BibTeX.
@ARTICLE{Pascanu2017-jl,
title = "Learning model-based planning from scratch",
author = "Pascanu, Razvan and Li, Yujia and Vinyals, Oriol and Heess,
Nicolas and Buesing, Lars and Racani{\`e}re, Sebastien and
Reichert, David and Weber, Th{\'e}ophane and Wierstra, Daan
and Battaglia, Peter",
abstract = "Conventional wisdom holds that model-based planning is a
powerful approach to sequential decision-making. It is often
very challenging in practice, however, because while a model
can be used to evaluate a plan, it does not prescribe how to
construct a plan. Here we introduce the ``Imagination-based
Planner'', the first model-based, sequential decision-making
agent that can learn to construct, evaluate, and execute
plans. Before any action, it can perform a variable number
of imagination steps, which involve proposing an imagined
action and evaluating it with its model-based imagination.
All imagined actions and outcomes are aggregated,
iteratively, into a ``plan context'' which conditions future
real and imagined actions. The agent can even decide how to
imagine: testing out alternative imagined actions, chaining
sequences of actions together, or building a more complex
``imagination tree'' by navigating flexibly among the
previously imagined states using a learned policy. And our
agent can learn to plan economically, jointly optimizing for
external rewards and computational costs associated with
using its imagination. We show that our architecture can
learn to solve a challenging continuous control problem, and
also learn elaborate planning strategies in a discrete
maze-solving task. Our work opens a new direction toward
learning the components of a model-based planning system and
how to use them.",
month = "19~" # jul,
year = 2017,
archivePrefix = "arXiv",
primaryClass = "cs.AI",
eprint = "1707.06170"
}
@ARTICLE{Luo_undated-qj,
title = "Learning Deep Architectures via Generalized Whitened Neural
Networks",
author = "Luo, Ping"
}
@ARTICLE{Cisse2017-rg,
title = "Houdini: Fooling Deep Structured Prediction Models",
author = "Cisse, Moustapha and Adi, Yossi and Neverova, Natalia and
Keshet, Joseph",
abstract = "Generating adversarial examples is a critical step for
evaluating and improving the robustness of learning
machines. So far, most existing methods only work for
classification and are not designed to alter the true
performance measure of the problem at hand. We introduce a
novel flexible approach named Houdini for generating
adversarial examples specifically tailored for the final
performance measure of the task considered, be it
combinatorial and non-decomposable. We successfully apply
Houdini to a range of applications such as speech
recognition, pose estimation and semantic segmentation. In
all cases, the attacks based on Houdini achieve higher
success rate than those based on the traditional surrogates
used to train the models while using a less perceptible
adversarial perturbation.",
month = "17~" # jul,
year = 2017,
archivePrefix = "arXiv",
primaryClass = "stat.ML",
eprint = "1707.05373"
}
@ARTICLE{Melis2017-jk,
title = "On the State of the Art of Evaluation in Neural Language
Models",
author = "Melis, G{\'a}bor and Dyer, Chris and Blunsom, Phil",
abstract = "Ongoing innovations in recurrent neural network
architectures have provided a steady influx of apparently
state-of-the-art results on language modelling benchmarks.
However, these have been evaluated using differing code
bases and limited computational resources, which represent
uncontrolled sources of experimental variation. We
reevaluate several popular architectures and regularisation
methods with large-scale automatic black-box hyperparameter
tuning and arrive at the somewhat surprising conclusion that
standard LSTM architectures, when properly regularised,
outperform more recent models. We establish a new state of
the art on the Penn Treebank and Wikitext-2 corpora, as well
as strong baselines on the Hutter Prize dataset.",
month = "18~" # jul,
year = 2017,
archivePrefix = "arXiv",
primaryClass = "cs.CL",
eprint = "1707.05589"
}
@ARTICLE{Tozzi2017-ah,
title = "Topodynamics of metastable brains",
author = "Tozzi, Arturo and Peters, James F and Fingelkurts, Andrew A
and Fingelkurts, Alexander A and Mariju{\'a}n, Pedro C",
affiliation = "Center for Nonlinear Science, University of North Texas, 1155
Union Circle, \#311427, Denton, TX 76203-5017, USA. Electronic
address: [email protected]. Department of Electrical and
Computer Engineering, University of Manitoba, 75A Chancellor's
Circle Winnipeg, MB R3T 5V6 Canada; Department of Mathematics,
Ad{\i}yaman University, 02040 Ad{\i}yaman, Turkey. Electronic
address: [email protected]. BM-Science - Brain and
Mind Technologies Research Centre, Espoo, Finland. Electronic
address: [email protected]. BM-Science - Brain
and Mind Technologies Research Centre, Espoo, Finland.
Electronic address: [email protected].
Bioinformation Group, Aragon Institute of Health Science
(IACS), Aragon Health Research Institute (IIS Aragon),
Zaragoza, 50009 Spain.",
abstract = "The brain displays both the anatomical features of a vast
amount of interconnected topological mappings as well as the
functional features of a nonlinear, metastable system at the
edge of chaos, equipped with a phase space where mental random
walks tend towards lower energetic basins. Nevertheless, with
the exception of some advanced neuro-anatomic descriptions and
present-day connectomic research, very few studies have been
addressing the topological path of a brain embedded or
embodied in its external and internal environment. Herein, by
using new formal tools derived from algebraic topology, we
provide an account of the metastable brain, based on the
neuro-scientific model of Operational Architectonics of
brain-mind functioning. We introduce a ``topodynamic''
description that shows how the relationships among the
countless intertwined spatio-temporal levels of brain
functioning can be assessed in terms of projections and
mappings that take place on abstract structures, equipped with
different dimensions, curvatures and energetic constraints.
Such a topodynamical approach, apart from providing a
biologically plausible model of brain function that can be
operationalized, is also able to tackle the issue of a
long-standing dichotomy: it throws indeed a bridge between the
subjective, immediate datum of the na{\"\i}ve complex of
sensations and mentations and the objective, quantitative,
data extracted from experimental neuro-scientific procedures.
Importantly, it opens the door to a series of new predictions
and future directions of advancement for neuroscientific
research.",
journal = "Phys. Life Rev.",
month = "23~" # mar,
year = 2017,
keywords = "Borsuk--Ulam theorem; Central nervous system; Mind; Nonlinear
dynamics; Topology",
language = "en"
}
@ARTICLE{Gomez2017-pn,
title = "The Reversible Residual Network: Backpropagation Without
Storing Activations",
author = "Gomez, Aidan N and Ren, Mengye and Urtasun, Raquel and
Grosse, Roger B",
abstract = "Deep residual networks (ResNets) have significantly pushed
forward the state-of-the-art on image classification,
increasing in performance as networks grow both deeper and
wider. However, memory consumption becomes a bottleneck, as
one needs to store the activations in order to calculate
gradients using backpropagation. We present the Reversible
Residual Network (RevNet), a variant of ResNets where each
layer's activations can be reconstructed exactly from the
next layer's. Therefore, the activations for most layers
need not be stored in memory during backpropagation. We
demonstrate the effectiveness of RevNets on CIFAR-10,
CIFAR-100, and ImageNet, establishing nearly identical
classification accuracy to equally-sized ResNets, even
though the activation storage requirements are independent
of depth.",
month = "14~" # jul,
year = 2017,
archivePrefix = "arXiv",
primaryClass = "cs.CV",
eprint = "1707.04585"
}
@ARTICLE{Teh2017-bq,
title = "Distral: Robust Multitask Reinforcement Learning",
author = "Teh, Yee Whye and Bapst, Victor and Czarnecki, Wojciech
Marian and Quan, John and Kirkpatrick, James and Hadsell,
Raia and Heess, Nicolas and Pascanu, Razvan",
abstract = "Most deep reinforcement learning algorithms are data
inefficient in complex and rich environments, limiting their
applicability to many scenarios. One direction for improving
data efficiency is multitask learning with shared neural
network parameters, where efficiency may be improved through
transfer across related tasks. In practice, however, this is
not usually observed, because gradients from different tasks
can interfere negatively, making learning unstable and
sometimes even less data efficient. Another issue is the
different reward schemes between tasks, which can easily
lead to one task dominating the learning of a shared model.
We propose a new approach for joint training of multiple
tasks, which we refer to as Distral (Distill \& transfer
learning). Instead of sharing parameters between the
different workers, we propose to share a ``distilled''
policy that captures common behaviour across tasks. Each
worker is trained to solve its own task while constrained to
stay close to the shared policy, while the shared policy is
trained by distillation to be the centroid of all task
policies. Both aspects of the learning process are derived
by optimizing a joint objective function. We show that our
approach supports efficient transfer on complex 3D
environments, outperforming several related methods.
Moreover, the proposed learning process is more robust and
more stable---attributes that are critical in deep
reinforcement learning.",
month = "13~" # jul,
year = 2017,
archivePrefix = "arXiv",
primaryClass = "cs.LG",
eprint = "1707.04175"
}
@INPROCEEDINGS{Fernandes2015-fn,
title = "A Proactive Intelligent Decision Support System for Predicting
the Popularity of Online News",
booktitle = "Progress in Artificial Intelligence",
author = "Fernandes, Kelwin and Vinagre, Pedro and Cortez, Paulo",
abstract = "Due to the Web expansion, the prediction of online news
popularity is becoming a trendy research topic. In this paper,
we propose a novel and proactive Intelligent Decision Support
System (IDSS) that analyzes articles prior to their
publication. Using a broad set of extracted features (e.g.,
keywords, digital media content, earlier popularity of news
referenced in the article) the IDSS first predicts if an
article will become popular. Then, it optimizes a subset of the
articles features that can more easily be changed by authors,
searching for an enhancement of the predicted popularity
probability. Using a large and recently collected dataset, with
39,000 articles from the Mashable website, we performed a
robust rolling windows evaluation of five state of the art
models. The best result was provided by a Random Forest with a
discrimination power of 73\%. Moreover, several stochastic hill
climbing local searches were explored. When optimizing 1000
articles, the best optimization method obtained a mean gain
improvement of 15 percentage points in terms of the estimated
popularity probability. These results attest the proposed IDSS
as a valuable tool for online news authors.",
publisher = "Springer, Cham",
pages = "535--546",
month = "8~" # sep,
year = 2015,
keywords = "Viralization",
language = "en",
conference = "Portuguese Conference on Artificial Intelligence"
}
@ARTICLE{Lloret2016-ze,
title = "Analysing and evaluating the task of automatic tweet generation:
Knowledge to business",
author = "Lloret, Elena and Palomar, Manuel",
abstract = "In this paper a study concerning the evaluation and analysis of
natural language tweets is presented. Based on our experience in
text summarisation, we carry out a deep analysis on user's
perception through the evaluation of tweets manual and
automatically generated from news. Specifically, we consider two
key issues of a tweet: its informativeness and its
interestingness. Therefore, we analyse: (1) do users equally
perceive manual and automatic tweets?; (2) what linguistic
features a good tweet may have to be interesting, as well as
informative? The main challenge of this proposal is the analysis
of tweets to help companies in their positioning and reputation
on the Web. Our results show that: (1) automatically informative
and interesting natural language tweets can be generated as a
result of summarisation approaches; and (2) we can characterise
good and bad tweets based on specific linguistic features not
present in other types of tweets.",
journal = "Comput. Ind.",
volume = 78,
pages = "3--15",
month = "1~" # may,
year = 2016,
keywords = "Natural language processing; Text summarisation; Natural language
tweet generation; User study; Linguistic analysis; Descriptive
statistics; Viralization"
}
@ARTICLE{Varol2017-el,
title = "Analyzing Social Big Data to Study Online Discourse and Its
Manipulation",
author = "Varol, Onur",
abstract = "The widespread use of social media helps people connect and
share their opinions and experiences with millions of others,
while simultaneously bringing new threats. This dissertation
aims to provide insights into analysis of online conversations
and mechanisms that might be used for their manipulation. The
first part delves into the effect of geography on information
dissemination and user roles in online discourse. I study
trending topics on Twitter to highlight mechanisms governing the
diffusion of local and national trends. My analysis points to
three locally geographic regions and one cluster that contains
trendsetting cities coinciding with major travel hubs. When
factors limiting information spread are considered, censorship
mechanisms mandated by governments are found to be ineffective
and even show a correlation with increasing popularity. I also
present an analysis of spatiotemporal characteristics and
distinct user roles in the Gezi movement. Next, I discuss
different forms of social media manipulation. Malicious entities
can employ promotion campaigns and social bots. We build machine
learning frameworks that exploit features extracted from
network, content, and users to train accurate supervised
learning models. Our system for early detection of promoted
social media trends harnesses multidimensional time series
signals to reveal subtle differences between promoted and
organic trends. In my research on social bots, I carried out the
largest study of the human-bot ecosystem to date. Our estimates
suggest that between 9 and 15\% of active Twitter accounts are
bots. I present distinct behavioral groups and interaction
strategies among human and bot accounts. This body of work
contributes to a more comprehensive understanding of online user
behavior and to the development of systems to detect online
abuse.",
publisher = "[Bloomington, Ind.] : Indiana University",
month = jun,
year = 2017,
keywords = "Network Science; Social Media Analysis; Bot Detection; Doctoral
Dissertation; Viralization",
language = "en"
}
@ARTICLE{He2016-em,
title = "Deep Reinforcement Learning with a Combinatorial Action
Space for Predicting Popular Reddit Threads",
author = "He, Ji and Ostendorf, Mari and He, Xiaodong and Chen,
Jianshu and Gao, Jianfeng and Li, Lihong and Deng, Li",
abstract = "We introduce an online popularity prediction and tracking
task as a benchmark task for reinforcement learning with a
combinatorial, natural language action space. A specified
number of discussion threads predicted to be popular are
recommended, chosen from a fixed window of recent comments
to track. Novel deep reinforcement learning architectures
are studied for effective modeling of the value function
associated with actions comprised of interdependent
sub-actions. The proposed model, which represents dependence
between sub-actions through a bi-directional LSTM, gives the
best performance across different experimental
configurations and domains, and it also generalizes well
with varying numbers of recommendation requests.",
month = "12~" # jun,
year = 2016,
keywords = "Viralization",
archivePrefix = "arXiv",
primaryClass = "cs.CL",
eprint = "1606.03667"
}
@INPROCEEDINGS{Wang2015-la,
title = "{I} Can Has Cheezburger? A Nonparanormal Approach to Combining
Textual and Visual Information for Predicting and Generating
Popular Meme Descriptions",
booktitle = "{HLT-NAACL}",
author = "Wang, William Yang and Wen, Miaomiao",
pages = "355--365",
year = 2015,
keywords = "Viralization"
}
@ARTICLE{Farzindar2015-vx,
title = "Natural Language Processing for Social Media",
author = "Farzindar, Atefeh and Inkpen, Diana",
abstract = "Abstract In recent years, online social networking has
revolutionized interpersonal communication. The newer research
on language analysis in social media has been increasingly
focusing on the latter's impact on our daily lives, both on a
personal and a professional level. Natural language processing
(NLP) is one of the most promising avenues for social media data
processing. It is a scientific challenge to develop powerful
methods and algorithms which extract relevant information from a
large volume of data coming from multiple sources and languages
in various formats or in free form. We discuss the challenges in
analyzing social media texts in contrast with traditional
documents. Research methods in information extraction, automatic
categorization and clustering, automatic summarization and
indexing, and statistical machine translation need to be adapted
to a new kind of data. This book reviews the current research on
Natural Language Processing (NLP) tools and methods for
processing the non-traditional information from social media
data that is available in large amounts (big data), and shows
how innovative NLP approaches can integrate appropriate
linguistic information in various fields such as social media
monitoring, health care, business intelligence, industry,
marketing, and security and defense. We review the existing
evaluation metrics for NLP and social media applications, and
the new efforts in evaluation campaigns or shared tasks on new
datasets collected from social media. Such tasks are organized
by the Association for Computational Linguistics (such as
SemEval tasks) or by the National Institute of Standards and
Technology via the Text REtrieval Conference (TREC) and the Text
Analysis Conference (TAC). In the concluding chapter, we discuss
the importance of this dynamic discipline and its great
potential for NLP in the coming decade, in the context of
changes in mobile technology, cloud computing, and social
networking. Table of Contents: Preface / Acknowledgments /
Introduction to Social Media Analysis / Linguistic
Pre-processing of Social Media Texts
/ Semantic Analysis of Social Media Texts / Applications of
Social Media Text Analysis / Data Collection, Annotation, and
Evaluation / Conclusion and Perspectives / Glossary /
Bibliography / Authors' Biographies",
journal = "Synthesis Lectures on Human Language Technologies",
publisher = "Morgan \& Claypool Publishers",
volume = 8,
number = 2,
pages = "1--166",
month = "28~" # aug,
year = 2015,
keywords = "Viralization"
}
@ARTICLE{Tan2014-eo,
title = "The effect of wording on message propagation: Topic- and
author-controlled natural experiments on Twitter",
author = "Tan, Chenhao and Lee, Lillian and Pang, Bo",
abstract = "Consider a person trying to spread an important message on a
social network. He/she can spend hours trying to craft the
message. Does it actually matter? While there has been
extensive prior work looking into predicting popularity of
social-media content, the effect of wording per se has
rarely been studied since it is often confounded with the
popularity of the author and the topic. To control for these
confounding factors, we take advantage of the surprising
fact that there are many pairs of tweets containing the same
url and written by the same user but employing different
wording. Given such pairs, we ask: which version attracts
more retweets? This turns out to be a more difficult task
than predicting popular topics. Still, humans can answer
this question better than chance (but far from perfectly),
and the computational methods we develop can do better than
both an average human and a strong competing method trained
on non-controlled data.",
month = "6~" # may,
year = 2014,
keywords = "Viralization",
archivePrefix = "arXiv",
primaryClass = "cs.SI",
eprint = "1405.1438"
}
@MISC{noauthor_undated-pl,
title = "Want to be retweeted more? - Home",
abstract = "A demo for the paper The effect of wording on message
propagation: Topic- and author-controlled natural experiments
on Twitter by Chenhao Tan, Lillian Lee and Bo Pang. It can be
used to predict which tweet will be retweeted more among a
pair of tweets on the same topic.",
howpublished = "\url{https://chenhaot.com/retweetedmore/}",
note = "Accessed: 2017-7-14",
keywords = "Viralization"
}
% The entry below contains non-ASCII chars that could not be converted
% to a LaTeX equivalent.
@MISC{noauthor_undated-fa,
title = "Using Deep Learning at Scale in Twitter’s Timelines",
abstract = "Using Deep Learning at Scale in Twitter’s Timelines",
howpublished = "\url{https://blog.twitter.com/engineering/en_us/topics/insights/2017/using-deep-learning-at-scale-in-twitters-timelines.html}",
note = "Accessed: 2017-7-14",
keywords = "Viralization"
}
@MISC{Novet2017-nh,
title = "Twitter is now using a trendy type of {AI} to figure out
which tweets to show you",
booktitle = "{CNBC}",
author = "Novet, Jordan",
abstract = "Twitter has started using artificial intelligence to do a
better job of recommending relevant tweets at the top of
users' timelines.",
publisher = "CNBC",
month = "29~" # mar,
year = 2017,
howpublished = "\url{http://www.cnbc.com/2017/05/09/twitter-using-deep-learning-ai-to-rank-tweets.html}",
note = "Accessed: 2017-7-14",
keywords = "Viralization"
}
% The entry below contains non-ASCII chars that could not be converted
% to a LaTeX equivalent.
@MISC{Brownlee2016-ze,
title = "{MIT’s} {DeepDrumpf} Twitter Bot Uses Neural Networks To
Tweet Like Donald Trump",
booktitle = "{Co.Design}",
author = "Brownlee, John",
abstract = "And don’t worry. DeepLearnTheBern is next!",
publisher = "Co.Design",
month = "4~" # mar,
year = 2016,
howpublished = "\url{https://www.fastcodesign.com/3057501/mits-deepdrumpf-twitter-bot-uses-neural-networks-to-tweet-like-donald-trump}",
note = "Accessed: 2017-7-14",
keywords = "Viralization"
}
@ARTICLE{Stokowiec_undated-cg,
title = "Shallow reading with Deep Learning: Predicting popularity of
online content using only its title",
author = "Stokowiec, Wojciech and Trzci, Tomasz and Lk, Krzysztof Wo and
Marasek, Krzysztof and Rokita, Przemys Law",
keywords = "Viralization"
}
@MISC{noauthor_undated-gz,
title = "Buzzfeed Title Generator - Ravi Parikh's Website",
abstract = "Official website of Ravi Parikh.",
howpublished = "\url{http://www.ravi.io/buzzfeed-title-generator?mode=history}",
note = "Accessed: 2017-7-14",
keywords = "Viralization"
}
@MISC{noauthor_undated-mz,
title = "Buzzfeed \& Upworthy Clickbait Headline Generator",
abstract = "Looking for linkbait \& clickbait headline ideas for your
blog? Get popular Buzzfeed \& Upworthy article headlines
along with their social share count.",
howpublished = "\url{http://www.contentforest.com/copywriting-tools/clickbait-headline-generator}",
note = "Accessed: 2017-7-14",
keywords = "Viralization"
}
% The entry below contains non-ASCII chars that could not be converted
% to a LaTeX equivalent.
@MISC{Larseidnes2015-ti,
title = "{Auto-Generating} Clickbait With Recurrent Neural Networks",
booktitle = "Lars Eidnes' blog",
author = "{larseidnes}",
abstract = "``F.D.R.'s War Plans!'' reads a headline from a 1941 Chicago
Daily Tribune. Had this article been written today, it might
rather have said ``21 War Plans F.D.R. Does Not Want You To
Know About. Number 6 may shock you!''. Modern writers have
become very good at squeezing out the maximum clickability
out of every headline.…",
month = "13~" # oct,
year = 2015,
howpublished = "\url{https://larseidnes.com/2015/10/13/auto-generating-clickbait-with-recurrent-neural-networks/}",
note = "Accessed: 2017-7-14",
keywords = "Viralization"
}
@ARTICLE{Blom2015-tv,
title = "Click bait: Forward-reference as lure in online news headlines",
author = "Blom, Jonas Nygaard and Hansen, Kenneth Reinecke",
abstract = "This is why you should read this article. Although such an
opening statement does not make much sense read in isolation,
journalists often write headlines like this on news websites.
They use the forward-referring technique as a stylistic and
narrative luring device trying to induce anticipation and
curiosity so the readers click (or tap on) the headline and read
on. In this article, we map the use of forward-referring
headlines in online news journalism by conducting an analysis of
100,000 headlines from 10 different Danish news websites. The
results show that commercialization and tabloidization seem to
lead to a recurrent use of forward-reference in Danish online
news headlines. In addition, the article contributes to reference
theory by expanding previous models on phoricity to include
multimodal references on the web.",
journal = "J. Pragmat.",
volume = 76,
pages = "87--100",
month = "1~" # jan,
year = 2015,
keywords = "Online news headlines; Forward-reference; Cataphora; Discourse
deixis; Media commercialization; Tabloidization; Viralization"
}
@ARTICLE{Forbes2017-gw,
title = "Verb Physics: Relative Physical Knowledge of Actions and
Objects",
author = "Forbes, Maxwell and Choi, Yejin",
abstract = "Learning commonsense knowledge from natural language text is
nontrivial due to reporting bias: people rarely state the
obvious, e.g., ``My house is bigger than me.'' However,
while rarely stated explicitly, this trivial everyday
knowledge does influence the way people talk about the
world, which provides indirect clues to reason about the
world. For example, a statement like, ``Tyler entered his
house'' implies that his house is bigger than Tyler. In this
paper, we present an approach to infer relative physical
knowledge of actions and objects along five dimensions
(e.g., size, weight, and strength) from unstructured natural
language text. We frame knowledge acquisition as joint
inference over two closely related problems: learning (1)
relative physical knowledge of object pairs and (2) physical
implications of actions when applied to those object pairs.
Empirical results demonstrate that it is possible to extract
knowledge of actions and objects from language and that
joint inference over different types of knowledge improves
performance.",
month = "12~" # jun,
year = 2017,
archivePrefix = "arXiv",
primaryClass = "cs.CL",
eprint = "1706.03799"
}
@ARTICLE{noauthor_undated-ty,
title = "Multitask Learning for {Fine-Grained} Twitter Sentiment Analysis",
author = "Balikas, Georgios and Moura, Simon and Amini, Massih-Reza",
abstract = "Traditional sentiment analysis approaches tackle problems like
ternary (3-category) and fine-grained (5-category) classification
by learning the tasks separately. We argue that such
classification tasks are correlated and we propose a multitask
approach based on a recurrent neural network that benefits by
jointly learning them. Our study demonstrates the potential of
multitask models on this type of problems and improves the
state-of-the-art results in the fine-grained sentiment
classification problem.",
journal = "arXiv [cs.IR]",
month = "12~" # jul,
year = 2017
}
@ARTICLE{Evert_undated-mh,
title = "Bachelor of Science Thesis in Computer Science and Engineering",
author = "Evert, Alex and Genander, Jacob and Lallo, Nicklas and Lantz,
Rickard and Nilsson, Filip",
keywords = "Viralization"
}
@ARTICLE{Rony2017-sd,
title = "Diving Deep into Clickbaits: Who Use Them to What Extents in
Which Topics with What Effects?",
author = "Rony, Md Main Uddin and Hassan, Naeemul and Yousuf, Mohammad",
abstract = "The use of alluring headlines (clickbait) to tempt the
readers has become a growing practice nowadays. For the sake
of existence in the highly competitive media industry, most
of the on-line media including the mainstream ones, have
started following this practice. Although the wide-spread
practice of clickbait makes the reader's reliability on
media vulnerable, a large scale analysis to reveal this fact
is still absent. In this paper, we analyze 1.67 million
Facebook posts created by 153 media organizations to
understand the extent of clickbait practice, its impact and
user engagement by using our own developed clickbait
detection model. The model uses distributed sub-word
embeddings learned from a large corpus. The accuracy of the
model is 98.3\%. Powered with this model, we further study
the distribution of topics in clickbait and non-clickbait
contents.",
month = "28~" # mar,
year = 2017,
keywords = "Viralization",
archivePrefix = "arXiv",
primaryClass = "cs.SI",
eprint = "1703.09400"
}
@ARTICLE{Wei2017-qt,
title = "Learning to Identify Ambiguous and Misleading News Headlines",
author = "Wei, Wei and Wan, Xiaojun",
abstract = "Accuracy is one of the basic principles of journalism.
However, it is increasingly hard to manage due to the
diversity of news media. Some editors of online news tend to
use catchy headlines which trick readers into clicking.
These headlines are either ambiguous or misleading,
degrading the reading experience of the audience. Thus,
identifying inaccurate news headlines is a task worth
studying. Previous work names these headlines ``clickbaits''
and mainly focus on the features extracted from the
headlines, which limits the performance since the
consistency between headlines and news bodies is
underappreciated. In this paper, we clearly redefine the
problem and identify ambiguous and misleading headlines
separately. We utilize class sequential rules to exploit
structure information when detecting ambiguous headlines.
For the identification of misleading headlines, we extract
features based on the congruence between headlines and
bodies. To make use of the large unlabeled data set, we
apply a co-training method and gain an increase in
performance. The experiment results show the effectiveness
of our methods. Then we use our classifiers to detect
inaccurate headlines crawled from different sources and
conduct a data analysis.",
month = "17~" # may,
year = 2017,
keywords = "Viralization",
archivePrefix = "arXiv",
primaryClass = "cs.CL",
eprint = "1705.06031"
}
% The entry below contains non-ASCII chars that could not be converted
% to a LaTeX equivalent.
@INPROCEEDINGS{Anand2017-nf,
title = "We Used Neural Networks to Detect Clickbaits: You Won’t Believe
What Happened Next!",
booktitle = "Advances in Information Retrieval",
author = "Anand, Ankesh and Chakraborty, Tanmoy and Park, Noseong",
abstract = "Online content publishers often use catchy headlines for their
articles in order to attract users to their websites. These
headlines, popularly known as clickbaits, exploit a user’s
curiosity gap and lure them to click on links that often
disappoint them. Existing methods for automatically detecting
clickbaits rely on heavy feature engineering and domain
knowledge. Here, we introduce a neural network architecture
based on Recurrent Neural Networks for detecting clickbaits.
Our model relies on distributed word representations learned
from a large unannotated corpora, and character embeddings
learned via Convolutional Neural Networks. Experimental results
on a dataset of news headlines show that our model outperforms
existing techniques for clickbait detection with an accuracy of
0.98 with F1-score of 0.98 and ROC-AUC of 0.99.",
publisher = "Springer, Cham",
pages = "541--547",
month = "8~" # apr,
year = 2017,
keywords = "Viralization",
language = "en",
conference = "European Conference on Information Retrieval"
}
@ARTICLE{Sun2017-pr,
title = "Revisiting Unreasonable Effectiveness of Data in Deep
Learning Era",
author = "Sun, Chen and Shrivastava, Abhinav and Singh, Saurabh and
Gupta, Abhinav",
abstract = "The success of deep learning in vision can be attributed to:
(a) models with high capacity; (b) increased computational
power; and (c) availability of large-scale labeled data.
Since 2012, there have been significant advances in
representation capabilities of the models and computational
capabilities of GPUs. But the size of the biggest dataset
has surprisingly remained constant. What will happen if we
increase the dataset size by 10x or 100x? This paper takes a
step towards clearing the clouds of mystery surrounding the
relationship between `enormous data' and deep learning. By
exploiting the JFT-300M dataset which has more than 375M
noisy labels for 300M images, we investigate how the
performance of current vision tasks would change if this
data was used for representation learning. Our paper
delivers some surprising (and some expected) findings.
First, we find that the performance on vision tasks still
increases linearly with orders of magnitude of training data
size. Second, we show that representation learning (or
pre-training) still holds a lot of promise. One can improve
performance on any vision tasks by just training a better
base model. Finally, as expected, we present new
state-of-the-art results for different vision tasks
including image classification, object detection, semantic
segmentation and human pose estimation. Our sincere hope is
that this inspires vision community to not undervalue the
data and develop collective efforts in building larger
datasets.",
month = "10~" # jul,
year = 2017,
archivePrefix = "arXiv",
primaryClass = "cs.CV",
eprint = "1707.02968"
}
@ARTICLE{Costa2017-va,
title = "Automatic Generation of Natural Language Explanations",
author = "Costa, Felipe and Ouyang, Sixun and Dolog, Peter and Lawlor,
Aonghus",
abstract = "An important task for recommender system is to generate
explanations according to a user's preferences. Most of the
current methods for explainable recommendations use
structured sentences to provide descriptions along with the
recommendations they produce. However, those methods have
neglected the review-oriented way of writing a text, even
though it is known that these reviews have a strong
influence over user's decision. In this paper, we propose a
method for the automatic generation of natural language
explanations, for predicting how a user would write about an
item, based on user ratings from different items' features.
We design a character-level recurrent neural network (RNN)
model, which generates an item's review explanations using
long-short term memories (LSTM). The model generates text
reviews given a combination of the review and ratings score
that express opinions about different factors or aspects of
an item. Our network is trained on a sub-sample from the
large real-world dataset BeerAdvocate. Our empirical
evaluation using natural language processing metrics shows
the generated text's quality is close to a real user written
review, identifying negation, misspellings, and domain
specific vocabulary.",
month = "4~" # jul,
year = 2017,
archivePrefix = "arXiv",
primaryClass = "cs.CL",
eprint = "1707.01561"
}
@ARTICLE{noauthor_undated-qo,
title = "A causal framework for explaining the predictions of black-box
sequence-to-sequence models",
author = "Alvarez-Melis, David and Jaakkola, Tommi S",
abstract = "We interpret the predictions of any black-box structured
input-structured output model around a specific input-output
pair. Our method returns an ``explanation'' consisting of groups
of input-output tokens that are causally related. Our method
infers these dependencies by querying the model with perturbed
inputs, generating a graph over tokens from the responses, and
solving a partitioning problem to select the most relevant
components. We focus the general approach on sequence-to-sequence
problems, adopting a variational autoencoder to yield meaningful
input perturbations. We test our method across several NLP
sequence generation tasks.",
journal = "arXiv [cs.LG]",
month = "6~" # jul,
year = 2017
}
@ARTICLE{Ruder2016-hu,
title = "An overview of gradient descent optimization algorithms",
author = "Ruder, Sebastian",
abstract = "Gradient descent optimization algorithms, while increasingly
popular, are often used as black-box optimizers, as
practical explanations of their strengths and weaknesses are
hard to come by. This article aims to provide the reader
with intuitions with regard to the behaviour of different
algorithms that will allow her to put them to use. In the
course of this overview, we look at different variants of
gradient descent, summarize challenges, introduce the most
common optimization algorithms, review architectures in a
parallel and distributed setting, and investigate additional
strategies for optimizing gradient descent.",
month = "15~" # sep,
year = 2016,
archivePrefix = "arXiv",
primaryClass = "cs.LG",
eprint = "1609.04747"
}
@ARTICLE{Serra2017-rl,
title = "Getting deep recommenders fit: Bloom embeddings for sparse
binary input/output networks",
author = "Serr{\`a}, Joan and Karatzoglou, Alexandros",
abstract = "Recommendation algorithms that incorporate techniques from
deep learning are becoming increasingly popular. Due to the
structure of the data coming from recommendation domains
(i.e., one-hot-encoded vectors of item preferences), these
algorithms tend to have large input and output
dimensionalities that dominate their overall size. This
makes them difficult to train, due to the limited memory of
graphical processing units, and difficult to deploy on
mobile devices with limited hardware. To address these
difficulties, we propose Bloom embeddings, a compression
technique that can be applied to the input and output of
neural network models dealing with sparse high-dimensional
binary-coded instances. Bloom embeddings are computationally
efficient, and do not seriously compromise the accuracy of
the model up to 1/5 compression ratios. In some cases, they
even improve over the original accuracy, with relative
increases up to 12\%. We evaluate Bloom embeddings on 7 data
sets and compare it against 4 alternative methods, obtaining
favorable results. We also discuss a number of further
advantages of Bloom embeddings, such as 'on-the-fly'
constant-time operation, zero or marginal space
requirements, training time speedups, or the fact that they
do not require any change to the core model architecture or
training configuration.",
month = "13~" # jun,
year = 2017,
archivePrefix = "arXiv",
primaryClass = "cs.LG",
eprint = "1706.03993"
}
@ARTICLE{Shwartz-Ziv2017-jx,
title = "Opening the Black Box of Deep Neural Networks via
Information",
author = "Shwartz-Ziv, Ravid and Tishby, Naftali",
abstract = "Despite their great success, there is still no comprehensive
theoretical understanding of learning with Deep Neural
Networks (DNNs) or their inner organization. Previous work
proposed to analyze DNNs in the
\textit{Information Plane}; i.e., the plane
of the Mutual Information values that each layer preserves
on the input and output variables. They suggested that the
goal of the network is to optimize the Information
Bottleneck (IB) tradeoff between compression and prediction,
successively, for each layer. In this work we follow up on
this idea and demonstrate the effectiveness of the
Information-Plane visualization of DNNs. Our main results
are: (i) most of the training epochs in standard DL are
spent on \emph{compression} of the input to
efficient representation and not on fitting the training
labels. (ii) The representation compression phase begins
when the training error becomes small and the Stochastic
Gradient Descent (SGD) epochs change from a fast drift to
smaller training error into a stochastic relaxation, or
random diffusion, constrained by the training error value.
(iii) The converged layers lie on or very close to the
Information Bottleneck (IB) theoretical bound, and the maps
from the input to any hidden layer and from this hidden
layer to the output satisfy the IB self-consistent
equations. This generalization through noise mechanism is
unique to Deep Neural Networks and absent in one layer
networks. (iv) The training time is dramatically reduced
when adding more hidden layers. Thus the main advantage of
the hidden layers is computational. This can be explained by
the reduced relaxation time, as it scales
super-linearly (exponentially for simple diffusion) with the
information compression from the previous layer.",
month = "2~" # mar,
year = 2017,
archivePrefix = "arXiv",
primaryClass = "cs.LG",
eprint = "1703.00810"
}
@ARTICLE{Fortunato2017-ij,
title = "Noisy Networks for Exploration",
author = "Fortunato, Meire and Azar, Mohammad Gheshlaghi and Piot,
Bilal and Menick, Jacob and Osband, Ian and Graves, Alex and
Mnih, Vlad and Munos, Remi and Hassabis, Demis and Pietquin,
Olivier and Blundell, Charles and Legg, Shane",
abstract = "We introduce NoisyNet, a deep reinforcement learning agent
with parametric noise added to its weights, and show that
the induced stochasticity of the agent's policy can be used
to aid efficient exploration. The parameters of the noise
are learned with gradient descent along with the remaining
network weights. NoisyNet is straightforward to implement
and adds little computational overhead. We find that
replacing the conventional exploration heuristics for A3C,
DQN and dueling agents (entropy reward and $\epsilon$-greedy
respectively) with NoisyNet yields substantially higher
scores for a wide range of Atari games, in some cases
advancing the agent from sub to super-human performance.",
month = "30~" # jun,
year = 2017,
archivePrefix = "arXiv",
primaryClass = "cs.LG",
eprint = "1706.10295"
}
@UNPUBLISHED{Beaulieu-Jones2017-qt,
title = "Privacy-preserving generative deep neural networks support
clinical data sharing",
author = "Beaulieu-Jones, Brett K and Wu, Zhiwei Steven and Williams, Chris
and Greene, Casey S",
abstract = "Though it is widely recognized that data sharing enables faster
scientific progress, the sensible need to protect participant
privacy hampers this practice in medicine. We train deep neural
networks that generate synthetic subjects closely resembling
study participants. Using the SPRINT trial as an example, we show
that machine-learning models built from simulated participants
generalize to the original dataset. We incorporate differential
privacy, which offers strong guarantees on the likelihood that a
subject could be identified as a member of the trial.
Investigators who have compiled a dataset can use our method to
provide a freely accessible public version that enables other
scientists to perform discovery-oriented analyses. Generated data
can be released alongside analytical code to enable fully
reproducible workflows, even when privacy is a concern. By
addressing data sharing challenges, deep neural networks can
facilitate the rigorous and reproducible investigation of
clinical datasets.",
journal = "bioRxiv",
pages = "159756",
month = "5~" # jul,
year = 2017,
language = "en"
}
@ARTICLE{Lerer2017-ee,
title = "Maintaining cooperation in complex social dilemmas using
deep reinforcement learning",
author = "Lerer, Adam and Peysakhovich, Alexander",
abstract = "In social dilemmas individuals face a temptation to increase
their payoffs in the short run at a cost to the long run
total welfare. Much is known about how cooperation can be
stabilized in the simplest of such settings: repeated
Prisoner's Dilemma games. However, there is relatively
little work on generalizing these insights to more complex
situations. We start to fill this gap by showing how to use
modern reinforcement learning methods to generalize a highly
successful Prisoner's Dilemma strategy: tit-for-tat. We
construct artificial agents that act in ways that are simple
to understand, nice (begin by cooperating), provokable (try
to avoid being exploited), and forgiving (following a bad
turn try to return to mutual cooperation). We show both
theoretically and experimentally that generalized
tit-for-tat agents can maintain cooperation in more complex
environments. In contrast, we show that employing purely
reactive training techniques can lead to agents whose
behavior results in socially inefficient outcomes.",
month = "4~" # jul,
year = 2017,
archivePrefix = "arXiv",
primaryClass = "cs.AI",
eprint = "1707.01068"
}
@ARTICLE{Carreira-Perpinan2017-if,
title = "Model compression as constrained optimization, with
application to neural nets. Part I: general framework",
author = "Carreira-Perpi{\~n}{\'a}n, Miguel {\'A}",
abstract = "Compressing neural nets is an active research problem, given
the large size of state-of-the-art nets for tasks such as
object recognition, and the computational limits imposed by
mobile devices. We give a general formulation of model
compression as constrained optimization. This includes many
types of compression: quantization, low-rank decomposition,
pruning, lossless compression and others. Then, we give a
general algorithm to optimize this nonconvex problem based
on the augmented Lagrangian and alternating optimization.