<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:sy="http://purl.org/rss/1.0/modules/syndication/" xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>DSpace Collection: CeDER Working Papers</title>
    <link>http://hdl.handle.net/2451/14813</link>
    <description />
    <items>
      <rdf:Seq>
        <rdf:li resource="http://hdl.handle.net/2451/14128" />
        <rdf:li resource="http://hdl.handle.net/2451/27629" />
        <rdf:li resource="http://hdl.handle.net/2451/14812" />
        <rdf:li resource="http://hdl.handle.net/2451/28533" />
        <rdf:li resource="http://hdl.handle.net/2451/28313" />
        <rdf:li resource="http://hdl.handle.net/2451/14121" />
        <rdf:li resource="http://hdl.handle.net/2451/14100" />
        <rdf:li resource="http://hdl.handle.net/2451/14757" />
        <rdf:li resource="http://hdl.handle.net/2451/14102" />
        <rdf:li resource="http://hdl.handle.net/2451/14151" />
        <rdf:li resource="http://hdl.handle.net/2451/14112" />
        <rdf:li resource="http://hdl.handle.net/2451/27735" />
        <rdf:li resource="http://hdl.handle.net/2451/23407" />
        <rdf:li resource="http://hdl.handle.net/2451/14117" />
        <rdf:li resource="http://hdl.handle.net/2451/14105" />
        <rdf:li resource="http://hdl.handle.net/2451/14109" />
        <rdf:li resource="http://hdl.handle.net/2451/14107" />
        <rdf:li resource="http://hdl.handle.net/2451/29799" />
        <rdf:li resource="http://hdl.handle.net/2451/28302" />
        <rdf:li resource="http://hdl.handle.net/2451/14115" />
        <rdf:li resource="http://hdl.handle.net/2451/31253" />
        <rdf:li resource="http://hdl.handle.net/2451/14133" />
        <rdf:li resource="http://hdl.handle.net/2451/14748" />
        <rdf:li resource="http://hdl.handle.net/2451/28304" />
        <rdf:li resource="http://hdl.handle.net/2451/14103" />
        <rdf:li resource="http://hdl.handle.net/2451/14761" />
        <rdf:li resource="http://hdl.handle.net/2451/14106" />
        <rdf:li resource="http://hdl.handle.net/2451/29922" />
        <rdf:li resource="http://hdl.handle.net/2451/14108" />
        <rdf:li resource="http://hdl.handle.net/2451/14119" />
        <rdf:li resource="http://hdl.handle.net/2451/27716" />
        <rdf:li resource="http://hdl.handle.net/2451/29885" />
        <rdf:li resource="http://hdl.handle.net/2451/14801" />
        <rdf:li resource="http://hdl.handle.net/2451/14148" />
        <rdf:li resource="http://hdl.handle.net/2451/14098" />
        <rdf:li resource="http://hdl.handle.net/2451/28084" />
        <rdf:li resource="http://hdl.handle.net/2451/28089" />
        <rdf:li resource="http://hdl.handle.net/2451/23402" />
        <rdf:li resource="http://hdl.handle.net/2451/14124" />
        <rdf:li resource="http://hdl.handle.net/2451/14099" />
        <rdf:li resource="http://hdl.handle.net/2451/14811" />
        <rdf:li resource="http://hdl.handle.net/2451/14101" />
        <rdf:li resource="http://hdl.handle.net/2451/25882" />
        <rdf:li resource="http://hdl.handle.net/2451/14872" />
        <rdf:li resource="http://hdl.handle.net/2451/31279" />
        <rdf:li resource="http://hdl.handle.net/2451/14111" />
        <rdf:li resource="http://hdl.handle.net/2451/29918" />
        <rdf:li resource="http://hdl.handle.net/2451/14114" />
        <rdf:li resource="http://hdl.handle.net/2451/27680" />
        <rdf:li resource="http://hdl.handle.net/2451/14113" />
        <rdf:li resource="http://hdl.handle.net/2451/29649" />
        <rdf:li resource="http://hdl.handle.net/2451/14104" />
        <rdf:li resource="http://hdl.handle.net/2451/14758" />
        <rdf:li resource="http://hdl.handle.net/2451/14760" />
        <rdf:li resource="http://hdl.handle.net/2451/14808" />
        <rdf:li resource="http://hdl.handle.net/2451/23783" />
        <rdf:li resource="http://hdl.handle.net/2451/29941" />
        <rdf:li resource="http://hdl.handle.net/2451/23604" />
        <rdf:li resource="http://hdl.handle.net/2451/29585" />
        <rdf:li resource="http://hdl.handle.net/2451/31553" />
        <rdf:li resource="http://hdl.handle.net/2451/14116" />
        <rdf:li resource="http://hdl.handle.net/2451/28092" />
        <rdf:li resource="http://hdl.handle.net/2451/31303" />
        <rdf:li resource="http://hdl.handle.net/2451/30284" />
        <rdf:li resource="http://hdl.handle.net/2451/15026" />
        <rdf:li resource="http://hdl.handle.net/2451/14150" />
        <rdf:li resource="http://hdl.handle.net/2451/14123" />
        <rdf:li resource="http://hdl.handle.net/2451/14759" />
        <rdf:li resource="http://hdl.handle.net/2451/14122" />
        <rdf:li resource="http://hdl.handle.net/2451/14110" />
        <rdf:li resource="http://hdl.handle.net/2451/14127" />
        <rdf:li resource="http://hdl.handle.net/2451/28065" />
        <rdf:li resource="http://hdl.handle.net/2451/14152" />
        <rdf:li resource="http://hdl.handle.net/2451/14749" />
        <rdf:li resource="http://hdl.handle.net/2451/29801" />
        <rdf:li resource="http://hdl.handle.net/2451/14120" />
        <rdf:li resource="http://hdl.handle.net/2451/14118" />
        <rdf:li resource="http://hdl.handle.net/2451/14810" />
        <rdf:li resource="http://hdl.handle.net/2451/25886" />
        <rdf:li resource="http://hdl.handle.net/2451/14809" />
      </rdf:Seq>
    </items>
  </channel>
  <image>
    <title>The Channel Image</title>
    <url>http://archive.nyu.edu/retrieve/28175</url>
    <link>http://hdl.handle.net/2451/14813</link>
  </image>
  <textInput>
    <title>The Collection's search engine</title>
    <description>Search the Channel</description>
    <name>search</name>
    <link>http://archive.nyu.edu/simple-search</link>
  </textInput>
  <item rdf:about="http://hdl.handle.net/2451/14128">
    <title>Viral Marketing: Identifying Likely Adopters Via Consumer Networks</title>
    <link>http://hdl.handle.net/2451/14128</link>
    <description>Title: Viral Marketing: Identifying Likely Adopters Via Consumer Networks&lt;br/&gt;&lt;br/&gt;Hill, Shawndra; Provost, Foster; Volinsky, Chris&lt;br/&gt;&lt;br/&gt;Abstract: We investigate the hypothesis: those consumers who have communicated with a customer of a particular service have increased likelihood of adopting the service. We survey the diverse literature on such &amp;quot;viral marketing,&amp;quot; providing a categorization of the specific research questions asked, the data analyzed, and the statistical methods used. We highlight a striking gap in the literature: no prior study has had both of the two key types of data necessary to provide direct support for the hypothesis: data on communications between consumers, and data on product adoption. We suggest a type of service for which both types of data are available &amp;ndash; telecommunications services. Then, for a particular telecommunication service, we show support for the hypothesis. Specifically, we show three main results. 1) there is such a &amp;quot;viral&amp;quot; effect and it is statistically significant, resulting in take rates 3-5 times greater than a baseline group; 2) attributes constructed from the consumer network can improve models for ranking of targeted customers by likelihood of adoption, and 3) observing the network allows the firm to target new customers that would have fallen through the cracks, because they would not have been identified based solely on the traditional set of attributes used for marketing by the firm. We close with a discussion of challenges and opportunities for research in this area. For example, can one determine whether the reason for the viral effect is customer advocacy (e.g., via &amp;quot;word of mouth&amp;quot;) versus network-identified homophily?</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/27629">
    <title>Understanding, Estimating, and Incorporating Output Quality Into Join Algorithms For Information Extraction</title>
    <link>http://hdl.handle.net/2451/27629</link>
    <description>Title: Understanding, Estimating, and Incorporating Output Quality Into Join Algorithms For Information Extraction&lt;br/&gt;&lt;br/&gt;Jain, Alpa; Ipeirotis, Panagiotis G.; Gravano, Luis; Doan, Anhai&lt;br/&gt;&lt;br/&gt;Abstract: Information extraction (IE) systems are trained to extract specific relations from text databases. Real-world applications often require that the output of multiple IE systems be joined to produce the data of interest. To optimize the execution of a join of multiple extracted relations, it is not sufficient to consider only execution time. In fact, the quality of the join output is of critical importance: unlike in the relational world, different join execution plans can produce join results of widely different quality whenever IE systems are involved. In this paper, we develop a principled approach to understand, estimate, and incorporate output quality into the join optimization process over extracted relations. We argue that the output quality is affected by (a) the configuration of the IE systems used to process the documents, (b) the document retrieval strategies used to retrieve documents, and (c) the actual join algorithm used. Our analysis considers a variety of join algorithms from relational query optimization, and predicts the output quality &amp;ndash;and, of course, the execution time&amp;ndash; of the alternate execution plans. We establish the accuracy of our analytical models, as well as study the effectiveness of a quality-aware join optimizer, with a large-scale experimental evaluation over real-world text collections and state-of-the-art IE systems.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14812">
    <title>Towards a Query Optimizer for Text-Centric Tasks</title>
    <link>http://hdl.handle.net/2451/14812</link>
    <description>Title: Towards a Query Optimizer for Text-Centric Tasks&lt;br/&gt;&lt;br/&gt;Ipeirotis, Panagiotis G.; Agichtein, Eugene; Jain, Pranay; Gravano, Luis&lt;br/&gt;&lt;br/&gt;Abstract: Text is ubiquitous and, not surprisingly, many important applications rely on textual data for a variety of tasks. As a notable example, information extraction applications derive structured relations from unstructured text; as another example, focused crawlers explore the web to locate pages about specific topics. Execution plans for text-centric tasks follow two general paradigms for processing a text database: either we can scan, or &amp;quot;crawl,&amp;quot; the text database or, alternatively, we can exploit search engine indexes and retrieve the documents of interest via carefully crafted queries constructed in task-specific ways. The choice between crawl- and query-based execution plans can have a substantial impact on both execution time and output &amp;quot;completeness&amp;quot; (e.g., in terms of recall). Nevertheless, this choice is typically ad-hoc and based on heuristics or plain intuition. In this article, we present fundamental building blocks to make the choice of execution plans for text-centric tasks in an informed, cost-based way. Towards this goal, we show how to analyze query- and crawl-based plans in terms of both execution time and output completeness. We adapt results from random-graph theory and statistics to develop a rigorous cost model for the execution plans. Our cost model reflects the fact that the performance of the plans depends on fundamental task-specific properties of the underlying text databases. We identify these properties and present efficient techniques for estimating the associated parameters of the cost model. We also present two optimization approaches for text-centric tasks that rely on the cost-model parameters and select efficient execution plans.
Overall, our optimization approaches help build efficient execution plans for a task, resulting in significant efficiency and output completeness benefits. We complement our results with a large-scale experimental evaluation for three important text-centric tasks and over multiple real-life data sets.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/28533">
    <title>Toward optimal allocation of human resources for active learning with application to safe advertising</title>
    <link>http://hdl.handle.net/2451/28533</link>
    <description>Title: Toward optimal allocation of human resources for active learning with application to safe advertising&lt;br/&gt;&lt;br/&gt;Attenberg, Josh; Provost, Foster</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/28313">
    <title>The Gestalt in Graphs: Prediction Using Economic Networks</title>
    <link>http://hdl.handle.net/2451/28313</link>
    <description>Title: The Gestalt in Graphs: Prediction Using Economic Networks&lt;br/&gt;&lt;br/&gt;Dhar, Vasant; Oestreicher-Singer, Gal; Sundararajan, Arun; Umyarov, Akhmed&lt;br/&gt;&lt;br/&gt;Abstract: We define an economic network as a linked set of entities, where links are created by actual realizations of shared economic outcomes between entities. Such networks are becoming increasingly prevalent on the Internet, an example being the copurchase network on Amazon where entities are books and links designate which pairs were purchased simultaneously. Our dataset covers a diverse set of books spanning over 400 categories over a period of three years with a total of over 70 million observations. To our knowledge, this is the first large scale study showing that an economic network contains useful predictive information that is distributed in the network. We show that an economic network contains predictive information. Specifically, we demonstrate that an entity&amp;rsquo;s future demand is more accurately predicted by combining its historical demand with that of its neighbors than by considering its demand alone. In other words, if you want to know what your state will be in the future, consider what is happening to your neighbors now. This result could apply to other economic networks where outcomes of sets of entities tend to be related.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14121">
    <title>The Emergence of Boundary Spanning Competence in Practice: Implications for Information Systems' Implementation Use</title>
    <link>http://hdl.handle.net/2451/14121</link>
    <description>Title: The Emergence of Boundary Spanning Competence in Practice: Implications for Information Systems' Implementation Use&lt;br/&gt;&lt;br/&gt;Levina, Natalia; Vaaste, Emmanuelle&lt;br/&gt;&lt;br/&gt;Abstract: Knowledge Management (KM) literature has centrally focused on organization's ability to build practices that integrate diverse expertise across professional, organizational, industry and other boundaries. In this paper we investigate how an organizational competence in boundary spanning emerges in practice. We draw on the concepts of boundary spanner and boundary object and on the practice-based view of KM in organizations to understand the emergence of boundary spanning in practice, which we define as relating practices from diverse fields. We contrast data from two qualitative, longitudinal field studies to draw our conclusions. We argue that for boundary spanning to emerge in practice a new joint field, which unites agent in a common pursuit, needs to be produced. Engagement of agents in this practice partially transforms their practices in local fields so as to accommodate the interests of their counterparts. Those agents who engage in negotiating the nature of this new field become boundary spanners-in-practice. Through their engagements in the new joint field and diverse local practices boundary spanners-in-practice produce and use objects which become locally useful and acquire a joint identity through their use &amp;ndash; boundary objects-in-use. Through data analysis we find, first, that nominated boundary spanners and designated boundary objects do not always become boundary spanners-in-practice and boundary objects-in-use. Second, we outline the conditions necessary for boundary spanners-in-practice to emerge, including the need for them to become legitimate, albeit peripheral, participants in the practices of the fields that they span.
Thirdly, we show how boundary spanners-in-practice use their symbolic, cultural, social, and economic resources (capital) to build the new joint field. Finally, we examine the tensions involved in a) the nomination of agents as boundary spanners and artifacts as boundary objects; b) the growth of the new joint field; c) agents&amp;rsquo; choice in investing in the new joint field; and d) spanning one at the expense of another kind of boundary. We conclude by drawing implications for IS implementation and use.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14100">
    <title>The Economic Incentives for Sharing Security Information</title>
    <link>http://hdl.handle.net/2451/14100</link>
    <description>Title: The Economic Incentives for Sharing Security Information&lt;br/&gt;&lt;br/&gt;Gal-Or, Esther; Ghose, Anindya&lt;br/&gt;&lt;br/&gt;Abstract: Given that Information Technology (IT) security has emerged as an important issue in the last few years, the subject of security information sharing among firms, as a tool to minimize security breaches, has gained the interest of practitioners and academics. To promote the disclosure and sharing of cyber-security information among firms, the US federal government has encouraged the establishment of many industry based Information Sharing &amp;amp; Analysis Centers (ISACs) under Presidential Decision Directive 63. Sharing security vulnerabilities and technological solutions related to methods for preventing, detecting and correcting security breaches, is the fundamental goal of the ISACs. However, there are a number of interesting economic issues that will affect the achievement of this goal. Using game theory, we develop an analytical framework to investigate the competitive implications of sharing security information and investments in security technologies. We find that security technology investments and security information sharing act as &amp;ldquo;strategic complements&amp;rdquo; in equilibrium. Our results suggest that information sharing is more valuable when product substitutability is higher, implying that such sharing alliances yield greater benefits in more competitive industries. We also highlight that the benefits from such information sharing alliances increase with the size of the firm. We compare the levels of information sharing and technology investments obtained when firms behave independently (Bertrand-Nash) to those selected by an ISAC which maximizes social welfare or joint industry profits.
Our results help us predict the consequences of establishing organizations such as ISACs, CERT or InfraGard by the federal government.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14757">
    <title>The Dimensions of Reputation in Electronic Markets</title>
    <link>http://hdl.handle.net/2451/14757</link>
    <description>Title: The Dimensions of Reputation in Electronic Markets&lt;br/&gt;&lt;br/&gt;Ghose, Anindya; Ipeirotis, Panagiotis G.; Sundararajan, Arun&lt;br/&gt;&lt;br/&gt;Abstract: We present a framework for identifying the different dimensions of online reputation and characterizing their influence on the pricing power of sellers. Our theory predicts that sellers with better recorded online reputation can successfully charge higher prices than competing sellers of identical products, and that their pricing power increases with their recorded level of experience. We develop and implement a new text mining technique that identifies and quantitatively assesses dimensions of importance in reputation profiles, and use this technique to create a new data set containing detailed reputation profiles and prices for sellers in over 9,500 transactions for consumer software on Amazon.com's online secondary marketplace. The estimation of a set of econometric models on this data set validates the predictions of our theory, and further, ranks these dimensions of reputation based on their effect on measured seller value, identifying those that have the most significant impact on reputation. This paper is the first study that integrates econometric and text mining techniques toward a more complete analysis of the information captured by reputation systems, and it presents new evidence of the importance of their effective and judicious design.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14102">
    <title>Strategic Impact of Internet Referral Services on Channel Profits</title>
    <link>http://hdl.handle.net/2451/14102</link>
    <description>Title: Strategic Impact of Internet Referral Services on Channel Profits&lt;br/&gt;&lt;br/&gt;Ghose, Anindya; Mukhopadhyay, Tridas; Rajan, Uday&lt;br/&gt;&lt;br/&gt;Abstract: Internet Referral Services, hosted either by independent third-party infomediaries or by manufacturers serve as &amp;ldquo;lead-generators&amp;rdquo; in electronic marketplaces, directing consumer traffic to particular retailers. In a model of price dispersion with mixed strategy equilibria, we investigate the competitive implications of these institutions on retailer and manufacturer pricing strategies as well as their impact on channel structures and distribution of profits. Offline, retailers face a higher customer acquisition cost. In return, they can engage in price discrimination. Online, they save on the acquisition costs, but lose the ability to price discriminate. This critical tradeoff drives firms&amp;rsquo; equilibrium strategies. The establishment of a referral service is a strategic decision by the manufacturer, in response to a third-party infomediary. It leads to an increase in channel profits and a reallocation of the increased surplus to the manufacturer, via the franchise fees. Further, it enables the manufacturer to respond to an infomediary, by giving itself a wider leeway to set the unit wholesale fee to the profit maximizing level. We discuss implications of referral services on channel coordination issues, and whether a two part tariff can be successfully used to maximize channel profits. Contrary to prior literature, we find that when retailers can price discriminate among consumers, the manufacturer may not set the wholesale price to marginal cost to coordinate the channel.
Consistent with anecdotal evidence, our model predicts that while it is optimal for an infomediary to enroll only one retailer, it is optimal for a manufacturer to enroll both retailers. Finally, our results show that under some circumstances, the manufacturer even benefits from the presence of the competing referral infomediary and hence, will not want to eliminate it.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14151">
    <title>Strategic Analysis of Petty Corruption:  Entrepreneurs and Bureaucrats</title>
    <link>http://hdl.handle.net/2451/14151</link>
    <description>Title: Strategic Analysis of Petty Corruption:  Entrepreneurs and Bureaucrats&lt;br/&gt;&lt;br/&gt;Lambert-Mogiliansky, Ariane; Majumdar, Mukul; Radner, Roy&lt;br/&gt;&lt;br/&gt;Abstract: This paper develops a game-theoretic model of &amp;quot;petty corruption&amp;quot; by government officials. Such corruption is widespread, especially (but not only) in developing and transition economies. The model goes beyond the previously published studies in the way it describes the structure of bureaucratic &amp;quot;tracks,&amp;quot; and the information among the participants. Entrepreneurs apply, in sequence, to a &amp;quot;track&amp;quot; of two or more bureaucrats in a prescribed order for approval of their projects. Our first result establishes that in a one-shot situation no project ever gets approved. This result leads us to consider a repeated interaction setting. In that context we characterize in more detail the trigger-strategy equilibria that minimize the social loss due to the system of bribes, and those that maximize the expected total bribe income of the bureaucrats. The results are used to shed some light on two much advocated anti-corruption policies: the single window policy and rotation of bureaucrats.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14112">
    <title>Software Versioning and Quality Degradation?  An Exploratory Study of the Evidence</title>
    <link>http://hdl.handle.net/2451/14112</link>
    <description>Title: Software Versioning and Quality Degradation?  An Exploratory Study of the Evidence&lt;br/&gt;&lt;br/&gt;Ghose, Anindya; Sundararajan, Arun&lt;br/&gt;&lt;br/&gt;Abstract: We present a framework for measuring software quality using pricing and demand data, and empirical estimates that quantify the extent of quality degradation associated with software versioning. Using a 7-month, 108-product panel of software sales from Amazon.com, we document the extent to which quality varies across different software versions, estimating quality degradation that ranges from as little as 8% to as much as 56% below that of the corresponding flagship version. Consistent with prescriptions from the theory of vertical differentiation, we also find that an increase in the total number of versions is associated with an increase in the difference in quality between the highest and lowest quality versions, and a decrease in the quality difference between &amp;quot;neighboring&amp;quot; versions. We compare our estimates with those derived from two sets of subjective measures of quality, based on CNET editorial ratings and Amazon.com user reviews, and discuss competing interpretations of the significant differences that emerge from this comparison. As the first empirical study of software versioning that is based on both subjective and econometrically estimated measures of quality, this paper provides a framework for testing a wide variety of results in IS that are based on related models of vertical differentiation, and its findings have important implications for studies that treat web-based user ratings as cardinal data.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/27735">
    <title>Social Network Collaborative Filtering</title>
    <link>http://hdl.handle.net/2451/27735</link>
    <description>Title: Social Network Collaborative Filtering&lt;br/&gt;&lt;br/&gt;Zheng, Rong; Wilkinson, Dennis; Provost, Foster&lt;br/&gt;&lt;br/&gt;Abstract: This paper demonstrates that &amp;quot;social network collaborative filtering&amp;quot; (SNCF), wherein user-selected like-minded alters are used to make predictions, can rival traditional user-to-user collaborative filtering (CF) in predictive accuracy. Using a unique data set from an online community where users rated items and also created social networking links specifically intended to represent like-minded &amp;ldquo;allies,&amp;rdquo; we use SNCF and traditional CF to predict ratings by networked users. We find that SNCF using generic &amp;quot;friend&amp;quot; alters is moderately worse than the better CF techniques, but outperforms benchmarks such as by-item or by-user average rating; generic friends often are not like-minded. However, SNCF using &amp;quot;ally&amp;quot; alters is competitive with CF. These results are significant because SNCF is tremendously more computationally efficient than traditional user-user CF and may be implemented in large-scale web commerce and social networking communities. It is notoriously difficult to distinguish the contributions of social influence (where allies influence users) and &amp;quot;social&amp;quot; selection (where users are simply effective at selecting like-minded people as their allies). Nonetheless, comparing similarity over time, we do show no evidence of strong social influence among allies or friends.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/23407">
    <title>Social Network Collaborative Filtering</title>
    <link>http://hdl.handle.net/2451/23407</link>
    <description>Title: Social Network Collaborative Filtering&lt;br/&gt;&lt;br/&gt;Zheng, Rong; Provost, Foster; Ghose, Anindya&lt;br/&gt;&lt;br/&gt;Abstract: This paper reports on a preliminary empirical study comparing methods for collaborative filtering (CF) using explicit data on consumers&amp;rsquo; social networks. To our knowledge it is the first study to carefully evaluate the potential of explicit, publicly represented social networks for making product recommendations. Understanding social-network CF is important because traditional CF over a large consumer base is tremendously expensive computationally. An often-ignored aspect of CF is the selection of the set of users from which to make recommendations. Social theory tells us that social relationships are likely to connect similar people. If this similarity is in line with the recommendation task, they may provide a small, dense set of &amp;ldquo;recommenders&amp;rdquo; for CF. We examine a unique dataset from Amazon.com that contains a social network of consumer-selected friends. We examine two ways to incorporate social-network information into CF: using the social network to restrict the set of recommenders selected, and (further) using proximity in the social network to modify the traditional CF calculation. The results show that CF with social-network members selected as recommenders can be remarkably superior as compared to collaborative filtering with the recommenders not socially connected. Once the social network is selected, social network proximity does not seem to improve recommendations.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14117">
    <title>Simple Models and Classification in Networked Data</title>
    <link>http://hdl.handle.net/2451/14117</link>
    <description>Title: Simple Models and Classification in Networked Data&lt;br/&gt;&lt;br/&gt;Macskassy, Sofus; Provost, Foster&lt;br/&gt;&lt;br/&gt;Abstract: When entities are linked by explicit relations, classification methods that take advantage of the network can perform substantially better than methods that ignore the network. This paper argues that studies of relational classification in networked data should include simple network-only methods as baselines for comparison, in addition to the non-relational baselines that generally are used. In particular, comparing more complex algorithms with algorithms that only consider the network (and not the features of the entities) allows one to factor out the contribution of the network structure itself to the predictive power of the model. We examine several simple methods for network-only classification on previously used relational data sets, and show that they can perform remarkably well. The results demonstrate that the inclusion of network-only classifiers can shed new light on studies of relational learners.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14105">
    <title>Significance Testing Against the Random Model for Scoring Models on Top k Predictions</title>
    <link>http://hdl.handle.net/2451/14105</link>
    <description>Title: Significance Testing Against the Random Model for Scoring Models on Top k Predictions&lt;br/&gt;&lt;br/&gt;Macskassy, Sofus&lt;br/&gt;&lt;br/&gt;Abstract: Performance at top k predictions, where instances are ranked by a (learned) scoring model, has been used as an evaluation metric in machine learning for various reasons such as where the entire corpus is unknown (e.g., the web) or where the results are to be used by a person with limited time or resources (e.g., ranking financial news stories where the investor only has time to look at relatively few stories per day). This evaluation metric is primarily used to report whether the performance of a given method is significantly better than other (baseline) methods. It has not, however, been used to show whether the result is significant when compared to the simplest of baselines &amp;ndash; the random model. If no models outperform the random model at a given confidence interval, then the results may not be worth reporting. This paper introduces a technique to perform an analysis of the expected performance of the top k predictions from the random model given k and a p-value on an evaluation dataset D. The technique is based on the realization that the distribution of the number of positives seen in the top k predictions follows a hypergeometric distribution, which has well-defined statistical density functions. As this distribution is discrete, we show that using parametric estimations based on a binomial distribution are almost always in complete agreement with the discrete distribution and that, if they differ, an interpolation of the discrete bounds gets very close to the parametric estimations.
The technique is demonstrated on results from three prior published works, in which it clearly shows that even though performance is greatly increased (sometimes over 100%) with respect to the expected performance of the random model (at p = 0.5), these results, although qualitatively impressive, are not always as significant (p = 0.1) as might be suggested by the impressive qualitative improvements. The technique is used to show, given k, both how many positive instances are needed to achieve a specific significance threshold as well as how significant a given top k performance is. The technique when used in a more global setting is able to identify the crossover points, with respect to k, when a method becomes significant for a given p. Lastly, the technique is used to generate a complete confidence curve, which shows a general trend over all k and visually shows where a method is significantly better than the random model over all values of k.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14109">
    <title>RQL: A Query Language For Recommender Systems</title>
    <link>http://hdl.handle.net/2451/14109</link>
    <description>Title: RQL: A Query Language For Recommender Systems&lt;br/&gt;&lt;br/&gt;Adomavicius, Gediminas; Tuzhilin, Alexander; Zheng, Rong&lt;br/&gt;&lt;br/&gt;Abstract: Initially popularized by Amazon.com, recommendation technologies havebecome widespread over the past several years, both in the industry andacademia. The traditional two-dimensional approach to recommendersystems, involving the dimensions of Users and Items, has beensubsequently extended to the multidimensional approach supportingadditional contextual dimensions and OLAP-type aggregation capabilities.Furthermore, the class of all possible recommendations available to theusers in traditional recommender systems is typically determined by thevendor and is quite limited. In this paper we address this limitation byproposing a query language RQL that allows the users to formulatevarious types of recommendation requests on their own. RQL adapts OLAPqueries to the domain of recommender systems and, therefore, is able tosupport both the traditional two-dimensional and the more complexmultidimensional recommender systems. The paper also presents arecommendation algebra that allows mapping RQL queries into thealgebraic expressions for the query processing purposes. Finally, thepaper presents a method for executing RQL queries.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14107">
    <title>ROC Confidence Bands: An Empirical Study</title>
    <link>http://hdl.handle.net/2451/14107</link>
    <description>Title: ROC Confidence Bands: An Empirical Study&lt;br/&gt;&lt;br/&gt;Macskassy, Sofus; Provost, Foster; Rosset, Saharon&lt;br/&gt;&lt;br/&gt;Abstract: This paper is about constructing confidence bands around an ROC curve such that (1 - \delta)% of the ROC curves traced by data sets of size r will fall completely within the bands. We introduce to the machine learning community three methods from the medical field that are applicable to generate such bands. We then evaluate these methods on the simple case of &amp;ldquo;binormal&amp;rdquo; distributions&amp;mdash;the scores for positive and the score for negative instances are drawn from two normal distributions. We show that none of the methods generate appropriate bands and investigate two types of variances problems. We show that widening the bands does not produce the proper bandwidths but that fitting a normal distribution to the observed drawn samples and drawing samples from this distribution (parametric bootstrap) does generate bands that are much closer to the desired coverage although still not perfect. We tested the original methods as well as parametric bootstrap on the covertype data set from the UCI ML-repository. The original methods perform the same as in the synthetic case, whereas the parametric bootstrap technique did not yield the expected results. This is primarily due to not being able to generate a good fit for the score distributions. Whether it is possible to fit well-behaving parametric distribution to learned models is an open question we leave to the machine learning community to answer.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/29799">
    <title>Repeated Labeling Using Multiple Noisy Labelers</title>
    <link>http://hdl.handle.net/2451/29799</link>
    <description>Title: Repeated Labeling Using Multiple Noisy Labelers&lt;br/&gt;&lt;br/&gt;Ipeirotis, Panagiotis G.; Provost, Foster; Sheng, Victor; Wang, Jing&lt;br/&gt;&lt;br/&gt;Abstract: This paper addresses the repeated acquisition of labels for data itemswhen the labeling is imperfect. We examine the improvement (or lackthereof) in data quality via repeated labeling, and focus especially onthe improvement of training labels for supervised induction. With theoutsourcing of small tasks becoming easier, for example via Amazon'sMechanical Turk, it often is possible to obtain less-than-expertlabeling at low cost. With low-cost labeling, preparing the unlabeledpart of the data can become considerably more expensive than labeling.We present repeated-labeling strategies of increasing complexity, andshow several main results. (i) Repeated-labeling can improve labelquality and model quality, but not always. (ii) When labels are noisy,repeated labeling can be preferable to single labeling even in thetraditional setting where labels are not particularly cheap. (iii) Assoon as the cost of processing the unlabeled data is not free, even thesimple strategy of labeling everything multiple times can giveconsiderable advantage. (iv) Repeatedly labeling a carefully chosen setof points is generally preferable, and we present a set of robusttechniques that combine different notions of uncertainty to select datapoints for which quality should be improved. The bottom line: theresults show clearly that when labeling is not perfect, selectiveacquisition of multiple labels is a strategy that data miners shouldhave in their repertoire. For certain label-quality/cost regimes, thebenefit is substantial.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/28302">
    <title>Relevance-based Retrieval on Hidden-Web Text Databases without Ranking Support</title>
    <link>http://hdl.handle.net/2451/28302</link>
    <description>Title: Relevance-based Retrieval on Hidden-Web Text Databases without Ranking Support&lt;br/&gt;&lt;br/&gt;Hristidis, Vagelis; Hu, Yuheng; Ipeirotis, Panagiotis G.&lt;br/&gt;&lt;br/&gt;Abstract: Many online or local data sources provide powerful querying mechanismsbut limited ranking capabilities. For instance, PubMed allows users tosubmit highly expressive Boolean keyword queries, but ranks the queryresults by date only. However, a user would typically prefer a rankingby relevance, measured by an Information Retrieval (IR) rankingfunction. The naive approach would be to submit a disjunctive query withall query keywords, retrieve the returned documents, and then re-rankthem. Unfortunately, such an operation would be very expensive due tothe large number of results returned by disjunctive queries.  In thispaper we present algorithms that return the top results for a query,ranked according to an IR-style ranking function, while operating on topof a source with a Boolean query interface with no ranking capabilities(or a ranking capability of no interest to the end user). The algorithmsgenerate a series of conjunctive queries that return only documents thatare candidates for being highly ranked according to a relevance metric.Our approach can also be applied to other settings where the ranking ismonotonic on a set of factors (query keywords in IR) and the sourcequery interface is a Boolean expression of these factors. Ourcomprehensive experimental evaluation on the PubMed database and a TRECdataset show that we achieve order of magnitude improvement compared tothe current baseline approaches.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14115">
    <title>Recommendation Technologies:  Survey of Current Methods and Possible Extensions</title>
    <link>http://hdl.handle.net/2451/14115</link>
    <description>Title: Recommendation Technologies:  Survey of Current Methods and Possible Extensions&lt;br/&gt;&lt;br/&gt;Adomavicius, Gediminas; Tuzhilin, Alex&lt;br/&gt;&lt;br/&gt;Abstract: The paper presents a survey of the field of recommender systems anddescribes current recommendation methods that are usually classifiedinto the following three main categories: content-based, collaborative,and hybrid recommendation approaches. The paper also describes variouslimitations of current recommendation methods and discusses possibleextensions that can improve recommendation capabilities. Theseextensions include, among others, improvement of understanding of usersand items, incorporation of the contextual information into therecommendation process, support for multi-criteria ratings, andprovision of more flexible and less intrusive types of recommendations.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/31253">
    <title>Pseudo-social network targeting from consumer transaction data</title>
    <link>http://hdl.handle.net/2451/31253</link>
    <description>Title: Pseudo-social network targeting from consumer transaction data&lt;br/&gt;&lt;br/&gt;Martens, David; Provost, Foster&lt;br/&gt;&lt;br/&gt;Abstract: This design science paper presents a method for targeting consumersbased on a 'pseudo-social network' (PSN): consumers are linked if theytransfer money to the same entities. A marketer can target thoseindividuals that are strongly connected to key individuals. We presentthe PSN design and a large-scale empirical study using data from a majorbank. For two different product offerings, consumers that are close toexisting customers in the PSN have significantly higher take rates thanthe 'most likely' candidates identified by state-of-the-artsocio-demographic (SD) predictive modeling. Interestingly, the PSNtargeting only does better for the closest neighbors. However, thedifferent models capture different information: combining the two doessignificantly better than either alone. The results demonstrate thatsocial targeting can be applied broadly, to settings where the networkamong consumers is unlikely to be a true social network, but nonethelesscaptures inherent similarity.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14133">
    <title>Pricing Security Software: Theory and Evidence</title>
    <link>http://hdl.handle.net/2451/14133</link>
    <description>Title: Pricing Security Software: Theory and Evidence&lt;br/&gt;&lt;br/&gt;Ghose, Anindya; Sundararajan, Arun</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14748">
    <title>Pricing Models for On-Demand Computing</title>
    <link>http://hdl.handle.net/2451/14748</link>
    <description>Title: Pricing Models for On-Demand Computing&lt;br/&gt;&lt;br/&gt;Huang, Ke-Wei; Sundararajan, Arun&lt;br/&gt;&lt;br/&gt;Abstract: On-demand computing provides a new way for companies to manage and use their IT infrastructure. This model of corporate computing radically changes the way companies pay for their IT infrastructure, basing it on &amp;quot;pay per use&amp;quot; rather than on the fixed infrastructure investments such companies are accustomed to. A clear theoretical understanding of pricing on-demand computing is thus central to the viability and growth of this nascent industry. We contribute towards such an understanding in this paper by modeling the optimal pricing of on-demand computing while taking four critical factors into account: the costs of deploying IT in-house, the business value of this IT, the scale of the provider's on-demand computing infrastructure, and the variable costs of providing on-demand computing. Three distinct pricing models emerge as optimal among all possible pricing functions for on-demand computing. These models describe when volume discounting, free usage and demand caps should be used to manage demand appropriately and profitably. We also outline a likely path that the transformation towards on-demand computing will follow &amp;mdash; under which low-usage customers are targeted initially, followed by a broadening of the market, and finally, a focus on profiting from inducing adoption by high-usage customers &amp;mdash; and prescribe how the associated pricing models should evolve appropriately.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/28304">
    <title>Prediction in Financial Markets: The Case for Small Disjuncts</title>
    <link>http://hdl.handle.net/2451/28304</link>
    <description>Title: Prediction in Financial Markets: The Case for Small Disjuncts&lt;br/&gt;&lt;br/&gt;Dhar, Vasant&lt;br/&gt;&lt;br/&gt;Abstract: Predictive models in regression and classification problems typicallyhave a single model that covers most, if not all, cases in the data. Atthe opposite end of the spectrum is a collection of models each of whichcovers a very small subset of the decision space. These are referred toas &amp;ldquo;small disjuncts.&amp;rdquo; The tradeoffs between the two types ofmodels have been well documented. Single models, especially linear ones,are easy to interpret and explain. In contrast, small disjuncts do notprovide as clean or as simple an interpretation of the data, and havebeen shown by several researchers to be responsible for adisproportionately large number of errors when applied to out of sampledata. This research provides a counterpoint, demonstrating that&amp;ldquo;simple&amp;rdquo; small disjuncts provide a credible model forfinancial market prediction, a problem with a high degree of noise. Arelated novel contribution of this paper is a simple method formeasuring the &amp;ldquo;yield&amp;rdquo; of a learning system, which is thepercentage of in sample performance that the learned model can beexpected to realize on out-of-sample data. Curiously, such a measure ismissing from the literature on regression learning algorithms.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14103">
    <title>Personalized Pricing and Quality Differentiation</title>
    <link>http://hdl.handle.net/2451/14103</link>
    <description>Title: Personalized Pricing and Quality Differentiation&lt;br/&gt;&lt;br/&gt;Choudhary, Vidyanand; Ghose, Anindya; Mukhopadhyay, Tridas; Rajan, Uday&lt;br/&gt;&lt;br/&gt;Abstract: We develop an analytical framework to investigate the competitive implications of personalized pricing (PP), whereby firms charge different prices to different consumers, based on their willingness to pay. We embed personalized pricing in a model of vertical product differentiation, and show how it affects firms' choices over quality. We show that firms' optimal pricing strategies with PP may be non-monotonic in consumer valuations. When the PP firm has a high quality both firms raise their qualities, relative to the uniform pricing case. Conversely, when the PP firm has low quality, both firms lower their qualities. Although many firms are trying to implement such pricing policies, we find that a higher quality firm can actually be worse off with PP. While it is optimal for the firm adopting PP to increase product differentiation, the non-PP firm seeks to reduce differentiation by moving in closer in the quality space. While PP results in a wider market coverage, it also leads to aggravated price competition between firms. Since this entails a change in equilibrium qualities, the nature of the cost function determines whether firms gain or lose by implementing such PP policies. Despite the threat of first-degree price discrimination, we find that personalized pricing with competing firms can lead to an overall increase in consumer welfare.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14761">
    <title>Personalized Pricing and Quality Design</title>
    <link>http://hdl.handle.net/2451/14761</link>
    <description>Title: Personalized Pricing and Quality Design&lt;br/&gt;&lt;br/&gt;Ghose, Anindya; Huang, Ke-Wei&lt;br/&gt;&lt;br/&gt;Abstract: We develop an analytical framework to investigate the competitive implications of personalized pricing and quality allocation (PPQ), whereby firms charge different prices and offer different qualities to different consumers, based on their willingness to pay. We embed PPQ in a model of spatial differentiation, and show how information about consumer preferences affects multi-product firms' choices over pricing schedules and product line offerings. We show that firms' optimal pricing strategies with PPQ will be non-monotonic in consumer valuations. Our model sheds light on the different product quality schedules offered by firms, given that one or both firms implement PPQ. Contrary to prior literature on one-to-one marketing, we show that even symmetric firms can avoid the well-known Prisoner's Dilemma problem due to the quality enhancement effect at the individual consumer level. The rent extraction effect due to quality enhancement dominates the adverse effect of price competition. Moreover, this result is stronger when firms have a larger proportion of loyal consumers. When both firms have PPQ, consumer surplus is non-monotonic in valuations such that some low valuation consumers get higher surplus than high valuation consumers. For a wide range of fixed costs, we also demonstrate some results on the profitability of adopting PPQ and show the emergence of asymmetric equilibria, where one firm adopts PPQ and the other firm does not when the number of loyal customers is less than a critical value. We extend our analysis to asymmetric firms and show that when one firm adopts PPQ, it always increases its quality level while the other firm keeps its quality schedule unchanged compared to when neither firm has PPQ. We demonstrate that a firm with an ex-ante, smaller loyal segment can be better off with PPQ.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14106">
    <title>Online Auction and List Price Revenue Management</title>
    <link>http://hdl.handle.net/2451/14106</link>
    <description>Title: Online Auction and List Price Revenue Management&lt;br/&gt;&lt;br/&gt;Caldentey, Rene; Vulcano, Gustavo&lt;br/&gt;&lt;br/&gt;Abstract: We analyze a revenue management problem in which a seller facing a Poisson arriving stream of customers operates an online multiunit auction. Customers have an alternative list price channel where to get the product from. We consider two variants of this problem: In the first one, the list price is an external channel run by another firm. In the second variant, the seller manages simultaneously both the auction and the list price channels. Each consumer, trying to maximize his own surplus, must decide either to buy at the posted price and get the item at no risk, or to join the auction and wait until its end, where the winners are revealed and the auction price is disclosed. Our approach consists of two parts. First, we study structural properties of the problem, and show that the equilibrium strategy for both versions of this game is of the threshold type, meaning that a consumer will join the auction only if his arrival time is above a function of his own valuation. This consumer's strategy can be computed using an iterative algorithm in a function space, provably convergent under some conditions. Unfortunately, this procedure is computationally intensive. To overcome this, we formulate an asymptotic version of the problem, in which the demand rate and the initial number of units grow proportionally large. We get a simple closed form for the equilibrium strategy in this regime, which is then used as an approximated solution for the original problem. Numerical computations show that this heuristic is very accurate. The asymptotic solution culminates then in simple and precise recipes for how bidders should behave, and how the seller should structure the auction, and price the product in the dual channel case.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/29922">
    <title>Online Active Inference and Learning</title>
    <link>http://hdl.handle.net/2451/29922</link>
    <description>Title: Online Active Inference and Learning&lt;br/&gt;&lt;br/&gt;Attenberg, Josh; Provost, Foster&lt;br/&gt;&lt;br/&gt;Abstract: We present a framework for active inference, the selective acquisitionof labels for cases at prediction time in lieu of using the estimatedlabels of a predictive model.  The framework generalizes prior work onprediction time label acquisition. We develop techniques within thisactive inference framework for classifying streams, for example, forclassifying web pages where online advertisements are being served. Suchstream applications present  novel complications; specifically, (i) wedon't know at the time of any label acquisition decision what instanceswe will see, and (ii) instances repeat based on some unknown (andpossibly skewed) distribution.  To propose a solution, we combine ideasfrom decision theory, cost-sensitive learning, on-line densityestimation, and on-line utility estimation.  The resulting model tellswhich instances to label so that by the end of the budget period, thebudget is best spent (in expectation).  We test the method on streamsfrom a real application and on partially synthetic streams.  The mainresults show that: (1) active inference on streams can indeed reduceerror cost substantially over not doing the on-line estimations, and (2)more sophisticated on-line estimation provides more reduction in error.We also discuss relationships with active learning: What if you alsoneed to learn the model while doing the active inference?</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14108">
    <title>Nonconvex Production Technology and Price Discrimination</title>
    <link>http://hdl.handle.net/2451/14108</link>
    <description>Title: Nonconvex Production Technology and Price Discrimination&lt;br/&gt;&lt;br/&gt;Jing, Bing; Radner, Roy&lt;br/&gt;&lt;br/&gt;Abstract: We revisit the issue of product line design by a monopolist and extendthe model of Mussa and Rosen (1978) in two ways. First, we consider thecase in which the unit cost is a nonconvex function of product quality.We show that the firm does not offer those qualities where the unit costis linear or exceeds its lower convex envelope. Consequently, there are&amp;quot;gaps&amp;quot; in its optimal quality choice. Second, when the firmcan offer only a limited number of quality levels (due to possible fixedcosts), we characterize the optimal location of these finitely manyquality levels. This characterization again has the property that noneof these qualities will lie within an interval where the unit cost islinear or exceeds its lower convex envelope. Several implications of theabove results are discussed.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14119">
    <title>Models of Customer Behavior: From Populations to Individuals</title>
    <link>http://hdl.handle.net/2451/14119</link>
    <description>Title: Models of Customer Behavior: From Populations to Individuals&lt;br/&gt;&lt;br/&gt;Jiang, Tianyi; Tuzhilin, Alex&lt;br/&gt;&lt;br/&gt;Abstract: There have been various claims made in the marketing community about thebenefits of 1-to-1 marketing versus traditional customer segmentationapproaches and how much they can improve understanding of customerbehavior. However, few rigorous studies exist that systematicallycompare these approaches. In this paper, we conducted such a systematicstudy and compared the performance of aggregate, segmentation, and1-to-1 marketing approaches across a broad range of experimentalsettings such as multiple segmentation levels, multiple real worldmarketing datasets, multiple dependent variables, different types ofclassifiers, different segmentation techniques, and different predictivemeasures. Our results show that, overall, 1-to-1 modeling significantlyoutperforms the aggregate approach among high-volume customers and isnever worse than aggregate approach among low-volume customers in ourexperimental settings. Moreover, the best segmentation techniques tendto outperform 1-to-1 modeling among low-volume customers.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/27716">
    <title>Modeling Volatility in Prediction Markets</title>
    <link>http://hdl.handle.net/2451/27716</link>
    <description>Title: Modeling Volatility in Prediction Markets&lt;br/&gt;&lt;br/&gt;Archak, Nikolay; Ipeirotis, Panagiotis G.&lt;br/&gt;&lt;br/&gt;Abstract: Nowadays, there is a significant experimental evidence of excellentex-post predictive accuracy in certain types of prediction markets, suchas markets for elections. This evidence shows that prediction marketsare efficient mechanisms for aggregating information and are moreaccurate in forecasting events than traditional forecasting methods,such as polls. Interpretation of prediction market prices asprobabilities has been extensively studied in the literature, howeverlittle attention so far has been given to understanding volatility ofprediction market prices. In this paper, we present a model of aprediction market with a binary payoff on a competitive event involvingtwo parties. In our model, each party has some underlying ``ability''process that describes its ability to win and evolves as an Itodiffusion. We show that if the prediction market for this event isefficient and accurate, the price of the corresponding contract willalso follow a diffusion and its instantaneous volatility is a particularfunction of the current claim price and its time to expiration. Wegeneralize our results to competitive events involving more than twoparties and show that volatilities of prediction market contracts forsuch events are again functions of the current claim prices and the timeto expiration, as well as of several additional parameters (ternarycorrelations of the underlying Brownian motions). In the experimentalsection, we validate our model on a set of InTrade prediction marketsand show that it is consistent with observed volatilities of contractreturns and outperforms the well-known GARCH model in predicting futurecontract volatility from historical price data. To demonstrate thepractical value of our model, we apply it to pricing options onprediction market contracts, such as those recently introduced byInTrade. 
Other potential applications of this model include detection ofsignificant market moves and improving forecast standard errors.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/29885">
    <title>Modeling Dependency in Prediction Markets</title>
    <link>http://hdl.handle.net/2451/29885</link>
    <description>Title: Modeling Dependency in Prediction Markets&lt;br/&gt;&lt;br/&gt;Archak, Nikolay; Ipeirotis, Panagiotis G.&lt;br/&gt;&lt;br/&gt;Abstract: In the last decade, prediction markets became popular forecasting toolsin areas ranging from election results to movie revenues and Oscarnominations. One of the features that make prediction marketsparticularly attractive for decision support applications is that theycan be used to answer what-if questions and estimate probabilities ofcomplex  events. Traditional approach to answering such questionsinvolves running a combinatorial prediction market, what is not alwayspossible. In this paper, we present an alternative, statistical approachto pricing complex claims, which is based on analyzing co-movements ofprediction market prices for basis events. Experimental evaluation ofour technique on a collection of 51 InTrade contracts representing theDemocratic Party Nominee winning Electoral College Votes of a particularstate shows that the approach outperforms traditional forecastingmethods such as price and return regressions and can be used to extractmeaningful business intelligence from raw price data.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14801">
    <title>Modeling and Managing Changes in Text Databases</title>
    <link>http://hdl.handle.net/2451/14801</link>
    <description>Title: Modeling and Managing Changes in Text Databases&lt;br/&gt;&lt;br/&gt;Ipeirotis, Panagiotis G.; Ntoulas, Alexandros; Cho, Junghoo; Gravano, Luis&lt;br/&gt;&lt;br/&gt;Abstract: Large amounts of (often valuable) information are stored inweb-accessible text databases.  ``Metasearchers'' provide unifiedinterfaces to query multiple such databases at once. For efficiency,metasearchers rely on succinct statistical summaries of the databasecontents to select the best databases for each query.  So far, databaseselection research has largely assumed that databases are static, so theassociated statistical summaries do not need to change over time.However, databases are rarely static and the statistical summaries thatdescribe their contents need to be updated periodically to reflectcontent changes.  In this article, we first report the results of astudy showing how the content summaries of 152 real web databasesevolved over a period of 52 weeks.  Then, we show how to use ``survivalanalysis'' techniques in general, and Cox's proportional hazardsregression in particular, to model database changes over time andpredict when we should update each content summary.  Finally, we exploitour change model to devise update schedules that keep the summaries upto date by contacting databases only when needed, and then we evaluatethe quality of our schedules experimentally over real web databases.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14148">
    <title>Market Segmentation for Information Goods with Network Externalities</title>
    <link>http://hdl.handle.net/2451/14148</link>
    <description>Title: Market Segmentation for Information Goods with Network Externalities&lt;br/&gt;&lt;br/&gt;Jing, Bing&lt;br/&gt;&lt;br/&gt;Abstract: Positive externalities characterize the consumption of a majority ofinformation goods such as software, various Internet services, andonline communities. In a simple model of vertical differentiation, weshow that network externality is a critical factor for the versioning ofsuch information goods. In particular, a multi-product monopolist offerstwo versions of distinct qualities. The underlying rationale is thatoffering the low-end version expands the network size and thus enhancesthe (network) value of the high-end version, allowing the firm to chargea higher price for the high-end version. In addition, we show that thelow-quality version may be offered for free under very generalconditions. Competition between firms producing compatible productsreduces their incentive to version their products due to the spillovereffects in a shared product network.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14098">
    <title>Local Network Effects and Network Structure</title>
    <link>http://hdl.handle.net/2451/14098</link>
    <description>Title: Local Network Effects and Network Structure&lt;br/&gt;&lt;br/&gt;Sundararajan, Arun&lt;br/&gt;&lt;br/&gt;Abstract: This paper presents a model of local network effects in which agents ina social network each value the adoption of a product by a heterogeneoussubset of other agents in their &amp;quot;neighborhood&amp;quot;, and haveincomplete information about the structure and strength of adoptioncomplementarities between all other agents.  It shows that the symmetricBayes-Nash equilibria of a general adoption game are in monotonestrategies, can be strictly Pareto-ranked, and the greatest suchequilibrium is uniquely coalition-proof.  Each Bayes-Nash equilibriumhas a corresponding fulfilled-expectations equilibrium under whichagents form adoption expectations locally.  Examples analyze socialnetworks that are instances of a generalized random graph, and that arecomplete graphs (a standard model of network effects).  The structure ofthe network of adopting agents is characterized as a function of theequilibrium played, and empirical implications of this characterizationare discussed.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/28084">
    <title>Limit Laws in Transaction-Level Asset Price Models</title>
    <link>http://hdl.handle.net/2451/28084</link>
    <description>Title: Limit Laws in Transaction-Level Asset Price Models&lt;br/&gt;&lt;br/&gt;Aue, Alexander; Horvath, Lajos; Hurvich, Clifford&lt;br/&gt;&lt;br/&gt;Abstract: We consider pure-jump transaction-level models for asset prices in continuous time, driven by point processes. In a bivariate model that admits cointegration, we allow for time deformations to account for such effects as intraday seasonal patterns in volatility, and non-trading periods that may be different for the two assets. Most assumptions are stated directly on the point process, though we provide sufficient conditions on the corresponding inter-trade durations for these assumptions to hold. We obtain the asymptotic distribution of the log-price process. We also obtain the asymptotic distribution of the ordinary least-squares estimator of the cointegrating parameter based on data sampled from an equally-spaced discretization of calendar time, in the case of weak fractional cointegration. Finally, we obtain the limiting distribution of the ordinary least-squares estimator of the autoregressive parameter in a simplified transaction-level univariate model with a unit root.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/28089">
    <title>Leveraging aggregate ratings for improving predictive performance of
recommender systems</title>
    <link>http://hdl.handle.net/2451/28089</link>
    <description>Title: Leveraging aggregate ratings for improving predictive performance ofrecommender systems&lt;br/&gt;&lt;br/&gt;Umyarov, Akhmed; Tuzhilin, Alexander&lt;br/&gt;&lt;br/&gt;Abstract: This paper describes an approach for incorporating externally specifiedaggregate ratings information into certain types of recommender systems,including two types of collaborating filtering and a hierarchical linearregression model. First, we present a framework for incorporatingaggregate rating information and apply this framework to theaforementioned individual rating models. Then we formally show that thisadditional aggregate rating information provides more accuraterecommendations of individual items to individual users. Further, weexperimentally confirm this theoretical finding by demonstrating onseveral datasets that the aggregate rating information indeed leads tobetter predictions of unknown ratings. We also propose scalable methodsfor incorporating this aggregate information and test our approaches onlarge datasets. Finally, we demonstrate that the aggregate ratinginformation can also be used as a solution to the cold start problem ofrecommender systems.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/23402">
    <title>Leveraging Aggregate Ratings for Better Recommendations</title>
    <link>http://hdl.handle.net/2451/23402</link>
    <description>Title: Leveraging Aggregate Ratings for Better Recommendations&lt;br/&gt;&lt;br/&gt;Umyarov, Akhmed; Tuzhilin, Alexander&lt;br/&gt;&lt;br/&gt;Abstract: The paper presents a method that uses aggregate ratings provided byvarious segments of users for various categories of items to derivebetter estimations of unknown individual ratings. This is achieved byconverting the aggregate ratings into constraints on the parameters of arating estimation model presented in the paper. The paper alsodemonstrates theoretically that these additional constraints reducerating estimation errors resulting in better rating predictions.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14124">
    <title>IT Driven Automation: The New Wave</title>
    <link>http://hdl.handle.net/2451/14124</link>
    <description>Title: IT Driven Automation: The New Wave&lt;br/&gt;&lt;br/&gt;Tuzhilin, Alex&lt;br/&gt;&lt;br/&gt;Abstract: There has been much discussion in the press about productivity improvements that grew at an annual rate of 3.55% from 2000 to 2003 [BW04]. One of the sources of this productivity growth is automation. We have all witnessed numerous ways in which companies have automated their business processes over the past decade. As a recent example, The Dallas Morning News reports in [Baj04] how Atmos Energy, the Dallas-based gas company, is automating its gas meter reading capabilities by using wireless technologies and thus reducing its staff by 225 employees over the next five years. In this article, we will examine current trends in the technology-driven automation and will argue that we are still in the early stages of a new wave of automation that will profoundly affect the economy and will significantly contribute to the productivity growth over the next 10 – 15 years. Industrial automation is an old phenomenon that goes back to the Industrial Revolution when machines replaced physical labor on a massive scale. Automation profoundly affected manufacturing over the past 25 years when industrial robots replaced various manual jobs in different spheres of manufacturing, including automobiles, computers and telecommunication equipment. More recently, automation was primarily driven by IT. For example, toll booth collectors recently became victims of IT-based automation when some of them lost their jobs to EZ-Pass technologies. Similarly, 225 employees at Atmos Energy will lose their jobs within the next 5 years due to the advancements in wireless technologies [Baj04]. Also, many cashiers in department stores and supermarkets will soon lose their jobs because of the advancements of the RFID tag technologies. Most of the jobs lost to automation have been routine production jobs, according to the job classification proposed by Robert Reich in [Rei91].
The main characteristics of these jobs are repetitiveness and structuredness since they have well defined procedural job descriptions. Examples of these jobs include assembly line workers, foremen, data processors, and toll collectors. The routine production jobs have been replaced by mechanical, electrical and IT-driven machines, including industrial robots and wireless communication devices. In this article, we claim that the next wave of automation will affect not only routine production workers, but also what Reich calls symbolic-analytic workers [Rei91], such as engineers, office and knowledge workers, managers, educators, and other groups of &amp;quot;mind workers.&amp;quot; Although few of these jobs will be eliminated completely, many of the more routine tasks in these jobs will be delegated to &amp;quot;smart machines&amp;quot; within the next 10 – 15 years, leading to major restructuring and consolidation of some of these jobs. This phenomenon is examined in the rest of this article.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14099">
    <title>Internet Exchanges for Used Books:  An Empirical Analysis of Welfare Implications</title>
    <link>http://hdl.handle.net/2451/14099</link>
    <description>Title: Internet Exchanges for Used Books:  An Empirical Analysis of Welfare Implications&lt;br/&gt;&lt;br/&gt;Ghose, Anindya; Smith, Michael D.; Telang, Rahul&lt;br/&gt;&lt;br/&gt;Abstract: Information technology-enabled exchanges have enhanced the viability of a variety of secondary markets, notably markets for used books. Electronic used book exchanges, in particular, offer a wider selection, lower search costs, and significantly lower prices than physical used bookstores do. The increased viability of these used book markets has caused concern among groups such as the Book Publishers Association and Author's Guild who believe that used book markets will significantly cannibalize new book sales.  This proposition, while theoretically possible, is based on speculation as opposed to empirical evidence. In this research, we use a unique dataset collected from Amazon.com's new and used marketplaces to estimate the impact of IT-enabled used book markets on new book sales. We use these data to calculate the impact of these secondary market exchanges on consumer and publisher welfare by calculating the cross-price elasticity of new books sales with respect to used book prices.  Our analysis suggests that IT-enabled secondary market exchanges increase consumer surplus by approximately $70 million annually. Further, we find that only 15% of used book sales at Amazon cannibalize new book purchases. The remaining 85% of used book sales apparently would not have occurred at Amazon's new book prices.
This low cannibalization means that book publishers lose only $32 million in gross profit annually (about 0.2% of total gross profit) due to the presence of Amazon's used book markets. Further, the additional used book readership gain from these electronic markets may mitigate author losses through increased revenue from secondary sources such as speaking and licensing fees. These surplus changes, combined with the estimated $64 million the used book market added to Amazon's gross profits, show that IT-enabled used markets for books have a strong positive first-order impact on total welfare.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14811">
    <title>Information Disclosure and Regulatory Compliance: Economic Issues and
Research Directions</title>
    <link>http://hdl.handle.net/2451/14811</link>
    <description>Title: Information Disclosure and Regulatory Compliance: Economic Issues andResearch Directions&lt;br/&gt;&lt;br/&gt;Ghose, Anindya&lt;br/&gt;&lt;br/&gt;Abstract: The Sarbanes Oxley Act (SOA) introduced significant changes to financialpractice and corporate governance regulation, including stringent newrules designed to protect investors by improving the accuracy andreliability of corporate disclosures. Briefly speaking, it requiresmanagement to submit a report containing an assessment of theeffectiveness of the internal control structure, a description ofmaterial weaknesses in such internal controls and of any materialnoncompliance. Such mandatory regulations can have some broaderramifications on firm profitability, market structure and socialwelfare, many of which were unintended when policy makers firstformulated this Act. Moreover, the tight coupling between complianceactivities, information disclosure and IT investments can haveimplications for IT governance because of its potential to changerelationships between technology investments and business. This articleaims to provide some intuitive insights into the trade-offs involved forfirms in disclosure of such information, and gives an overview of someresearch questions that would be of interest to academics, industryexecutives and policy makers alike.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14101">
    <title>Impact of Electronic Secondary Markets on Information Goods Suppliers</title>
    <link>http://hdl.handle.net/2451/14101</link>
    <description>Title: Impact of Electronic Secondary Markets on Information Goods Suppliers&lt;br/&gt;&lt;br/&gt;Ghose, Anindya; Telang, Rahul; Krishnan, Ramayya&lt;br/&gt;&lt;br/&gt;Abstract: We develop an analytical framework to investigate the competitive implications of electronic secondary markets which promote concurrent selling of new and used goods. In secondary markets where suppliers cannot directly use second-hand goods for practicing inter-temporal price discrimination, the threat of cannibalization of new goods by used goods becomes significant. We examine conditions under which it is optimal for suppliers to operate in such markets, explaining why these markets may not always be a threat to suppliers. Intuitively, secondary markets provide an active outlet for some consumers to sell their second-hand (used) goods. Such sales lead to an increase in their valuation for the new good due to the potential for an increase in the disposable income from resale. This increased valuation leads them to buy an additional new good. Thus the &amp;quot;income effect&amp;quot; can mitigate the losses incurred by suppliers from the direct &amp;quot;cannibalization effect&amp;quot; in the presence of secondary markets. We highlight the strategic role which used goods commission set by the retailer plays in determining profits for suppliers. Contrary to conventional wisdom, our model predicts the reduction in the price of new goods with an increase in the availability of used goods. Further, we show that as the used good price increases, the new good price also increases. We conclude the paper by empirically testing some implications of our model using data from the online book industry.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/25882">
    <title>Get Another Label? Improving Data Quality and Data Mining Using
Multiple, Noisy Labelers</title>
    <link>http://hdl.handle.net/2451/25882</link>
    <description>Title: Get Another Label? Improving Data Quality and Data Mining UsingMultiple, Noisy Labelers&lt;br/&gt;&lt;br/&gt;Sheng, Victor; Provost, Foster; Ipeirotis, Panagiotis G.&lt;br/&gt;&lt;br/&gt;Abstract: This paper addresses the repeated acquisition of labels for data itemswhen the labeling is imperfect.  We examine the improvement (or lackthereof) in data quality via repeated labeling, and focus especially onthe improvement of training labels for supervised induction. With theoutsourcing of small tasks becoming easier, for example via Rent-A-Coderor Amazon's Mechanical Turk, it often is possible to obtainless-than-expert labeling at low cost. With low-cost labeling, preparingthe unlabeled part of the data can become considerably more expensivethan labeling.  We present repeated-labeling strategies of increasingcomplexity, and show several main results. (i) Repeated-labeling canimprove label quality and model quality, but not always. (ii) Whenlabels are noisy, repeated labeling can be preferable to single labelingeven in the traditional setting where labels are not particularly cheap.(iii) As soon as the cost of processing the unlabeled data is not free,even the simple strategy of labeling everything multiple times can giveconsiderable advantage. (iv) Repeatedly labeling a carefully chosen setof points is generally preferable, and we present a robust techniquethat combines different notions of uncertainty to select data points forwhich quality should be improved. The bottom line: the results showclearly that when labeling is not perfect, selective acquisition ofmultiple labels is a strategy that data miners should have in theirrepertoire; for certain label-quality/cost regimes, the benefit is substantial.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14872">
    <title>Geography and Electronic Commerce: Measuring Convenience, Selection, and Price</title>
    <link>http://hdl.handle.net/2451/14872</link>
    <description>Title: Geography and Electronic Commerce: Measuring Convenience, Selection, and Price&lt;br/&gt;&lt;br/&gt;Forman, Chris; Ghose, Anindya; Goldfarb, Avi&lt;br/&gt;&lt;br/&gt;Abstract: We develop a formal model of online-offline substitution to identifythree factors that drive consumers to purchase online: convenience,selection, and price. This model builds hypotheses on how features ofoffline retail supply impact online purchasing. We then examine how thelocal availability of offline retail options drives use of the onlinechannel and consequently how the convenience, selection, and priceadvantages of the online channel may vary by geographic location. Inparticular, we examine the effect of local store openings on online bookpurchases in that location. We explore this problem using data fromAmazon on the top selling books for 1501 unique locations in the US for10 months ending in January 2006. In addition to this data, we useinformation on changes in local retail competition as measured byopenings of large specialty bookstores such as Borders or Barnes &amp;amp;Noble and discount stores such as Wal-Mart or Target. We show that evencontrolling for product-specific preferences by location, changes inlocal retail options have substantial effects on online purchases. Wedemonstrate how the convenience, selection, and price benefits of theInternet are different for customers in different types of locations.More generally, we show that geography significantly impacts the benefitthat consumers derive from electronic markets.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/31279">
    <title>Geo-Social Targeting for Privacy-friendly Mobile Advertising: Position Paper</title>
    <link>http://hdl.handle.net/2451/31279</link>
    <description>Title: Geo-Social Targeting for Privacy-friendly Mobile Advertising: Position Paper&lt;br/&gt;&lt;br/&gt;Provost, Foster&lt;br/&gt;&lt;br/&gt;Abstract: This position paper is about methods for effective, privacy-friendlymobile advertising.  Specifically, we propose a new social-targetingdesign for using consumer location data from mobile devices (smartphones, smart pads, laptops, etc.) to target advertisements in a mannerthat is both effective and privacy friendly.  This paper introduces themain concepts, provides theoretical background and ties to theliterature, presents the design itself, and discusses applications.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14111">
    <title>Finite Brand Loyalty and Equilibrium Price Promotions</title>
    <link>http://hdl.handle.net/2451/14111</link>
    <description>Title: Finite Brand Loyalty and Equilibrium Price Promotions&lt;br/&gt;&lt;br/&gt;Jing, Bing; Wen, Zhong&lt;br/&gt;&lt;br/&gt;Abstract: The extant literature on price promotions typically assumes that consumers loyal to a brand never switch to a competing brand, with Shilony (1977) and Raju et al (1990) being exceptions. Extending the Narasimhan (1988) model, we allow loyal consumers to hold finite brand loyalty. Our unique equilibrium splits into three types, depending upon configurations of consumer reservation utility, brand strength and switcher population. The type of equilibrium for high brand loyalty corresponds to the one in Narasimhan (1988). The remaining two types for intermediate and low brand loyalty demonstrate strikingly different properties. First, the strong brand has a higher price range and a higher regular price. Second, the strong brand has a higher (lower) average promotional depth than the weak brand when the switcher population is small (large). Third, both brands promote equally frequently when brand loyalty is relatively low. Therefore, this analysis hopefully provides a more complete picture about firms' promotional decisions for all possible levels of brand loyalty and switcher population.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/29918">
    <title>Explaining Documents' Classiﬁcations</title>
    <link>http://hdl.handle.net/2451/29918</link>
    <description>Title: Explaining Documents' Classiﬁcations&lt;br/&gt;&lt;br/&gt;Martens, David; Provost, Foster&lt;br/&gt;&lt;br/&gt;Abstract: This is a design-science paper about methods for explaining data-drivenclassiﬁcations of text documents. Document classiﬁcation has widespreadapplications, such as with web pages for advertising, emails for legaldiscovery, blog entries for sentiment analysis, and many more. Documentdata are characterized by very high dimensionality, often with tens ofthousands to millions of variables (words). Many applications requirehuman understanding of the reasons for classiﬁcation decisions: bymanagers, client-facing employees, and the technical team.Unfortunately, due to the high dimensionality, understanding thedecisions made by the document classiﬁers is very difficult. Previousapproaches to gain insight into black-box models do not deal well withhigh-dimensional data. Our main theoretical contribution is to deﬁne anew sort of explanation, tailored to the business needs of documentclassiﬁcation and able to cope with the associated technicalconstraints. Speciﬁcally, an explanation is deﬁned as a set of words(terms, more generally) such that removing all words within this setfrom the document changes the predicted class from the class ofinterest. We present an algorithm to ﬁnd such explanations, as well as aframework to assess such an algorithm's performance. We demonstrate thevalue of the new approach with a case study from a real-world documentclassiﬁcation task: classifying web pages as containing adult content,with the goal of allowing advertisers to choose not to have their adsappear there. We present a further empirical demonstration on news-storytopic classiﬁcation using the 20 Newsgroups benchmark dataset. 
Theresults show the explanations to be concise and document-speciﬁc, and toprovide insight into the exact reasons for the classiﬁcation decisions,into the workings of the classiﬁcation models, and into the businessapplication itself. We also illustrate how explaining documents'classiﬁcations can help to improve data quality and model performance.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14114">
    <title>Evaluating Pricing and Product Line Strategy Using eCommerce Data:
Evidence and Estimation Challenges</title>
    <link>http://hdl.handle.net/2451/14114</link>
    <description>Title: Evaluating Pricing and Product Line Strategy Using eCommerce Data: Evidence and Estimation Challenges&lt;br/&gt;&lt;br/&gt;Ghose, Anindya; Sundararajan, Arun&lt;br/&gt;&lt;br/&gt;Abstract: As Internet-based commerce becomes increasingly widespread, large data sets about the demand for and pricing of a wide variety of products become available. These present exciting new opportunities for empirical economic and business research, but also raise new statistical issues and challenges. In this article, we summarize a program of research that aims to assess the optimality of price discrimination in the software industry using a large ecommerce panel data set gathered from Amazon.com. We describe the key parameters relating to demand and cost that must be reliably estimated in order to successfully accomplish this research, and outline our approach to estimating these parameters. This includes a method for &amp;quot;reverse engineering&amp;quot; actual demand levels from the sales ranks reported by Amazon, and approaches to estimating demand elasticity and variable costs directly from publicly available ecommerce data. Our analysis raises many new challenges to the reliable statistical analysis of ecommerce data, and we conclude with a brief summary of some salient ones.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/27680">
    <title>Estimating the Socio-Economic Impact of Product Reviews: Mining Text and
Reviewer Characteristics</title>
    <link>http://hdl.handle.net/2451/27680</link>
    <description>Title: Estimating the Socio-Economic Impact of Product Reviews: Mining Text andReviewer Characteristics&lt;br/&gt;&lt;br/&gt;Ghose, Anindya; Ipeirotis, Panagiotis G.&lt;br/&gt;&lt;br/&gt;Abstract: With the rapid growth of the Internet, the ability of users to createand publish content has created active electronic communities thatprovide a wealth of product information. However, the high volume ofreviews that are typically published for a single product makes harderfor individuals as well as manufacturers to locate the best reviews andunderstand the true underlying quality of a product. In this paper, were-examine the impact of reviews on economic outcomes like product salesand see how different factors affect social outcomes like the extent oftheir perceived usefulness. Our approach explores multiple aspects ofreview text, such as lexical, grammatical, semantic, and stylisticlevels to identify important text-based features. In addition, we alsoexamine multiple reviewer-level features such as average usefulness ofpast reviews and the self-disclosed identity measures of reviewers thatare displayed next to a review. Our econometric analysis reveals thatthe extent of subjectivity, informativeness, readability, and linguisticcorrectness in reviews matters in influencing sales and perceivedusefulness. Reviews that have a mixture of objective, and highlysubjective sentences have a negative effect on product sales, comparedto reviews that tend to include only subjective or only objectiveinformation. However, such reviews are considered more informative (orhelpful) by the users. By using Random Forest based classifiers, we showthat we can accurately predict the impact of reviews on sales and theirperceived usefulness. Reviews for products that have received widelyfluctuating reviews, also have reviews of widely fluctuatinghelpfulness. 
In particular, we find that highly detailed and readablereviews can have low helpfulness votes in cases when users tend to votenegatively not because they disapprove of the review quality but ratherto convey their disapproval of the review polarity. We examine therelative importance of the three broad feature categories:`reviewer-related' features, `review subjectivity' features, and `reviewreadability' features, and find that using any of the three feature setsresults in a statistically equivalent performance as in the case ofusing all available features. This paper is the first study thatintegrates econometric, text mining, and predictive modeling techniquestoward a more complete analysis of the information captured byuser-generated online reviews in order to estimate their socio-economicimpact. Our results can have implications for judicious design ofopinion forums.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14113">
    <title>Electronic Markets, Search Costs and Firm Boundaries</title>
    <link>http://hdl.handle.net/2451/14113</link>
    <description>Title: Electronic Markets, Search Costs and Firm Boundaries&lt;br/&gt;&lt;br/&gt;Sankaranarayanan, Ramesh; Sundararajan, Arun&lt;br/&gt;&lt;br/&gt;Abstract: We study how electronic markets that facilitate broader inter-firmtransactions affect the vertical scope of emerging IT-enabled extendedenterprises. We do so by modeling firms in a three-tier value chain whoare each connected to a common electronic market that facilitates directbusiness transactions across tiers, and that lowers the search costsassociated with finding an appropriate trading partner for each of them.The extent to which search costs are reduced depends on the complexityof B2B search, and the nature of the supporting technologies that theelectronic market facilitates. Variation in search costs affect firmsacross the value chain in three key ways: by a change in the transactioncosts of interaction between firms; by a change in the contracting costsassociated with outsourcing owing to changes in the costs of moralhazard for delegated search, and by a change in the price dispersion ofupstream input commodities. We capture each of these effects in a newmodel that integrates search theory into the principal-agent framework,and establish that the optimal outsourcing contract has a simple&amp;quot;all or nothing&amp;quot; performance-based structure under fairlygeneral assumptions. We then apply this model to contrast the effectthat different information technologies have on the relative B2B searchcosts of different firms in the value chain, contrasting the predictedchanges of proportionate, constant and convergent changes in searchcosts. When integrated with a detailed analysis of the nature of B2Bsearch, these results predicts that when B2B search isinformation-intensive, electronic markets will facilitate an increase inoutsourcing, market-based transactions and a reduction in the verticalscope of extended enterprises. 
In contrast, when B2B search is primarily communication-intensive, electronic markets will lead to tighter integration and an increase in the vertical scope of the extended enterprise. Our research suggests that the nature of the information technologies and of the business activities supported by an electronic market are crucial determinants of the organizational and industry changes they induce, and our results have important implications for a variety of industries in which both technological and agency issues will influence the eventual success of global IT-facilitated extended enterprise initiatives.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/29649">
    <title>Dynamic Pricing of Network Goods with Boundedly Rational Consumers</title>
    <link>http://hdl.handle.net/2451/29649</link>
    <description>Title: Dynamic Pricing of Network Goods with Boundedly Rational Consumers&lt;br/&gt;&lt;br/&gt;Radner, Roy; Radunskaya, Ami; Sundararajan, Arun&lt;br/&gt;&lt;br/&gt;Abstract: We present a model of dynamic monopoly pricing for a good that displaysnetwork effects. In contrast with the standard notion of arational-expectations equilibrium, we model consumers as boundedlyrational, and unable either to pay immediate attention to each pricechange, or to make accurate forecasts of the adoption of the networkgood. Our analysis shows that the seller's optimal price  trajectory hasthe following structure: the price is low when the user base is below atarget level, is high when the user base is above the target, and is setto keep user base stationary once the target level has been attained. Weshow that this pricing policy is robust to a number of extensions, whichinclude the product's user base evolving over time, and consumers basingtheir choices on a mixture of a myopic and a &amp;quot;stubborn&amp;quot;expectation of adoption. Our results differ significantly from thosethat would be predicted by a model based on rational-expectationsequilibrium, and are more consistent with the pricing of network goodsobserved in practice.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14104">
    <title>Dynamic Pricing of Network Goods with Boundedly Rational Consumers</title>
    <link>http://hdl.handle.net/2451/14104</link>
    <description>Title: Dynamic Pricing of Network Goods with Boundedly Rational Consumers&lt;br/&gt;&lt;br/&gt;Radner, Roy; Sundararajan, Arun&lt;br/&gt;&lt;br/&gt;Abstract: An important simplifying assumption made when analyzing goods thatdisplay positive network effects is that potential consumers can form arational expectation of the equilibrium demand for the good, and thatthey all form the same expectation, which is then fulfilled based ontheir consumption choices - sometimes called a rational expectationsequilibrium (REE). We examine whether the results of these models arerobust to the relaxation of this assumption. In our model, consumersdiffer in their marginal utility of total demand (intensity of thenetwork effect), which varies according to a given distribution (thedistribution of consumer &amp;quot;types&amp;quot;), and are boundedly rationalin two ways. First, only a fraction of consumers &amp;quot;payattention&amp;quot; to price announcements in any interval of time. Second,those consumers who pay attention make their consumption choices basedon a boundedly rational expectation of future demand. Our benchmarkmodel is of myopic expectations, although we show how our resultsgeneralize (1) to a case in which the population of consumers containsboth those who are myopic and those who are &amp;quot;fully rational,&amp;quot;and (2) to a case in which consumers have expectations that are partly&amp;quot;stubborn&amp;quot;. We base our analysis on a continuous-timeapproximation of an underlying discrete-time model. Under thisapproximation, the instantaneous choices of consumers continuouslyinfluence the rate at which demand adjusts over time, and a monopolistchooses a price trajectory to maximize profit. First, we show that,under fairly general assumptions about the distribution of types, theprofit-maximizing rational expectations equilibrium is not a steadystate of the optimal trajectory with boundedly rational consumers. 
Our second theorem shows that if consumer types are uniformly distributed and consumers form myopic (or more rational) expectations, the monopolist's optimal pricing trajectory is generated by a &amp;quot;target policy&amp;quot; with the following properties: when current demand is below the target, the price is zero; when current demand is above the target, the price is the maximum possible, and when current demand is at the target, the price is chosen to keep demand stationary. We also show that the optimal demand target with boundedly rational consumers is always strictly lower than the equilibrium level of demand predicted by a model with rational expectations. Furthermore, the difference between the target demand and the rational expectations demand is higher when consumers pay attention to the monopolist's price announcements at a lower rate. We generalize the results from this example in two ways. Our third theorem examines the case of myopic consumers and strictly concave distributions of consumer types. To find an optimal policy one must expand the set of controls to include measure-valued controls. The optimal policy is similar to the target policy of Theorem 2, except that when current demand is at the target, the monopolist chooses the &amp;quot;mixture&amp;quot; between a price of zero and the maximum possible price that keeps demand stationary. For convex consumer type distributions in the neighborhood of the uniform distribution, we give a heuristic argument to support a conjecture that the monopolist continues to choose a demand target lower than the rational expectations demand, but varies price gradually in the neighborhood of the demand target. 
Finally, for uniformly distributed types and consumer expectations that are both myopic and &amp;quot;stubborn&amp;quot;, we show that the monopolist's optimal pricing trajectory is generated by a target policy with the same properties as those in Theorem 2, although with a target that is strictly lower, and that increases as consumers become progressively less stubborn. (This paper is part of a program of research whose broad objective is to explore the conditions under which the assumption of unbounded rationality in economic models is a reasonable one).</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14758">
    <title>Dynamic Pricing of Network Goods with Boundedly Rational Consumers</title>
    <link>http://hdl.handle.net/2451/14758</link>
    <description>Title: Dynamic Pricing of Network Goods with Boundedly Rational Consumers&lt;br/&gt;&lt;br/&gt;Radner, Roy; Sundararajan, Arun&lt;br/&gt;&lt;br/&gt;Abstract: We present a model of dynamic monopoly pricing for a good that displays network effects. In contrast with the standard notion of a rational-expectations equilibrium, we model consumers as boundedly rational, and unable either to pay immediate attention to each price change, or to make accurate forecasts of the adoption of the network good. Our analysis shows that the seller's optimal price trajectory has the following simple structure: the price is zero when the product user base is below a specific threshold, and is chosen to keep user base stationary once this threshold demand level has been attained. We show that our prescribed pricing policy is robust to a number of extensions, which include the product's user base evolving over time, a fraction of consumers being sufficiently rational to make accurate adoption forecasts, and consumers basing their choices on a mixture of a myopic and a &amp;quot;stubborn&amp;quot; expectation of adoption. Our results differ significantly from those that would be predicted by a model based on rational-expectations equilibrium, and are more consistent with the pricing of network goods observed in practice.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14760">
    <title>Duplicate Record Detection: A Survey</title>
    <link>http://hdl.handle.net/2451/14760</link>
    <description>Title: Duplicate Record Detection: A Survey&lt;br/&gt;&lt;br/&gt;Elmagarmid, Ahmed; Ipeirotis, Panagiotis G.; Verykios, Vassilios&lt;br/&gt;&lt;br/&gt;Abstract: Often, in the real world, entities have two or more representations indatabases. Duplicate records do not share a common key and/or theycontain errors that make duplicate matching a difficult task. Errors areintroduced as the result of transcription errors, incompleteinformation, lack of standard formats or any combination of thesefactors. In this article, we present a thorough analysis of theliterature on duplicate record detection. We cover similarity metricsthat are commonly used to detect similar field entries, and we presentan extensive set of duplicate detection algorithms that can detectapproximately duplicate records in a database. We also cover multipletechniques for improving the efficiency and scalability of approximateduplicate detection algorithms. We conclude with a coverage of existingtools and with a brief discussion of the big open problems in the area.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14808">
    <title>Does IT Matter in Business Education? Interviews with Business School Deans</title>
    <link>http://hdl.handle.net/2451/14808</link>
    <description>Title: Does IT Matter in Business Education? Interviews with Business School Deans&lt;br/&gt;&lt;br/&gt;Dhar, Vasant; Sundararajan, Arun&lt;br/&gt;&lt;br/&gt;Abstract: How are business schools thinking about developing leaders for theemerging digital economy? To answer this question, we interviewed 45business school deans about whether knowledge about IT in businessshould be a part of core MBA education, and if so, how this knowledgeshould be delivered. A majority of deans recognize the importance of ITin business and the need for its presence in a forward looking corebusiness curriculum that is training managers for an increasingly globaland information rich future. There are three themes around which such apresence is described by them: understanding how the transformative andwealth generating potential of IT changes business and society,understanding how to make successful IT investment decisions, andfacilitating innovation and creativity in the use of increasinglyavailable data for decision making. However, a significant fraction ofthese deans struggle with the delivery of IT content in their corecurriculum, and there is a clear divergence between the extent to whichbusiness school leadership considers IT in business important, and itsrealized presence in core MBA education. We identify the main reasonsthat contribute towards this divergence and how some schools areaddressing it. Based on our findings, we outline the business importanceand intellectual foundations for a natural question around which coreeducation about IT in business can be structured, which asks &amp;quot;Howdoes IT transform business and society?&amp;quot;</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/23783">
    <title>Does Chatter Matter?  The Impact of User-Generated Content on Music Sales</title>
    <link>http://hdl.handle.net/2451/23783</link>
    <description>Title: Does Chatter Matter?  The Impact of User-Generated Content on Music Sales&lt;br/&gt;&lt;br/&gt;Dhar, Vasant; Chang, Elaine&lt;br/&gt;&lt;br/&gt;Abstract: The Internet has enabled the era of user-generated content, potentiallybreaking the hegemony of traditional content generators as the primarysources of &amp;ldquo;legitimate&amp;rdquo; information. Prime examples ofuser-generated content are blogs and social networking sites, whichallow easy publishing of and access to information. In this study, weexamine the usefulness of such content, consisting of data from blogsand social networking sites in predicting sales in the music industry.We track the changes in online chatter for a sample of 108 albums forfour weeks before and after their release dates. We use linear andnonlinear regression to identify the relative significance of onlinevariables on their observation date in predicting future album unitsales two weeks ahead Our findings are as follows: (a) the volume ofblog posts about an album is positively correlated with future sales,(b) greater increases in an artist&amp;rsquo;s Myspace friends week overweek have a weaker correlation to higher future sales, (c) traditionalfactors are still relevant &amp;ndash; albums released by major labels andalbums with a number of reviews from mainstream sources like RollingStone also tended to have higher future sales. More generally, the studyprovides some preliminary answers for marketing managers interested inassessing the relative importance of the burgeoning number of &amp;ldquo;Web2.0&amp;rdquo; information metrics that are becoming available on theInternet, and how looking at interactions among them could providepredictive value beyond viewing them in isolation. The study alsoprovides a framework for thinking about when user-generated contentinfluences decision making.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/29941">
    <title>Digital Access, Political Networks and the Diffusion of Democracy</title>
    <link>http://hdl.handle.net/2451/29941</link>
    <description>Title: Digital Access, Political Networks and the Diffusion of Democracy&lt;br/&gt;&lt;br/&gt;Rhue, Lauren; Sundararajan, Arun&lt;br/&gt;&lt;br/&gt;Abstract: We examine the effects of digital access on the prevalence of democracyand its diffusion via geographical and trade networks across 152countries between 2000 and 2008. Although civil liberties and mediafreedom show a consistently positive relationship with different formsof digital access, our dynamic models that allow co-evolution of digitalaccess, democracy and trade tie formation suggest that high mobilepenetration has a more significant impact on civil liberties thanInternet access does, and may also increase a country's&amp;quot;susceptibility&amp;quot; to democratic changes in neighboring nations.We explore possible drivers of these empirical findings, discussing somesocial and political implications.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/23604">
    <title>Deriving the Pricing Power of Product Features by Mining Consumer Reviews</title>
    <link>http://hdl.handle.net/2451/23604</link>
    <description>Title: Deriving the Pricing Power of Product Features by Mining Consumer Reviews&lt;br/&gt;&lt;br/&gt;Archak, Nikolay; Ghose, Anindya; Ipeirotis, Panagiotis G.&lt;br/&gt;&lt;br/&gt;Abstract: The growing pervasiveness of the Internet has changed the way thatconsumers shop for goods. Increasingly, user-generated product reviewsserve as a valuable source of information for customers making productchoices online. While there is a significant body of theory onmulti-attribute choice under uncertainty, the literature that examinesproduct reviews has not built on this stream of theory for a variety ofreasons. Typically, the impact of product reviews has been incorporatedby numeric variables representing the valence and volume of reviews. Inthis paper we posit that the information embedded in product reviewscannot be captured by a single scalar value. Rather, we argue thatproduct reviews are multifaceted and hence, the textual content ofproduct reviews is an important determinant of consumers' choices, overand above the valence and volume of reviews. We provide a text miningtechnique that allows us to incorporate text in choice and panel datamodels by decomposing textual reviews into segments, evaluatingdifferent product features. We test our approach on a unique datasetcollected from Amazon, and demonstrate how it can be used to learnconsumers' relative preferences for different product features. Thedataset used contains three different groups of products (digitalcameras, camcorders, PDAs), associated sales data and consumer reviewdata gathered over a 15-month period. Additionally, we present anddiscuss two experimental techniques that can be used to alleviate theproblem of data sparsity and of omitted variables: the first techniquemodels consumer opinions as elements of a tensor product of independentfeature and evaluation spaces and the second technique clusters rareopinions based on pointwise mutual information. 
The paper concludes bydiscussing the managerial relevance of this work as a tool forextracting actionable business intelligence from user-generated content.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/29585">
    <title>Demographics of Mechanical Turk</title>
    <link>http://hdl.handle.net/2451/29585</link>
    <description>Title: Demographics of Mechanical Turk&lt;br/&gt;&lt;br/&gt;Ipeirotis, Panagiotis G.&lt;br/&gt;&lt;br/&gt;Abstract: We present the results of a survey that collected information about thedemographics of participants on Amazon Mechanical Turk, together withinformation about their level of activity and motivation for working onAmazon Mechanical Turk. We find that approximately 50% of the workerscome from the United States and 40% come from India. Country of origintends to change the motivating reasons for workers to participate in themarketplace. Significantly more workers from India participate onMechanical Turk because the online marketplace is a primary source ofincome, while in the US most workers consider Mechanical Turk asecondary source of income. While money is a primary motivating reasonfor workers to participate in the marketplace, workers also cite avariety of other motivating reasons, including entertainment and education.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/31553">
    <title>Data Science and Prediction</title>
    <link>http://hdl.handle.net/2451/31553</link>
    <description>Title: Data Science and Prediction&lt;br/&gt;&lt;br/&gt;Dhar, Vasant&lt;br/&gt;&lt;br/&gt;Abstract: The world's data is growing more than 40% annually. Coupled withexponentially growing computing horsepower, this provides us withunprecedented basis for 'learning' useful things from the data throughstatistical induction without material human intervention and acting onthem. Philosophers have long debated the merits and demerits ofinduction as a scientific method, the latter being that conclusions arenot guaranteed to be certain and that multiple and numerous models canbe conjured to explain the observed data. I propose that 'big data'brings a new and important perspective to these problems in that itgreatly ameliorates historical concerns about induction, especially ifour primary objective is prediction as opposed to causal modelidentification.  Equally significantly, it propels us into an era ofautomated decision making, where computers will make the bulk ofdecisions because it is infeasible or more costly for humans to do so.In this paper, I describe how scale, integration and most importantly,prediction will be distinguishing hallmarks in this coming era of DataScience.'  In this brief monograph, I define this newly emerging fieldfrom business and research perspectives.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14116">
    <title>Confidence Bands for Roc Curves</title>
    <link>http://hdl.handle.net/2451/14116</link>
    <description>Title: Confidence Bands for Roc Curves&lt;br/&gt;&lt;br/&gt;Macskassy, Sofus; Provost, Foster&lt;br/&gt;&lt;br/&gt;Abstract: In this paper we study techniques for generating and evaluating confidence bands on ROC curves. ROC curve evaluation is rapidly becoming a commonly used evaluation metric in machine learning, although evaluating ROC curves has thus far been limited to studying the area under the curve (AUC) or generation of one-dimensional confidence intervals by freezing one variable&amp;mdash;the false-positive rate, or threshold on the classification scoring function. Researchers in the medical field have long been using ROC curves and have many well-studied methods for analyzing such curves, including generating confidence intervals as well as simultaneous confidence bands. In this paper we introduce these techniques to the machine learning community and show their empirical fitness on the Covertype data set&amp;mdash;a standard machine learning benchmark from the UCI repository. We show how some of these methods work remarkably well, others are too loose, and that existing machine learning methods for generation of 1-dimensional confidence intervals do not translate well to generation of simultaneous bands&amp;mdash;their bands are too tight.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/28092">
    <title>Conditions for the Propagation of Memory Parameter from Durations to Counts and Realized Volatility</title>
    <link>http://hdl.handle.net/2451/28092</link>
    <description>Title: Conditions for the Propagation of Memory Parameter from Durations toCounts and Realized Volatility&lt;br/&gt;&lt;br/&gt;Deo, Rohit; Hurvich, Clifford M.; Soulier, Philippe; Wang, Yi&lt;br/&gt;&lt;br/&gt;Abstract: We establish sufficient conditions on durations that are stationary withfinite variance and memory parameter $d \in [0,1/2)$ to ensure that thecorresponding counting process $N(t)$ satisfies $Var N(t) \sim Ct^{2d+1}$ ($C&amp;gt;0$) as $t \rightarrow \infty$, with the same memoryparameter $d \in [0,1/2)$ that was assumed for the durations. Thus,these conditions ensure that the memory parameter in durationspropagates to the same memory parameter in the counts. We then show thatany Autoregressive Conditional Duration ACD(1,1) model with a sufficientnumber of finite moments yields short memory in counts, while any LongMemory Stochastic Duration model with $d&amp;gt;0$ and all finite momentsyields long memory in counts, with the same $d$. Finally, we providesome results about the propagation of long memory to theempirically-relevant case of realized variance estimates affected bymarket microstructure noise contamination.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/31303">
    <title>Comparative Effectiveness for Oral Anti-diabetic Treatments among Newly Diagnosed Type-2 Diabetics: Machine Learning Applied to a Large-Scale Claims Dataset</title>
    <link>http://hdl.handle.net/2451/31303</link>
    <description>Title: Comparative Effectiveness for Oral Anti-diabetic Treatments among NewlyDiagnosed Type-2 Diabetics: Machine Learning Applied to a Large-ScaleClaims Dataset&lt;br/&gt;&lt;br/&gt;Maguire, Jon; Dhar, Vasant&lt;br/&gt;&lt;br/&gt;Abstract: In this paper, we demonstrate how the US healthcare system can provideincreased benefits per unit of spend, and earlier identification of andintervention in chronic diseases through better predictive data-basedanalytics applied to the increasingly available troves of healthcareclaims data. Specifically, we demonstrate the effectiveness of datamining by applying machine learning methods to large-scale medical andpharmacy claims data for roughly 70,000 patients over six years on newlydiagnosed with type-2 diabetes, a common disease in the US costingbillions to treat. This analysis reveals important differences in costand quality among the disease's common treatments some of which havebeen published in the American Diabetes Association, and others that areregarded as tentative or have not been considered at all. The studydemonstrates the potential for using large scale data mining for betterunderstanding other major diseases including coronary problems andcancers and for focusing further inquiry in these areas.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/30284">
    <title>Comments on 'Protecting Consumer Privacy in an Era of Rapid Change: A Proposed Framework for Businesses and Policymakers'</title>
    <link>http://hdl.handle.net/2451/30284</link>
    <description>Title: Comments on 'Protecting Consumer Privacy in an Era of Rapid Change: AProposed Framework for Businesses and Policymakers'&lt;br/&gt;&lt;br/&gt;Dhar, Vasant; Hsieh, Jessy; Sundararajan, Arun&lt;br/&gt;&lt;br/&gt;Abstract: The purpose of this document is to respond to selected questions forcomment on the proposed framework in the FTC report 'Protecting ConsumerPrivacy in an Era of Rapid Change: A Proposed Framework for Businessesand Policymakers' (December 1st, 2010). Our responses are based on ourongoing research about online privacy and data risk at NYU SternSchool's Center for Digital Economy Research. Our findings are describedfurther in Dhar, Hsieh and Sundararajan (2011).</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/15026">
    <title>Collective Inference for Consumer Networks</title>
    <link>http://hdl.handle.net/2451/15026</link>
    <description>Title: Collective Inference for Consumer Networks&lt;br/&gt;&lt;br/&gt;Hill, Shawndra; Provost, Foster; Volinsky, Chris</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14150">
    <title>Collaborating on Multi-party Information Systems Development Projects: A Collective Reflection-in-Action View</title>
    <link>http://hdl.handle.net/2451/14150</link>
    <description>Title: Collaborating on Multi-party Information Systems Development Projects: A Collective Reflection-in-Action View&lt;br/&gt;&lt;br/&gt;Levina, Natalia&lt;br/&gt;&lt;br/&gt;Abstract: Growth of business-to-consumer (B2C) applications such as electronic storefronts, catalogues, and customer support websites has drawn a great number of diverse stakeholders into the IS Development (ISD) practice. Marketing, strategy, and graphic design specialists have joined a variety of technical professionals and business stakeholders in developing B2C applications. Oftentimes, these professionals work for different organizations with different histories, cultures, and reward structures. A longitudinal qualitative field study of a B2C application development project was undertaken in order to build an in-depth understanding of the collaborative practices of diverse professionals in ISD projects. The paper proposes that the multi-party collaborative practice can be understood as a &amp;quot;collective reflection-in-action&amp;quot; cycle through which an IS design emerges as a result of agents producing, sharing, and reflecting upon material objects. Agents from diverse backgrounds exert different influences over emergent designs depending on their organization, profession, and project involvement-based power relations. These power relations shape whether collaborators &amp;quot;add to,&amp;quot; &amp;quot;ignore,&amp;quot; or &amp;quot;challenge&amp;quot; the work produced by others. In turn, agents' actions either reinforce or transform existing power relations depending on who gets to claim credit for and ownership of the emergent design. Implications for the study of boundary objects, team diversity, organizational learning, and contemporary ISD are drawn.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14123">
    <title>Collaborating on Multi-party Information Systems Development Projects: A Collective Reflection-in-Action View</title>
    <link>http://hdl.handle.net/2451/14123</link>
    <description>Title: Collaborating on Multi-party Information Systems Development Projects: A Collective Reflection-in-Action View&lt;br/&gt;&lt;br/&gt;Levina, Natalia&lt;br/&gt;&lt;br/&gt;Abstract: Growth of business-to-consumer (B2C) applications such as electronic storefronts, catalogues, and customer support websites has drawn a great number of diverse stakeholders into the IS Development (ISD) practice. Marketing, strategy, and graphic design specialists have joined a variety of technical professionals and business stakeholders in developing B2C applications. Oftentimes, these professionals work for different organizations with different histories, cultures, and reward structures. A longitudinal qualitative field study of a B2C application development project was undertaken in order to build an in-depth understanding of the collaborative practices of diverse professionals in ISD projects. The paper proposes that the multi-party collaborative practice can be understood as a &amp;quot;collective reflection-in-action&amp;quot; cycle through which an IS design emerges as a result of agents producing, sharing, and reflecting upon material objects. Agents from diverse backgrounds exert different influences over emergent designs depending on their organization, profession, and project involvement-based power relations. These power relations shape whether collaborators &amp;quot;add to,&amp;quot; &amp;quot;ignore,&amp;quot; or &amp;quot;challenge&amp;quot; the work produced by others. In turn, agents' actions either reinforce or transform existing power relations depending on who gets to claim credit for and ownership of the emergent design. Implications for the study of boundary objects, team diversity, organizational learning, and contemporary ISD are drawn.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14759">
    <title>Classification-Aware Hidden-Web Text Database Selection</title>
    <link>http://hdl.handle.net/2451/14759</link>
    <description>Title: Classification-Aware Hidden-Web Text Database Selection&lt;br/&gt;&lt;br/&gt;Ipeirotis, Panagiotis G.; Gravano, Luis&lt;br/&gt;&lt;br/&gt;Abstract: Many valuable text databases on the web have non-crawlable contents thatare ``hidden'' behind search interfaces. Metasearchers are helpful toolsfor searching over multiple such ``hidden-web'' text  databases at oncethrough a unified query interface. An important step in themetasearching process is database selection, or determining whichdatabases are the most relevant for a given user query. Thestate-of-the-art database selection techniques rely on  statisticalsummaries of the database contents, generally including the databasevocabulary and the associated word frequencies. Unfortunately,hidden-web text databases typically do not export such summaries, soprevious research has developed algorithms for constructing approximatecontent summaries from document samples extracted from the databases viaquerying. We present a novel ``focused probing'' sampling algorithm thatdetects the topics covered in a database and adaptively extractsdocuments that are representative of the topic coverage of the database.Our algorithm is the first that constructs content summaries thatinclude the frequencies of the words in the database. Unfortunately,Zipf's law practically guarantees that, for any relatively largedatabase, content summaries built from moderately sized document sampleswill fail to cover many low-frequency words; in turn, incomplete contentsummaries might negatively affect the  database selection process,especially for short queries with infrequent words. To enhance thesparse document samples and improve the database selection decisions, weexploit the fact that topically similar databases tend to have similarvocabularies, so samples extracted from databases with a similar topicalfocus can complement each other. We have developed two databaseselection algorithms that exploit this observation. 
The first algorithmproceeds hierarchically and selects the best category for a query, andthen sends the query to the appropriate databases in the chosencategory. The second  algorithm uses ``shrinkage,'' a statisticaltechnique for improving parameter estimation in the face of sparse data,to enhance the database content summaries with category-specific words.We describe how to modify existing database selection algorithms toadaptively decide --at run-time-- whether shrinkage is beneficial for aquery. A thorough evaluation over a variety of databases, including 315real web databases as well as TREC data, suggests that the proposedsampling methods generate high-quality content summaries and thedatabase selection algorithms produce significantly more relevantdatabase selection decisions and overall search results than existing algorithms.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14122">
    <title>Classification in Networked Data:  A Toolkit and a Univariate Case Study</title>
    <link>http://hdl.handle.net/2451/14122</link>
    <description>Title: Classification in Networked Data:  A Toolkit and a Univariate Case Study&lt;br/&gt;&lt;br/&gt;Macskassy, Sofus; Provost, Foster&lt;br/&gt;&lt;br/&gt;Abstract: This paper presents NetKit, a modular toolkit for classification in networked data, and a case-study of its application to a collection of networked data sets used in prior machine learning research. Networked data are relational data where entities are interconnected, and this paper considers the common case where entities whose labels are to be estimated are linked to entities for which the label is known. NetKit is based on a three-component framework, comprising a local classifier, a relational classifier, and a collective inference procedure. Various existing relational learning algorithms can be instantiated with appropriate choices for these three components and new relational learning algorithms can be composed by new combinations of components. The case study demonstrates how the toolkit facilitates comparison of different learning methods (which so far has been lacking in machine learning research). It also shows how the modular framework allows analysis of subcomponents, to assess which, whether, and when particular components contribute to superior performance. The case study focuses on the simple but important special case of univariate network classification, for which the only information available is the structure of class linkage in the network (i.e., only links and some class labels are available). To our knowledge, no work previously has evaluated systematically the power of class-linkage alone for classification in machine learning benchmark data sets. The results demonstrate clearly that simple network-classification models perform remarkably well&amp;mdash;well enough that they should be used regularly as baseline classifiers for studies of relational learning for networked data. 
The results also show that there are a small number of component combinations that excel, and that different components are preferable in different situations, for example when few versus many labels are known.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14110">
    <title>Building and Querying Large Modelbases</title>
    <link>http://hdl.handle.net/2451/14110</link>
    <description>Title: Building and Querying Large Modelbases&lt;br/&gt;&lt;br/&gt;Tuzhilin, Alexander; Liu, Bing; Hu, Jie&lt;br/&gt;&lt;br/&gt;Abstract: Model building is one of the most important objectives of data miningand data analysis. As many data mining applications, such aspersonalization, bioinformatics and some large enterprise-wide businessapplications, become increasingly complex and require a very largenumber of models, it is becoming progressively more difficult for dataanalysts to built and to manage a large number of models in theseapplications on their own. Therefore, development of software toolshelping data analysts in these tasks is becoming a pressing issue. Thispaper presents a model management system supporting various types ofdata mining models. It describes how to build and populate largeheterogeneous modelbases. It also presents a query language for queryingthese modelbases and examines performance results for some of the queries.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14127">
    <title>Building an Effective Representation for Dynamic Networks</title>
    <link>http://hdl.handle.net/2451/14127</link>
    <description>Title: Building an Effective Representation for Dynamic Networks&lt;br/&gt;&lt;br/&gt;Hill, Shawndra; Agarwal, Deepak; Bell, Robert; Volinsky, Chris&lt;br/&gt;&lt;br/&gt;Abstract: A dynamic network is a special type of network which is comprised of connected transactors which have repeated evolving interaction. Data on large dynamic networks such as telecommunications networks and the Internet are pervasive. However, representing dynamic networks in a manner that is conducive to efficient large-scale analysis is a challenge. In this paper, we represent dynamic graphs using a data structure introduced by Cortes et al. [Q]. We advocate their representation because it accounts for the evolution of relationships between transactors through time, mitigates noise at the local transactor level, and allows for the removal of stale relationships. Our work improves on their heuristic arguments by formalizing the representation with three tunable parameters. In doing this, we develop a generic framework for evaluating and tuning any dynamic graph. We show that the storage saving approximations involved in the representation do not affect predictive performance, and typically improve it. We motivate our approach using a fraud detection example from the telecommunications industry, and demonstrate that we can outperform published results on the fraud detection task. In addition, we present preliminary analysis on web logs and email networks.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/28065">
    <title>Bayesian Learning in Social Networks</title>
    <link>http://hdl.handle.net/2451/28065</link>
    <description>Title: Bayesian Learning in Social Networks&lt;br/&gt;&lt;br/&gt;Lobel, Ilan; Dahleh, Munther; Acemoglu, Daron; Ozdaglar, Asuman&lt;br/&gt;&lt;br/&gt;Abstract: We study the (perfect Bayesian) equilibrium of a model of learning over a general social network. Each individual receives a signal about the underlying state of the world, observes the past actions of a stochastically-generated neighborhood of individuals, and chooses one of two possible actions. The stochastic process generating the neighborhoods defines the network topology (social network). The special case where each individual observes all past actions has been widely studied in the literature. We characterize pure-strategy equilibria for arbitrary stochastic and deterministic social networks and characterize the conditions under which there will be asymptotic learning—that is, the conditions under which, as the social network becomes large, individuals converge (in probability) to taking the right action. We show that when private beliefs are unbounded (meaning that the implied likelihood ratios are unbounded), there will be asymptotic learning as long as there is some minimal amount of &amp;quot;expansion in observations&amp;quot;. Our main theorem shows that when the probability that each individual observes some other individual from the recent past converges to one as the social network becomes large, unbounded private beliefs are sufficient to ensure asymptotic learning. This theorem therefore establishes that, with unbounded private beliefs, there will be asymptotic learning in almost all reasonable social networks. We also show that for most network topologies, when private beliefs are bounded, there will not be asymptotic learning. In addition, in contrast to the special case where all past actions are observed, asymptotic learning is possible even with bounded beliefs in certain stochastic network topologies.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14152">
    <title>Bayesian Analysis and Model Revision for k'th Order Markov Chains with Unknown k.</title>
    <link>http://hdl.handle.net/2451/14152</link>
    <description>Title: Bayesian Analysis and Model Revision for k'th Order Markov Chains with Unknown k.&lt;br/&gt;&lt;br/&gt;Radner, Roy&lt;br/&gt;&lt;br/&gt;Abstract: mass 1 concentrated on the true process, provided that the prior probability measure has full support and the true process is irreducible. Second, I extend this result to the case in which k is unbounded (but finite), which requires that the Bayesian decisionmaker (DM) construct a prior on an infinite-dimensional parameter space. Finally, in an alternative approach to this case, I suppose that the DM considers a succession of models corresponding to larger and larger values of k. Each time the DM revises his model he extends his prior probability measure to the new - and larger - parameter space in a way that is &amp;quot;consistent&amp;quot; with the previous prior, and recomputes his posterior probability measures. I show that, roughly speaking, if the DM does not revise his model &amp;quot;too frequently,&amp;quot; then he will be increasingly confident that the current posterior is increasingly concentrated on the true process. I motivate the procedure of model revision by considerations of bounded rationality.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14749">
    <title>Are Digital Rights Valuable? Theory and Evidence from eBook Pricing</title>
    <link>http://hdl.handle.net/2451/14749</link>
    <description>Title: Are Digital Rights Valuable? Theory and Evidence from eBook Pricing&lt;br/&gt;&lt;br/&gt;Oestreicher-Singer, Gal; Sundararajan, Arun&lt;br/&gt;&lt;br/&gt;Abstract: The effective management of digital rights is the central challenge in many industries making the transition from physical to digital products. We present a new model that characterizes the value of these digital rights when products are sold both embedded in tangible physical artifacts, and as pure digital goods, and when granting rights permitted by one's digital rights management (DRM) platform may affect the extent of digital piracy. Our model indicates that in the absence of piracy, digital rights should be unrestricted, since a seller can use its pricing strategy to optimally balance sales between physical and digital goods. However, the threat of piracy limits the extent to which digital rights should be granted: the value of digital rights is determined not only by their direct effect on the quality of legal digital goods, but by a differential piracy effect that can lower a seller's pricing power. When the latter effect is sufficiently high, granting digital rights can have a detrimental effect on value — our model indicates that this kind of effect is more likely to be observed for digital rights that aim to replicate the consumption experience of physical goods, rather than enhancing a customer's digital experience. We test the predictions of our analytical model using data from the ebook industry. Our empirical evidence supports our theoretical results, showing that four separate digital rights each have an economically significant impact on ebook prices, and establishing that the digital rights which aim to replicate physical consumption while increasing the threat of piracy are the ones that have negative impact on seller value. 
We also show that if the pricing of a digital good is keyed off that of an existing tangible good, optimal pricing changes for the former should be more nuanced, rather than simply mirroring changes in the price of the latter, and we discuss the effect of the technological sophistication of potential customers on optimal pricing and rights management. Our results represent new evidence of the importance of an informed and judicious choice of the different digital rights granted by a DRM platform, and provide a new framework for guiding managers in industries that are progressively being digitized.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/29801">
    <title>Analyzing the Amazon Mechanical Turk Marketplace</title>
    <link>http://hdl.handle.net/2451/29801</link>
    <description>Title: Analyzing the Amazon Mechanical Turk Marketplace&lt;br/&gt;&lt;br/&gt;Ipeirotis, Panagiotis G.&lt;br/&gt;&lt;br/&gt;Abstract: Since the concept of crowdsourcing is relatively new, many potential participants have questions about the AMT marketplace. For example, a common set of questions that pop up in an 'introduction to crowdsourcing and AMT' session are the following:  What type of tasks can be completed in the marketplace?  How much does it cost?  How fast can I get results back? How big is the AMT marketplace?  The answers for these questions remain largely anecdotal and based on personal observations and experiences. To understand better what types of tasks are being completed today using crowdsourcing techniques, we started collecting data about the AMT marketplace. We present a preliminary analysis of the dataset and provide directions for interesting future research.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14120">
    <title>Active Learning for Decision Making</title>
    <link>http://hdl.handle.net/2451/14120</link>
    <description>Title: Active Learning for Decision Making&lt;br/&gt;&lt;br/&gt;Saar-Tsechansky, Maytal; Provost, Foster&lt;br/&gt;&lt;br/&gt;Abstract: This paper addresses focused information acquisition for predictive data mining. As businesses strive to cater to the preferences of individual consumers, they often employ predictive models to customize marketing efforts. Building accurate models requires information about consumer preferences that often is costly to acquire. Prior research has introduced many &amp;quot;active learning&amp;quot; policies for identifying information that is particularly useful for model induction, the goal being to reduce the acquisition cost necessary to induce a model with a given accuracy. However, predictive models often are used as part of a decision-making process, and costly improvements in model accuracy do not always result in better decisions. This paper develops a new approach for active information acquisition that targets decision-making specifically. The method we introduce departs from the traditional error-reducing paradigm and places emphasis on acquisitions that are more likely to affect decision-making. Empirical evaluations with direct marketing data demonstrate that for a fixed information acquisition cost the method significantly improves the targeting decisions. The method is designed to be generic—not based on a single model or induction algorithm—and we show that it can be applied effectively to various predictive modeling techniques.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14118">
    <title>ACORA: Distribution-Based Aggregation for Relational Learning from Identifier Attributes</title>
    <link>http://hdl.handle.net/2451/14118</link>
    <description>Title: ACORA: Distribution-Based Aggregation for Relational Learning from Identifier Attributes&lt;br/&gt;&lt;br/&gt;Perlich, Claudia; Provost, Foster&lt;br/&gt;&lt;br/&gt;Abstract: Feature construction through aggregation plays an essential role in modeling relational domains with one-to-many relationships between tables. One-to-many relationships lead to bags (multisets) of related entities, from which predictive information must be captured. This paper focuses on aggregation from categorical attributes that can take many values (e.g., object identifiers). We present a novel aggregation method as part of a relational learning system ACORA, that combines the use of vector distance and meta-data about the class-conditional distributions of attribute values. We provide a theoretical foundation for this approach deriving a &amp;quot;relational fixed-effect&amp;quot; model within a Bayesian framework, and discuss the implications of identifier aggregation on the expressive power of the induced model. One advantage of using identifier attributes is the circumvention of limitations caused either by missing/unobserved object properties or by independence assumptions. Finally, we show empirically that the novel aggregators can generalize in the presence of identifier (and other high-dimensional) attributes, and also explore the limitations of the applicability of the methods.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14810">
    <title>A Strategic Analysis of Information Sharing Among Cyber Attackers</title>
    <link>http://hdl.handle.net/2451/14810</link>
    <description>Title: A Strategic Analysis of Information Sharing Among Cyber Attackers&lt;br/&gt;&lt;br/&gt;Ghose, Anindya; Hausken, Kjell&lt;br/&gt;&lt;br/&gt;Abstract: One firm invests in security to defend against cyber attacks by two hackers. Each hacker chooses an optimal attack, and they share information with each other about the firm's vulnerabilities. Each hacker prefers to receive information, but delivering gives competitive advantage to the other hacker. We find that each hacker's attack and information sharing are strategic complements while one hacker's attack and the other hacker's information sharing are strategic substitutes. The attack is inverse U-shaped in the firm's unit defense cost, and reaches zero, while the firm's defense and profit decrease, and the hackers' information sharing and profit increase. The firm's profit increases in the hackers' unit cost of attack, while the hackers' information sharing and profit decrease. Our analysis also reveals the interesting result that the cumulative attack level of the hackers is not affected by the effectiveness of information sharing between them and moreover, is also unaffected by the intensity of joint information sharing. We also find that as the effectiveness of information sharing between hackers increases relative to the investment in attack, the firm's investment in cyber security defense and profit are constant, the hackers' investments in attacks decrease, and information sharing levels and hacker profits increase. In contrast, as the intensity of joint information sharing increases, while the firm's investment in cyber security defense and profit remain constant, the hackers' investments in attacks increase, and the hackers' information sharing levels and profits decrease. Increasing the firm's asset causes all the variables to increase linearly, except information sharing which is constant. 
We extend our analysis to endogenize the firm's asset and this analysis largely confirms the preceding analysis with a fixed asset.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/25886">
    <title>A Quality-Aware Optimizer for Information Extraction</title>
    <link>http://hdl.handle.net/2451/25886</link>
    <description>Title: A Quality-Aware Optimizer for Information Extraction&lt;br/&gt;&lt;br/&gt;Jain, Alpa; Ipeirotis, Panagiotis G.&lt;br/&gt;&lt;br/&gt;Abstract: Large amounts of structured information is buried in unstructured text. Information extraction systems can extract structured relations from the documents and enable sophisticated, SQL-like queries over unstructured text. Information extraction systems are not perfect and their output has imperfect precision and recall (i.e., contains spurious tuples and misses good tuples). Typically, an extraction system has a set of parameters that can be used as &amp;quot;knobs&amp;quot; and tune the system to be either precision- or recall-oriented. Furthermore, the choice of documents processed by the extraction system also affects the quality of the extracted relation. So far, estimating the output quality of an information extraction task was an ad-hoc procedure, based mainly on heuristics. In this paper, we show how to use receiver operating characteristic (ROC) curves to estimate the extraction quality in a statistically robust way and show how to use ROC analysis to select the extraction parameters in a principled manner. Furthermore, we present analytic models that reveal how different document retrieval strategies affect the quality of the extracted relation. Finally, we present our maximum likelihood approach for estimating—on the fly—the parameters required by our analytic models to predict the run time and the output quality of each execution plan. Our experimental evaluation demonstrates that our optimization approach predicts accurately the output quality and selects the fastest execution plan that satisfies the output quality restrictions.</description>
  </item>
  <item rdf:about="http://hdl.handle.net/2451/14809">
    <title>A Multi-Level Examination of the Impact of Social Identities on Economic Transactions in Electronic Markets</title>
    <link>http://hdl.handle.net/2451/14809</link>
    <description>Title: A Multi-Level Examination of the Impact of Social Identities on Economic Transactions in Electronic Markets&lt;br/&gt;&lt;br/&gt;Forman, Chris; Ghose, Anindya; Wiesenfeld, Batia&lt;br/&gt;&lt;br/&gt;Abstract: Three of the most important uses of the Internet today are as an economic marketplace, as a forum for social interaction, and as a source of information. In this paper, we explore how these three activities come together, in the form of emergent social communities built around information exchanges within IT-enabled electronic marketplaces. Drawing on social identity theory, we suggest that the relationship between online consumer reviews and internet product sales is partially explained by social identity processes. Using a unique dataset based on both chronologically compiled ratings as well as reviewer characteristics for a given set of products and geographical location-based purchasing behavior from Amazon, we provide evidence at the community level linking the prevalence of identity claiming behavior in an online community with subsequent product sales. In addition, we show that when reviewers claim to be from a particular geographic location, subsequent product sales are higher in that region. At the review level of analysis, we show that subsequent reviews conform to identity-claiming norms set in previous reviews, and that identity claiming that conforms to community norms elicits identity granting. Furthermore, our results suggest that the prevalence of identity granting has implications for economic exchange in the form of product sales. Implications for research on word-of-mouth and electronic communities are discussed.</description>
  </item>
</rdf:RDF>

