diff --git a/edx/analytics/tasks/insights/user_activity.py b/edx/analytics/tasks/insights/user_activity.py index a81b946432..5c2e6defa0 100644 --- a/edx/analytics/tasks/insights/user_activity.py +++ b/edx/analytics/tasks/insights/user_activity.py @@ -45,8 +45,10 @@ def mapper(self, line): return event, date_string = value - username = event.get('username', '').strip() - if not username: + user_id = event.get('context', {}).get('user_id') + if not user_id: + self.incr_counter('UserActivity', 'Discard Missing User ID', 1) + log.error("User-Activity: event without user_id in context: %s", event) return course_id = eventlog.get_course_id(event) @@ -54,7 +56,7 @@ def mapper(self, line): return for label in self.get_predicate_labels(event): - yield date_string, self._encode_tuple((course_id, username, date_string, label)) + yield date_string, self._encode_tuple((str(user_id), course_id, date_string, label)) def get_predicate_labels(self, event): """Creates labels by applying hardcoded predicates to a single event.""" @@ -111,15 +113,15 @@ def multi_output_reducer(self, _date_string, values, output_file): counter = Counter(values) for key, num_events in counter.iteritems(): - course_id, username, date_string, label = key - value = (course_id, username, date_string, label, num_events) + user_id, course_id, date_string, label = key + value = (user_id, course_id, date_string, label, num_events) output_file.write('\t'.join([str(field) for field in value])) output_file.write('\n') def output_path_for_key(self, key): date_string = key return url_path_join( - self.hive_partition_path('user_activity', date_string), + self.hive_partition_path('user_activity_by_user', date_string), 'user_activity_{date}'.format( date=date_string, ) @@ -203,17 +205,17 @@ def spark_job(self, *args): df = df.filter( (df['event_source'] != 'task') & ~ df['event_type'].startswith('edx.course.enrollment.') & - (df['username'] != '') + (df['context.user_id'] != '') ) # passing complete row to UDF df = df.withColumn('all_labels', get_labels(df['event_type'], df['event_source'])) \ .withColumn('course_id', get_courseid(df['context'])) df = df.filter(df['course_id'] != '') # remove rows with empty course_id df = df.withColumn('label', explode(split(df['all_labels'], ','))) - result = df.select('course_id', 'username', 'event_date', 'label') \ - .groupBy('course_id', 'username', 'event_date', 'label').count() + result = df.select('context.user_id', 'course_id', 'event_date', 'label') \ + .groupBy('user_id', 'course_id', 'event_date', 'label').count() result = result.withColumn('dt', lit(result['event_date'])) # generate extra column for partitioning - result.coalesce(1).write.partitionBy('dt').csv(self.output_dir().path, mode='append', sep='\t') + result.coalesce(4).write.partitionBy('dt').csv(self.output_dir().path, mode='append', sep='\t') class UserActivityDownstreamMixin(WarehouseMixin, EventLogSelectionDownstreamMixin, MapReduceJobTaskMixin): @@ -256,7 +258,7 @@ def query(self): @property def table(self): - return 'user_activity' + return 'user_activity_by_user' @property def partition_by(self): @@ -265,8 +267,8 @@ def partition_by(self): @property def columns(self): return [ + ('user_id', 'INT'), ('course_id', 'STRING'), - ('username', 'STRING'), ('date', 'STRING'), ('category', 'STRING'), ('count', 'INT'), @@ -314,7 +316,7 @@ def requires(self): def user_activity_hive_table_path(self, *args): return url_path_join( self.warehouse_path, - 'user_activity' + 'user_activity_by_user' ) def calendar_hive_table_path(self, *args): @@ -329,8 +331,8 @@ def calendar_hive_table_path(self, *args): def get_user_activity_table_schema(self): from pyspark.sql.types import StructType, StringType - schema = StructType().add("course_id", StringType(), True) \ - .add("username", StringType(), True) \ + schema = StructType().add("user_id", StringType(), True) \ + .add("course_id", StringType(), True) \ .add("date", StringType(), True) \ .add("category", StringType(), True) \ .add("count", StringType(), True) \ @@ -361,7 +363,7 @@ def spark_job(self, *args): sep='\t', schema=self.get_calendar_table_schema() ) - user_activity_df.createOrReplaceTempView('user_activity') + user_activity_df.createOrReplaceTempView('user_activity_by_user') calendar_df.createOrReplaceTempView('calendar') query = """ SELECT @@ -369,8 +371,8 @@ def spark_job(self, *args): CONCAT(cal.iso_week_start, " 00:00:00") as interval_start, CONCAT(cal.iso_week_end, " 00:00:00") as interval_end, act.category as label, - COUNT (DISTINCT username) as count - FROM user_activity act + COUNT (DISTINCT user_id) as count + FROM user_activity_by_user act JOIN calendar cal ON act.date = cal.date AND act.dt >= "{interval_start}" AND act.dt < "{interval_end}" WHERE @@ -445,8 +447,8 @@ def query(self): CONCAT(cal.iso_week_start, ' 00:00:00') as interval_start, CONCAT(cal.iso_week_end, ' 00:00:00') as interval_end, act.category as label, - COUNT(DISTINCT username) as count - FROM user_activity act + COUNT(DISTINCT user_id) as count + FROM user_activity_by_user act JOIN calendar cal ON act.`date` = cal.`date` AND act.dt >= "{interval_start}" AND act.dt < "{interval_end}" WHERE diff --git a/edx/analytics/tasks/tests/acceptance/fixtures/input/user_activity_tracking.log b/edx/analytics/tasks/tests/acceptance/fixtures/input/user_activity_tracking.log index 0d03bd1fed..b62c49f5c4 100644 --- a/edx/analytics/tasks/tests/acceptance/fixtures/input/user_activity_tracking.log +++ b/edx/analytics/tasks/tests/acceptance/fixtures/input/user_activity_tracking.log @@ -81,7 +81,7 @@ {"username": "staff", "host": "example.m.sandbox.edx.org", "event_source": "server", "event_type": "/courses/edX/Open_DemoX/edx_demo_course/xblock/i4x:;_;_edX;_Open_DemoX;_problem;_Sample_Algebraic_Problem/handler/xmodule_handler/problem_get", "context": {"username": "staff", "course_user_tags": {}, "user_id": 4, "ip": "127.0.0.1", "org_id": "edX", "agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36", "host": "example.m.sandbox.edx.org", "session": "4b7ab990d449aa6958ffd5e0ac7ee3f7", "course_id": "edX/Open_DemoX/edx_demo_course", "path": "/courses/edX/Open_DemoX/edx_demo_course/xblock/i4x:;_;_edX;_Open_DemoX;_problem;_Sample_Algebraic_Problem/handler/xmodule_handler/problem_get"}, "time": "2014-06-19T17:52:54.803303+00:00", "ip": "127.0.0.1", "event": "{\"POST\": {}, \"GET\": {}}", "agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36", "page": null} {"username": "staff", "host": "example.m.sandbox.edx.org", "event_source": "server", "event_type": "/courses/edX/Open_DemoX/edx_demo_course/xblock/i4x:;_;_edX;_Open_DemoX;_problem;_Sample_Algebraic_Problem/handler/xmodule_handler/problem_check", "context": {"username": "staff", "course_user_tags": {}, "user_id": 4, "ip": "127.0.0.1", "org_id": "edX", "agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36", "host": "example.m.sandbox.edx.org", "session": "4b7ab990d449aa6958ffd5e0ac7ee3f7", "course_id": "edX/Open_DemoX/edx_demo_course", "path": "/courses/edX/Open_DemoX/edx_demo_course/xblock/i4x:;_;_edX;_Open_DemoX;_problem;_Sample_Algebraic_Problem/handler/xmodule_handler/problem_check"}, "time": "2014-06-19T18:27:00.474818+00:00", "ip": "127.0.0.1", "event": "{\"POST\": {\"input_i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1_dynamath\": [\"\\r\\n \\r\\n A\\r\\n \\r\\n \\r\\n x\\r\\n 2\\r\\n \\r\\n +\\r\\n \\r\\n \\r\\n y\\r\\n \\r\\n \\r\\n \\r\\n\"], \"input_i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1\": [\"A*x^2 + sqrt(y)\"]}, \"GET\":", "agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36", "page": null} {"username": "staff", "event_type": "problem_check", "ip": "127.0.0.1", "agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36", "host": "example.m.sandbox.edx.org", "session": "4b7ab990d449aa6958ffd5e0ac7ee3f7", "event": "\"input_i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1=A*x%5E2+%2B+sqrt(y)&input_i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1_dynamath=%3Cmath+xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F1998%2FMath%2FMathML%22%3E%0D%0A++%3Cmstyle+displaystyle%3D%22true%22%3E%0D%0A++++%3Cmi%3EA%3C%2Fmi%3E%0D%0A++++%3Cmo%3E%26%23x22C5%3B%3C%2Fmo%3E%0D%0A++++%3Cmsup%3E%0D%0A++++++%3Cmi%3Ex%3C%2Fmi%3E%0D%0A++++++%3Cmn%3E2%3C%2Fmn%3E%0D%0A++++%3C%2Fmsup%3E%0D%0A++++%3Cmo%3E%2B%3C%2Fmo%3E%0D%0A++++%3Cmsqrt%3E%0D%0A++++++%3Cmrow%3E%0D%0A++++++++%3Cmi%3Ey%3C%2Fmi%3E%0D%0A++++++%3C%2Fmrow%3E%0D%0A++++%3C%2Fmsqrt%3E%0D%0A++%3C%2Fmstyle%3E%0D%0A%3C%2Fmath%3E\"", "event_source": "browser", "context": {"username": "staff", "user_id": 4, "ip": "127.0.0.1", "org_id": "edX", "agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36", "host": "example.m.sandbox.edx.org", "session": "4b7ab990d449aa6958ffd5e0ac7ee3f7", "course_id": "edX/Open_DemoX/edx_demo_course", "path": "/event"}, "time": "2014-06-19T18:27:00.492394+00:00", "page": "http://example.m.sandbox.edx.org/courses/edX/Open_DemoX/edx_demo_course/courseware/interactive_demonstrations/basic_questions/"} -{"username": "bill", "host": "example.m.sandbox.edx.org", "event_source": "server", "event_type": "problem_check", "context": {"username": "bill", "course_user_tags": {}, "user_id": 4, "ip": "127.0.0.1", "org_id": "edX", "agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36", "host": "example.m.sandbox.edx.org", "session": "4b7ab990d449aa6958ffd5e0ac7ee3f7", "module": {"display_name": "Mathematical Expressions"}, "course_id": "edX/Open_DemoX/edx_demo_course", "path": "/courses/edX/Open_DemoX/edx_demo_course/xblock/i4x:;_;_edX;_Open_DemoX;_problem;_Sample_Algebraic_Problem/handler/xmodule_handler/problem_check"}, "time": "2014-06-19T18:27:01.037362+00:00", "ip": "127.0.0.1", "event": {"submission": {"i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1_dynamath": {"input_type": "", "question": "", "response_type": "", "answer": "\r\n \r\n A\r\n \r\n \r\n x\r\n 2\r\n \r\n +\r\n \r\n \r\n y\r\n \r\n \r\n \r\n", "variant": "", "correct": ""}, "i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1": {"input_type": "textline", "question": "", "response_type": "formularesponse", "answer": "A*x^2 + sqrt(y)", "variant": "", "correct": true}}, "success": "correct", "grade": 1, "correct_map": {"i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1": {"hint": "", "hintmode": null, "correctness": "correct", "npoints": null, "msg": "", "queuestate": null}}, "state": {"student_answers": {}, "seed": 1, "done": null, "correct_map": {}, "input_state": {"i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1": {}}}, "answers": {"i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1_dynamath": "\r\n \r\n A\r\n \r\n \r\n x\r\n 2\r\n \r\n +\r\n \r\n \r\n y\r\n \r\n \r\n \r\n", "i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1": "A*x^2 + sqrt(y)"}, "attempts": 1, "max_grade": 1, "problem_id": "i4x://edX/Open_DemoX/problem/Sample_Algebraic_Problem"}, "agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36", "page": "x_module"} +{"username": "bill", "host": "example.m.sandbox.edx.org", "event_source": "server", "event_type": "problem_check", "context": {"username": "bill", "course_user_tags": {}, "user_id": 5, "ip": "127.0.0.1", "org_id": "edX", "agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36", "host": "example.m.sandbox.edx.org", "session": "4b7ab990d449aa6958ffd5e0ac7ee3f7", "module": {"display_name": "Mathematical Expressions"}, "course_id": "edX/Open_DemoX/edx_demo_course", "path": "/courses/edX/Open_DemoX/edx_demo_course/xblock/i4x:;_;_edX;_Open_DemoX;_problem;_Sample_Algebraic_Problem/handler/xmodule_handler/problem_check"}, "time": "2014-06-19T18:27:01.037362+00:00", "ip": "127.0.0.1", "event": {"submission": {"i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1_dynamath": {"input_type": "", "question": "", "response_type": "", "answer": "\r\n \r\n A\r\n \r\n \r\n x\r\n 2\r\n \r\n +\r\n \r\n \r\n y\r\n \r\n \r\n \r\n", "variant": "", "correct": ""}, "i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1": {"input_type": "textline", "question": "", "response_type": "formularesponse", "answer": "A*x^2 + sqrt(y)", "variant": "", "correct": true}}, "success": "correct", "grade": 1, "correct_map": {"i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1": {"hint": "", "hintmode": null, "correctness": "correct", "npoints": null, "msg": "", "queuestate": null}}, "state": {"student_answers": {}, "seed": 1, "done": null, "correct_map": {}, "input_state": {"i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1": {}}}, "answers": {"i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1_dynamath": "\r\n \r\n A\r\n \r\n \r\n x\r\n 2\r\n \r\n +\r\n \r\n \r\n y\r\n \r\n \r\n \r\n", "i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1": "A*x^2 + sqrt(y)"}, "attempts": 1, "max_grade": 1, "problem_id": "i4x://edX/Open_DemoX/problem/Sample_Algebraic_Problem"}, "agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36", "page": "x_module"} {"username": "staff", "event_type": "problem_graded", "ip": "127.0.0.1", "agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36", "host": "example.m.sandbox.edx.org", "session": "4b7ab990d449aa6958ffd5e0ac7ee3f7", "event": "[\"input_i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1=A*x%5E2+%2B+sqrt(y)&input_i4x-edX-Open_DemoX-problem-Sample_Algebraic_Problem_2_1_dynamath=%3Cmath+xmlns%3D%22http%3A%2F%2Fwww.w3.org%2F1998%2FMath%2FMathML%22%3E%0D%0A++%3Cmstyle+displaystyle%3D%22true%22%3E%0D%0A++++%3Cmi%3EA%3C%2Fmi%3E%0D%0A++++%3Cmo%3E%26%23x22C5%3B%3C%2Fmo%3E%0D%0A++++%3Cmsup%3E%0D%0A++++++%3Cmi%3Ex%3C%2Fmi%3E%0D%0A++++++%3Cmn%3E2%3C%2Fmn%3E%0D%0A++++%3C%2Fmsup%3E%0D%0A++++%3Cmo%3E%2B%3C%2Fmo%3E%0D%0A++++%3Cmsqrt%3E%0D%0A++++++%3Cmrow%3E%0D%0A++++++++%3Cmi%3Ey%3C%2Fmi%3E%0D%0A++++++%3C%2Fmrow%3E%0D%0A++++%3C%2Fmsqrt%3E%0D%0A++%3C%2Fmstyle%3E%0D%0A%3C%2Fmath%3E\",\"\\n\\n\\n

\\n Mathematical Expressions\\n

\\n\\n
\\n\\n
\\n

Some edX courses ask you to enter an algebraic expression as an answer. Try entering the following algebraic expression in the box below. It’s easier than it looks.

\\\\(A \\\\cdot x^2 + \\\\sqrt{y}\\\\) \\n

\\nThe entry is case sensitive. The product must be indicated with an asterisk, and the exponentiation with a caret, so you would write \\n\\\"A*x^2 + sqrt(y)\\\".

\\n A*x^2 + sqrt(y)\\n -\\n correct\\n

`{::}`