estimate_unique_knowledge.py
"""
Estimate the unique knowledge encpsulated in a single FileData line.
Currently uses a sequential strategy like so:
for each revision in the log:
* take the difference of the added and deleted lines of that revision.
* if the difference is positive, add those lines as knowledge for the
dev that authored the revision.
* if the difference is negative, take the pct of the entire knowledge
currently in the file represented by the number of negative lines
and destroy that knowledge proportionally among all
knowledge-holders.
* take the min of the added and deleted lines. This represents the
'churn'--lines that have changed. Create (knowledge_churn_constant
* churn) new lines worth of knowledge and assign it to the dev who
authored the revision. Take (churn - new_knowledge) lines of
knowledge proportionally from all knowledge that dev doesn't share
and move it to a shared account.
"""
import sys
import math
import copy
from optparse import OptionParser
from common import FileData

def sequential_create_knowledge(dev_uniq, dev, adjustment):
    """
    Create adjustment lines of knowledge in dev's account.

    dev is the account key for the developers who share this knowledge
    (there may be only one).
    """
    if dev not in dev_uniq:
        dev_uniq[dev] = 0
    dev_uniq[dev] += adjustment
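
# Illustrative usage (not in the original file): starting from d = {},
# sequential_create_knowledge(d, 'alice', 5) leaves d == {'alice': 5};
# a second call with adjustment 3 leaves d == {'alice': 8}.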

def sequential_destroy_knowledge(adjustment, tot_knowledge, dev_uniq):
    """
    Find the percentage of tot_knowledge the adjustment represents and
    destroy that percent of the knowledge in all knowledge accounts.

    If tot_knowledge is 0, destroys nothing by definition.
    """
    pct_to_destroy = 0
    if tot_knowledge:
        pct_to_destroy = abs(float(adjustment)) / float(tot_knowledge)
    for devs in dev_uniq:
        k = dev_uniq[devs]
        k -= k * pct_to_destroy
        dev_uniq[devs] = k
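
# Illustrative usage: with tot_knowledge = 10 and adjustment = -2, every
# account is scaled by 1 - 2/10 = 0.8, so {'alice': 5, 'bob': 5} becomes
# {'alice': 4.0, 'bob': 4.0}.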

def sequential_share_knowledge_group(dev, shared_key_exploded, pct_to_share, dev_uniq):
    """
    Share pct_to_share of the knowledge in the account named by
    shared_key_exploded (which dev doesn't belong to) into the
    corresponding account that also includes dev.
    """
    old_shared_key = '\0'.join(shared_key_exploded)
    shared_key_exploded.append(dev)
    # make sure the dev names stay alphabetical
    shared_key_exploded.sort()
    new_shared_key = '\0'.join(shared_key_exploded)
    group_knowledge = dev_uniq[old_shared_key]
    amt_to_share = float(pct_to_share) * float(group_knowledge)
    dev_uniq[old_shared_key] -= amt_to_share
    if new_shared_key not in dev_uniq:
        dev_uniq[new_shared_key] = 0
    dev_uniq[new_shared_key] += amt_to_share
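
# Illustrative usage: with dev_uniq == {'alice': 10}, sharing pct_to_share
# = 0.5 of the ['alice'] account with dev 'bob' moves 5.0 lines from key
# 'alice' to key 'alice\0bob' (dev names within a key are kept sorted).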

def sequential_distribute_shared_knowledge(dev, shared_knowledge, tot_knowledge, dev_uniq):
    """
    Share the percent of knowledge represented by shared_knowledge of
    tot_knowledge from all accounts that dev doesn't belong to into
    (possibly new) accounts that dev does belong to.
    """
    pct_to_share = 0
    if tot_knowledge:
        pct_to_share = float(shared_knowledge) / float(tot_knowledge)
    # iterate over a snapshot of the keys, since sharing can add new accounts
    for shared_key in list(dev_uniq.keys()):
        shared_key_exploded = shared_key.split('\0')
        if dev not in shared_key_exploded:
            sequential_share_knowledge_group(dev, shared_key_exploded, pct_to_share, dev_uniq)
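
# Illustrative usage: with dev = 'bob', shared_knowledge = 18 and
# tot_knowledge = 100, every account not already containing bob gives up
# 18% of its knowledge to the matching account that includes bob (see the
# worked example in the module comments).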

def sequential_estimate_uniq(fd, knowledge_churn_constant):
    """
    Estimate the amounts of unique knowledge for each developer who
    has made changes to the path represented by this FileData, using a
    knowledge_churn_constant indicating what pct of churned lines to
    treat as new knowledge.

    Returns a tuple of ([([dev1, dev2...], knowledge), ...], tot_knowledge),
    the list indicating the knowledge shared uniquely by each group of
    devs (there may be only one dev in a group, or many).
    """
    tot_knowledge = 0
    dev_uniq = {}
    for dev, added, deleted in fd.dev_experience:
        adjustment = added - deleted
        if adjustment > 0:
            sequential_create_knowledge(dev_uniq, dev, adjustment)
        elif adjustment < 0:
            sequential_destroy_knowledge(adjustment, tot_knowledge, dev_uniq)
        churn = min(added, deleted)
        if churn != 0:
            new_knowledge = float(churn) * knowledge_churn_constant
            shared_knowledge = float(churn) - new_knowledge
            sequential_distribute_shared_knowledge(dev, shared_knowledge, tot_knowledge, dev_uniq)
            sequential_create_knowledge(dev_uniq, dev, new_knowledge)
        tot_knowledge += adjustment + (churn * knowledge_churn_constant)
    dev_uniq = [(shared_key.split('\0'), shared) for shared_key, shared in dev_uniq.items()]
    return dev_uniq, int(tot_knowledge)
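
# Illustrative return value (numbers from the worked example in the module
# comments): ([(['alice'], 73.8), (['alice', 'bob'], 16.2), (['bob'], 2.0)], 92)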

def sequential(lines, model_args):
    """
    Entry point for the sequential algorithm.

    See the description in the module docstring.

    Yields FileData objects as tsv lines, with the dev_uniq and
    tot_knowledge fields filled in.
    """
    knowledge_churn_constant = float(model_args[0])
    for line in lines:
        fd = FileData(line)
        dev_uniq, tot_knowledge = sequential_estimate_uniq(fd, knowledge_churn_constant)
        fd.dev_uniq = dev_uniq
        fd.tot_knowledge = tot_knowledge
        yield fd.as_line()
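
# Illustrative usage (assumes a tsv file of FileData lines as produced by
# the earlier stages of the git_by_a_bus pipeline; 'filedata.tsv' is a
# hypothetical name):
#
#   for out_line in sequential(open('filedata.tsv'), ['0.1']):
#       print out_line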

if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option('--model', dest='model',
                      metavar='MODEL[:MARG1[:MARG2]...]',
                      default='sequential:0.1',
                      help='Knowledge model to use, with arguments.')
    options, args = parser.parse_args()
    model = options.model.split(':')
    model_func = locals()[model[0]]
    model_args = model[1:]
    for line in model_func(sys.stdin, model_args):
        print line
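
# Typical invocation (hypothetical file name; the script reads FileData tsv
# lines from stdin and writes augmented tsv lines to stdout):
#
#   python estimate_unique_knowledge.py --model sequential:0.2 < filedata.tsv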