start-ver=1.4 cd-journal=joma no-vol=E103.D cd-vols= no-issue=10 article-no= start-page=2094 end-page=2103 dt-received= dt-revised= dt-accepted= dt-pub-year=2020 dt-pub=20201001 dt-online= en-article= kn-article= en-subject= kn-subject= en-title= kn-title=Empirical Evaluation of Mimic Software Project Data Sets for Software Effort Estimation en-subtitle= kn-subtitle= en-abstract= kn-abstract=To conduct empirical research on industry software development, it is necessary to obtain data of real software projects from industry. However, only a few such industry data sets are publicly available, and unfortunately, most of them are very old. In addition, most of today's software companies cannot make their data open, because software development involves many stakeholders, and thus data confidentiality must be strictly preserved. To that end, this study proposes a method for artificially generating a “mimic” software project data set whose characteristics (such as the mean, standard deviation, and correlation coefficients) are very similar to those of a given confidential data set. Instead of using the original (confidential) data set, researchers are expected to use the mimic data set to produce results similar to those obtained from the original. The proposed method uses the Box-Muller transform for generating normally distributed random numbers, and exponential transformation and number reordering for data mimicry. To evaluate the efficacy of the proposed method, effort estimation is considered as a potential application domain for employing mimic data. Estimation models are built from eight reference data sets and their corresponding mimic data sets. Our experiments confirmed that models built from mimic data sets show effort estimation performance similar to that of models built from the original data sets, which indicates the capability of the proposed method to generate representative samples.
en-copyright= kn-copyright= en-aut-name=GanMaohua en-aut-sei=Gan en-aut-mei=Maohua kn-aut-name= kn-aut-sei= kn-aut-mei= aut-affil-num=1 ORCID= en-aut-name=YücelZeynep en-aut-sei=Yücel en-aut-mei=Zeynep kn-aut-name= kn-aut-sei= kn-aut-mei= aut-affil-num=2 ORCID= en-aut-name=MondenAkito en-aut-sei=Monden en-aut-mei=Akito kn-aut-name= kn-aut-sei= kn-aut-mei= aut-affil-num=3 ORCID= en-aut-name=SasakiKentaro en-aut-sei=Sasaki en-aut-mei=Kentaro kn-aut-name= kn-aut-sei= kn-aut-mei= aut-affil-num=4 ORCID= affil-num=1 en-affil=Okayama University kn-affil= affil-num=2 en-affil=Okayama University kn-affil= affil-num=3 en-affil=Okayama University kn-affil= affil-num=4 en-affil=Okayama University kn-affil= en-keyword=empirical software engineering kn-keyword=empirical software engineering en-keyword=data confidentiality kn-keyword=data confidentiality en-keyword=data mining kn-keyword=data mining END start-ver=1.4 cd-journal=joma no-vol=E103.D cd-vols= no-issue=8 article-no= start-page=1865 end-page=1874 dt-received= dt-revised= dt-accepted= dt-pub-year=2020 dt-pub=20200801 dt-online= en-article= kn-article= en-subject= kn-subject= en-title= kn-title=An Algorithm for Automatic Collation of Vocabulary Decks Based on Word Frequency en-subtitle= kn-subtitle= en-abstract= kn-abstract=This study focuses on computer-based foreign language vocabulary learning systems. Our objective is to automatically build vocabulary decks with desired levels of relative difficulty relations. To realize this goal, we exploit the fact that word frequency is a good indicator of vocabulary difficulty. Subsequently, for composing the decks, we pose two requirements: uniformity and diversity. Namely, the difficulty level of the cards in the same deck needs to be uniform enough so that they can be grouped together, and the difficulty levels of the cards in different decks need to be diverse enough so that they can be grouped in different decks.
To assess uniformity and diversity, we use rank-biserial correlation and propose an iterative algorithm, which helps in attaining desired levels of uniformity and diversity based on word frequency in daily language use. In the experiments, we employed spaced-repetition flashcard software and presented users with various decks built with the proposed algorithm, containing cards from different content types. From users' activity logs, we derived several behavioral variables and examined the polyserial correlation between these variables and difficulty levels across different word classes. This analysis confirmed that the decks compiled with the proposed algorithm induce an effect on behavioral variables in line with expectations. In addition, a series of experiments with decks involving varying content types confirmed that this relation is independent of word class. en-copyright= kn-copyright= en-aut-name=YücelZeynep en-aut-sei=Yücel en-aut-mei=Zeynep kn-aut-name= kn-aut-sei= kn-aut-mei= aut-affil-num=1 ORCID= en-aut-name=SupitayakulParisa en-aut-sei=Supitayakul en-aut-mei=Parisa kn-aut-name= kn-aut-sei= kn-aut-mei= aut-affil-num=2 ORCID= en-aut-name=MondenAkito en-aut-sei=Monden en-aut-mei=Akito kn-aut-name= kn-aut-sei= kn-aut-mei= aut-affil-num=3 ORCID= en-aut-name=LeelaprutePattara en-aut-sei=Leelaprute en-aut-mei=Pattara kn-aut-name= kn-aut-sei= kn-aut-mei= aut-affil-num=4 ORCID= affil-num=1 en-affil=Okayama University kn-affil= affil-num=2 en-affil=Okayama University kn-affil= affil-num=3 en-affil=Okayama University kn-affil= affil-num=4 en-affil=Department of Computer Engineering, Faculty of Engineering, Kasetsart University kn-affil= en-keyword=e-learning kn-keyword=e-learning en-keyword=vocabulary learning kn-keyword=vocabulary learning en-keyword=log file analysis kn-keyword=log file analysis END
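The first abstract above names its core generation steps: the Box-Muller transform to produce normally distributed random numbers, followed by an exponential transformation to obtain positive, right-skewed values typical of software effort data. The sketch below illustrates those two steps only; the function names, parameters, and the use of Python are illustrative assumptions, not the authors' implementation, and the paper's reordering step (which matches correlation coefficients between columns) is omitted.

```python
import math
import random

def box_muller(n, rng=random.random):
    """Generate n standard normal samples via the Box-Muller transform.

    Each pair of independent uniform(0,1) samples (u1, u2) yields two
    independent N(0,1) samples:
        z0 = sqrt(-2 ln u1) * cos(2*pi*u2)
        z1 = sqrt(-2 ln u1) * sin(2*pi*u2)
    """
    samples = []
    while len(samples) < n:
        u1, u2 = rng(), rng()
        if u1 == 0.0:  # avoid log(0)
            continue
        r = math.sqrt(-2.0 * math.log(u1))
        samples.append(r * math.cos(2.0 * math.pi * u2))
        samples.append(r * math.sin(2.0 * math.pi * u2))
    return samples[:n]

def mimic_column(mu_log, sigma_log, n):
    """Hypothetical helper: exponentiate scaled normal samples so that the
    generated values are positive and right-skewed (a log-normal-style
    transformation; a sketch, not the paper's exact recipe)."""
    return [math.exp(mu_log + sigma_log * z) for z in box_muller(n)]
```

For example, `mimic_column(1.0, 0.5, 100)` yields 100 strictly positive values whose logarithms are approximately normal with mean 1.0 and standard deviation 0.5; matching a confidential data set would additionally require fitting `mu_log` and `sigma_log` to that data and reordering the samples to reproduce inter-column correlations.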