diff --git a/dataFilter.py b/dataFilter.py index 7f4a9d2c90eae390e326beee972c087a83db7ec1..8bda05b28190995996f0278b814f0eef78b4ddf0 100755 --- a/dataFilter.py +++ b/dataFilter.py @@ -3,7 +3,7 @@ # Todo: # > use global vars for Col-indices. each function that changes em updates em # > compat /w ecoTrack ? 1nt3gr4t3. -# > think about putting blox-config from dataFilter rand dataSync in project-globl conf-file! +# > think about putting blox-config from dataFilter and dataSync in project-global conf-file! # import sys, os diff --git a/dataSync.py b/dataSync.py index f6b7220d6de0edb3876eb9c6956e51268e683d10..093e18189ce1818a9ae6b86960436b6075a185d1 100755 --- a/dataSync.py +++ b/dataSync.py @@ -1,22 +1,20 @@ #!/usr/bin/python3 # clean, sort, merge raw data -# +# ecolux and ecotrack Projects # output one huge table with all necessary fields for further processing/vizualization # -# files are 'skyglow.log','tags' and trackfiles (containing 'unit-x' in name, where x is a number) -# ############################ # TODO ############################ +# > rewrite getXData to initXData +# > add block name to blox. > add it to update-tags.sh +# # > parse moonmap and skyglowdict from skyglow Project sources # -> maybe integrate skyglow project as submodule # > integrate with ST # > make Skyglow optional -> TEST # > ST-Filenames should include 'unit-X'! -# # > how to process skylog error in merge() -# > ECOTRACK: include temperature data. Also humidity?? -# # > Speed Up! (takes 2min per 1mio lines) # 1. Use Threads for # https://realpython.com/python-concurrency/#multiprocessing-version @@ -40,21 +38,29 @@ blox=[ ("ecolux","./block-I/01-raw","./block-I/02-inspect","2020-07-21 00:00:00","2020-08-18 23:59:00"), # Block1: 21.7 - 18.8 ("ecolux","./block-II/01-raw","./block-II/02-inspect","2020-09-15 00:00:00","2020-10-13 23:59:00"), # Block2: 15.9 - 13.10 ("ecolux","./block-all/01-raw","./block-all/02-inspect","2020-07-21 00:00:00","2020-11-11 23:59:00"), # Blox1&2 + 4Weex -("ecotrack","/opt/virtualBoxVMs/share/block-I/01-raw","/opt/virtualBoxVMs/share/block-I/02-inspect","2019-10-14 00:00:00","2019-10-16 23:59:00"), -("ecolux","./test","./test","2019-09-14 00:00:00","2021-09-16 23:59:00"), +("ecotrack","/opt/virtualBoxVMs/share/block-I/01-raw","/opt/virtualBoxVMs/share/block-I/02-inspect","2019-09-26 00:00:00","2019-10-21 23:59:00"), +#temp 26.9 - 21.10 +("ecotrack","./test","./test","2019-09-26 00:00:00","2019-10-21 23:59:00"), +# +#NOTE: ecotrack time is in CEST (both track and temp) => UTC+2 +#("ecotrack",in,out,"2019-06-29 00:00:00", "2019-07-18 00:00:00") #block1 +#("ecotrack",in,out,"2019-08-05 00:00:00", "2019-08-16 00:00:00") #block2 +#("ecotrack",in,out,"2019-09-27 00:00:00","2019-10-11 00:00:00") #block3 +# ## -- add here -- ## ] -TIME_FMT='%Y-%m-%d %H:%M:00' +# TIME_FMT='%Y-%m-%d %H:%M:00' +TIME_FMT='%Y-%m-%d %H:%M:%S' noTime = dt.fromtimestamp(0) TAG_LEN=len("04B94A7F7288588022") RULER=f"{'#'*100}\n" rootPath = os.path.abspath( os.path.dirname( __file__ ) ) #uses cwd(), which is set to scriptdir at start +N_UNITS=12 def initFileStructure(): - # trackfile - ########## + ## trackFiles global T_DELIM, TIMESTAMP_LEN, T_COL_MS, T_COL_X, T_COL_Y, T_COL_TAG, T_MINLEN, T_MAXLEN, T_NCOLS T_DELIM=';' TIMESTAMP_LEN=10 @@ -64,18 +70,19 @@ def initFileStructure(): T_COL_Y=3 T_COL_TAG=4 #t_col_signalstrength=5 - if PROJ == "ecolux": + if PROJ == "ecolux": #track+light T_MINLEN = 38 # without newline. - T_MAXLEN = 40 # ms column varies 1-3. Will be padded later for uniform len + T_MAXLEN = 40 # ms are variable in length (0-999). Will be padded later for uniform len T_NCOLS = 6 - elif PROJ == "ecotrack": #ecotron temperature project 2019 + elif PROJ == "ecotrack": #track+temp 2019 T_MINLEN = 36 T_MAXLEN = 38 T_NCOLS = 5 + # elif PROJ == "schrebatron": + #SchrebaTron: ms is padded with zeros - # lightfile - ########## if hasLight(): + ## lightFile global L_DELIM, L_COL_MOON_REAL, L_COL_MOON_DMX, L_COL_SKYGLOW, L_MINLEN, L_MAXLEN, L_NCOLS L_DELIM='\t' L_COL_MOON_REAL=1 @@ -84,9 +91,21 @@ def initFileStructure(): L_MINLEN = 61 #2020-07-21 15:15:00 0.0 0 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] L_MAXLEN = 76 #2020-07-21 15:09:00 0.0 0 [15, 54, 4, 19, 0, 1, 5, 0, 40, 12, 123, 123] L_NCOLS = 4 - - # animalfile - ########### + + if hasTemp(): + ## tempFile (C for Celsius) + global C_TIME_FMT, C_COL_TIME, C_COL_UNIT, C_COL_HABITAT, C_COL_TEMP, C_MIN_LEN, C_MAX_LEN, C_DELIM, C_NCOLS + C_TIME_FMT = "%Y-%m-%d %H:%M:%S.000" + C_COL_TIME = 0 + C_COL_UNIT = 1 + C_COL_HABITAT = 2 #??? qu'est-ce que c'est + C_COL_TEMP = 3 + C_MIN_LEN = 30 #coarse approximation + C_MAX_LEN = 40 + C_DELIM = ';' + C_NCOLS = 4 + + ## animalFile global A_DELIM, A_MINLEN, A_MAXLEN, A_NCOLS, A_COL_SPECIES, A_COL_SPECIES_IND, A_COL_WEIGHT_WO_TAG_MG, A_COL_TAG if PROJ == "ecolux": A_DELIM='\t' @@ -99,8 +118,8 @@ def initFileStructure(): A_COL_TAG=3 elif PROJ == "ecotrack": A_DELIM=';' - A_MINLEN = 39 - A_MAXLEN = 44 + A_MINLEN = 30 + A_MAXLEN = 45 A_NCOLS = 7 #/w comments A_COL_SPECIES=1 A_COL_SPECIES_IND=0 @@ -110,10 +129,11 @@ def initFileStructure(): # out file ########### - global OUT_FMT, DELIM, COL_UID, COL_TAG + global OUT_FMT, DELIM, COL_UID, COL_HID, COL_TAG OUT_FMT='%Y%m%d' DELIM=',' COL_UID=4 + COL_HID=5 COL_TAG=8 @@ -200,6 +220,9 @@ skyglowDict = { def hasLight(): return PROJ == "ecolux" +def hasTemp(): + return PROJ == "ecotrack" + def getUID(fileName): """ extract and return unit-id from fileName """ f=fileName.split('-') @@ -209,8 +232,7 @@ def getUID(fileName): class Data: - def __init__(self,animalFile, skyglowFile, trackFiles): - + def __init__(self, animalFile, skyglowFile, tempFile, trackFiles): self.animalData = self.getAnimalData(animalFile) self.trackData = [] @@ -227,6 +249,9 @@ class Data: self.startLight = self.getLightTime( self.lightData[0] ) self.endLight = self.getLightTime( self.lightData[-1] ) + if hasTemp(): + self.initTempData(tempFile) + global OUT_FILE OUT_FILE = f"{PROJ}-data-{self.startTrack.strftime(OUT_FMT)}-{self.endTrack.strftime(OUT_FMT)}" @@ -264,16 +289,7 @@ class Data: x = int(cols[T_COL_X]) y = int(cols[T_COL_Y]) - if x in (1,2) and y in (1,2): - habitat=1 - elif x in (5,6) and y in (1,2): - habitat=2 - elif x in (1,2) and y in (5,6): - habitat=3 - elif x in (5,6) and y in (5,6): - habitat=4 - else: - habitat=0 + habitat = self.calcHabitat(x,y) # add columns (date,time,uid,habitat) line = line[:idx_after_ms] + DELIM + date + DELIM + time + DELIM + str(uid) + DELIM + str(habitat) + line[idx_after_ms:] @@ -286,6 +302,32 @@ class Data: assert (lines), "trackData: Not valid. Check times / formatting." return lines + def calcHabitat(self,x,y): + if PROJ == "ecolux": + if x in (1,2) and y in (1,2): + return 1 + elif x in (5,6) and y in (1,2): + return 2 + elif x in (1,2) and y in (5,6): + return 3 + elif x in (5,6) and y in (5,6): + return 4 + else: + return 0 + elif PROJ == "ecotrack": + if x in (1,2) and y in (1,2,5,6): + return 2 + elif x in (1,2) and y in (3,4,7,8): + return 3 + elif x in (3,4) and y in (1,2,5,6): + return 3 + elif x in (3,4) and y in (3,4,7,8): + return 4 + else: + return "NA" + else: + return "NA" + def getLightData(self,file): with open(file) as f: @@ -297,27 +339,54 @@ class Data: assert (lines), "skyglow: Not valid. Check times / formatting." return lines + def initTempData(self,file): + """ get temp data and save in a list of list of tuples >> tempData[unit][habitat] returns (time,temp)""" + pp(f"processing: {file}") + with open(file) as f: + lines = f.readlines() + + lines = self.clean(lines, C_MIN_LEN, C_MAX_LEN, C_NCOLS, C_DELIM, self.getTempTime) + lines.sort() + + self.startTemp = self.getTempTime( lines[0] ) + self.endTemp = self.getTempTime( lines[-1] ) + self.lenTemp = len(lines) + + self.tempData = [[[],[],[],[]] for _ in range(N_UNITS) ] #create list of empty lists containing empty lists + for l in lines: + cols = l.split(DELIM) + + uid = int( cols[C_COL_UNIT] ) - 1 + hid = int( cols[C_COL_HABITAT] ) - 1 + time = dt.strptime( cols[C_COL_TIME],C_TIME_FMT ) + temp = float( cols[C_COL_TEMP] ) + + self.tempData[uid][hid].append( (time,temp) ) + def getAnimalData(self,file): with open(file) as f: lines = f.readlines() lines = self.clean(lines, A_MINLEN, A_MAXLEN, A_NCOLS, A_DELIM) - #TODO: check for duplicate tags and print warning / ignore? + #TODO: check for duplicate tags and print warning / ignore? assert (lines), "tags: Not valid. Check times / formatting." + return lines def clean(self, lines, minLen, maxLen, nCols, sep, timeFunc=None): """ remove trailing newline, empty, comments and all-digits and remove lines before startTime or after endTime""" lines = [l.strip() for l in lines] - lines = [l for l in lines if minLen <= len(l) <= maxLen] # ecolux: ms are variable in length (0-999), SchrebaTron: ms is padded with zeros + lines = [l for l in lines if minLen <= len(l) <= maxLen] lines = list(filter(lambda q: q and q[0] != '#', lines)) lines = [l for l in lines if len(l.split(sep)) == nCols] lines = [l.replace(sep,DELIM) for l in lines] + lines = [l.replace('\ufeff','') for l in lines] #cutout BOM (ByteOrderMark). Showed up in ecotrack temp-csv. + if timeFunc: lines = list(filter(lambda q: startTime <= timeFunc(q) < endTime, lines)) #only between start and endTime - + return lines def parseLightLine(self, line, uid): @@ -339,11 +408,13 @@ class Data: def merge(self): - self.merge_animals() + self.mergeAnimals() if hasLight(): - self.merge_light() + self.mergeLight() + if hasTemp(): + self.mergeTemp() - def merge_animals(self): + def mergeAnimals(self): """ merge AnimalData into TrackData on TagID """ pp("merging animal data into track data. This might take a while. Time to move around. Or grab a Coffee.") @@ -371,7 +442,7 @@ class Data: pp(f"Didn't find Tags for {notFound} tracking events!") - def merge_light(self): + def mergeLight(self): """merge LightData into TrackData on Time""" pp("merging light data into track data") @@ -380,15 +451,15 @@ class Data: lightTime = self.startLight trackTime = self.startTrack - while( trackTime < lightTime ): + while trackTime < lightTime: self.trackData[track_idx] += f'{DELIM}NA{DELIM}NA{DELIM}NA' track_idx+=1 - if( track_idx >= len(self.trackData) ): + if track_idx >= len(self.trackData): pp("skyglow.log starts after track.log...") return trackTime = self.getTrackTime(self.trackData[track_idx]) - pp(f"start merge @ {trackTime} -- idx {track_idx}") + pp(f"start light merge @ {trackTime} -- idx {track_idx}") for trackLine in self.trackData[track_idx:]: trackTime = self.getTrackTime(trackLine) @@ -406,6 +477,54 @@ class Data: self.trackData[track_idx] = trackLine track_idx+=1 + def mergeTemp(self): + """merge tempData into TrackData on Time""" + pp("merging temperature data into track data") + + track_idx = 0 + trackTime = self.startTrack + temp_idxs = [[0,0,0,0] for _ in range(N_UNITS) ] + + #fill NA's for tracking events starting before temperature + while trackTime < self.startTemp: + self.trackData[track_idx] += f'{DELIM}NA' + track_idx+=1 + if track_idx >= len(self.trackData): + pp("temperature starts after trackdata...") + return + trackTime = self.getTrackTime(self.trackData[track_idx]) + + + pp(f"start temperature merge @ {trackTime} -- idx {track_idx}") + + for trackLine in self.trackData[track_idx:]: + + cols = trackLine.split(DELIM) + hid = int(cols[COL_HID]) - 1 + uid = int(cols[COL_UID]) - 1 + trackTime = self.getTrackTime(trackLine) + +# earliestTime = min(self.tempData[uid][1],self.tempData[uid][2],self.tempData[uid][3],self.tempData[uid][4]) + + # get temp idx for temp within 2:30min + delta = datetime.timedelta(minutes=2, seconds=30) + i=temp_idxs[uid][hid] + while self.tempData[uid][hid][i][0] < trackTime-delta: + # pp(f"{self.tempData[uid][hid][i][0]} < {trackTime} - {delta} ?? {self.tempData[uid][hid][i][0] < trackTime-delta}") + if i >= len(self.tempData[uid][hid]): + break; + + i+=1 + + temp_idxs[uid][hid] = i + tempTime, temp = self.tempData[uid][hid][i] + + # add temp col + trackLine += DELIM + str(temp) + + self.trackData[track_idx] = trackLine + track_idx+=1 + def getLightTime(self,line): if not hasLight(): return @@ -421,7 +540,7 @@ class Data: winterTime = dt.strptime("2020-10-25 03:00:00",TIME_FMT) hours= ( 2 if lightTime < winterTime else 1 ) - lightTime -= datetime.timedelta(hours) + lightTime -= datetime.timedelta(hours=hours) # -2h in summmer, -1h in winter return lightTime @@ -436,19 +555,32 @@ class Data: return dt.fromtimestamp(timestamp) + def getTempTime(self,line): + cols = line.split(DELIM) + return dt.strptime(cols[C_COL_TIME],C_TIME_FMT) + def header(self): h= f"# {PROJ} data\n" \ f"{RULER}" \ - f"# len(Track): {len(self.trackData)}\n" \ - f"# startTrack: {self.startTrack}\n" \ - f"# endTrack: {self.endTrack}\n" \ - f"# len(animal): {len(self.animalData)}\n" + f"# len(Track): {len(self.trackData)}\n" \ + f"# startTrack: {self.startTrack}\n" \ + f"# endTrack: {self.endTrack}\n" \ + f"# len(animal): {len(self.animalData)}\n" if hasLight(): - h += f"# len(Light): {len(self.lightData)}\n" \ - f"# startLight: {self.startLight}\n" \ - f"# endLight: {self.endLight}\n" + h += f"# len(Light): {len(self.lightData)}\n" \ + f"# startLight: {self.startLight}\n" \ + f"# endLight: {self.endLight}\n" + if hasTemp(): + h += f"# len(Temperature): {self.lenTemp}\n" \ + f"# startTemp: {self.startTemp}\n" \ + f"# endTemp: {self.endTemp}\n" h += RULER - h += f"#timestamp, ms, date, time, unit, habitat, x, y, tag, species, speciesnumber, weight_without_tag[mg]{', moon_real[mLux], moon_eco[mLux], skyglow[Lux]' if hasLight() else ''}\n" + h += f"#timestamp, ms, date, time, unit, habitat, x, y, tag, species, speciesnumber, weight_without_tag[mg]" + if hasLight(): + h+=', moon_real[mLux], moon_eco[mLux], skyglow[Lux]' + if hasTemp(): + h+=', temp[C°]' + h +="\n" h += RULER return h @@ -475,29 +607,41 @@ def xtract(): def getFileList(): - animalFile, skyglowFile, trackFiles = '','',[] print( "Get files." ) + animalFile, skyglowFile, tempFile, trackFiles = '','','',[] + animalFileName="tags" + skyglowFileName="skyglow.log" + trackFileSub="unit-" + tempFileSub="emperature" + for root, dirs, files in os.walk(INPUTDIR): #walk recursively for file in files: - if "unit" in file and not file.endswith(".tgz") and not file.endswith(".tar.gz"): + if file.endswith(".tgz") or file.endswith(".tar.gz"): + continue + elif trackFileSub in file: trackFiles.append(file) - elif file == "tags": + elif file == animalFileName: animalFile = INPUTDIR+os.sep+file print(f"Found animalFile: {animalFile}") - elif hasLight() and file == "skyglow.log": + elif hasLight() and file == skyglowFileName: skyglowFile = INPUTDIR+os.sep+file print(f"Found logfile: {skyglowFile}") + elif hasTemp() and tempFileSub in file: + print(f"Found tempFile: {file}") + tempFile = INPUTDIR+os.sep+file else: pp(f"Ignoring ({file})") + if hasTemp() and not tempFile: + sys.exit(f"Couldn't find temperature data (filename containing '{tempFileSub}')") if not trackFiles: - sys.exit("No tracking data found. (filename containing unit-X, where X is a number)") + sys.exit(f"No tracking data found. (filenames containing '{trackFileSub}X', where X is a number)") if hasLight() and not skyglowFile: - sys.exit(f"Couldn't find 'skyglow.log'") + sys.exit(f"Couldn't find {skyglowFileName}") if not animalFile: - sys.exit(f"Couldn't find 'tags'") + sys.exit(f"Couldn't find {animalFileName}") - return animalFile, skyglowFile, trackFiles + return animalFile, skyglowFile, tempFile, trackFiles def make_rel_abs_path(path): @@ -509,10 +653,10 @@ def make_rel_abs_path(path): def main(): initFileStructure() - xtract() + # xtract() - animalFile, skyglowFile, trackFiles = getFileList() - data = Data(animalFile,skyglowFile,trackFiles) + animalFile, skyglowFile, tempFile, trackFiles = getFileList() + data = Data(animalFile,skyglowFile,tempFile,trackFiles) data.merge() data.write() @@ -520,10 +664,10 @@ def main(): if( platform.system() == "Linux" ): os.system(rootPath+"/updateGitIgnore.sh") - # os.system(rootPath+"/unknown-tags.sh ") + BLOCKS[1] <--- I , II, Ia, all ... TODO + os.system(rootPath+"/unknown-tags.sh ") #+ BLOCKS[1] <--- I , II, Ia, all ... TODO if __name__ == "__main__" : - known_projects=["ecotrack","ecolux","schrebatron"] + known_projects=["ecotrack","ecolux"] #,"schrebatron" print(f" Process raw data for which block?\n {RULER}") for idx, b in enumerate(blox): print(f" {idx+1}) Project {b[0]}") @@ -543,7 +687,7 @@ if __name__ == "__main__" : raise Exception(f"Unknown Project '{PROJ}'") except Exception as e: - print(f"Error: {e}\n") + print(f"Error: {e}\n") print("You can add a configuration by adding a line to the 'blox' structure. It has following form:\n") print(" (Project, inDir, outDir, start, end)\n" ) print(f"> Project must be in {known_projects}" ) @@ -554,4 +698,4 @@ if __name__ == "__main__" : start = time.time() main() end = time.time() - print(f"Took {end - start}") + print(f"Took {end - start} seconds") diff --git a/dataViz.py b/dataViz.py index 5e775faed65cadb93deb125d2fdb0cf8b089e738..4c7ac559a2ca1a09f05e934f8bf30332922262a3 100755 --- a/dataViz.py +++ b/dataViz.py @@ -2,7 +2,7 @@ # # VIZ (ET/ST) ################### -# sa: /home/kr69sugu/code/idiv/schrebatron3000Data/08_alpha_rep1_rep2/02_data_inspect/ana.py +# sa: ../schrebatrondata/08_alpha_rep1_rep2/02_data_inspect/ana.py # preq: should work on one huge table with all data! # > amount PatchCrossingEvents - Time -> activity histogram per unit/rep # > detection_duration_plot diff --git a/unknown-tags.sh b/unknown-tags.sh index 8686c5ea56c213be58a34149ccb071da7d6f829a..453f0611313d0ad39e943a744d187840ce591a49 100755 --- a/unknown-tags.sh +++ b/unknown-tags.sh @@ -10,7 +10,7 @@ if (( $# < 1 )); then echo "2. $this n list" echo " write list of unknown tags together with their occurrences to file unknowntags-block-{n}" echo "" - echo "Looks for data-file ( ${scriptdir}/block-{n}/02-inspect/ecolux-data-*-filtered )" + echo "Looks for data-file ( ${scriptdir}/block-{n}/02-inspect/eco*-data-*-filtered )" exit fi @@ -25,7 +25,7 @@ COL_TAG=13 HEADER_SIZE=1 # fi -in=`ls ${scriptdir}/block-${1}/02-inspect/ecolux-data-*-filtered | head -1` +in=`ls ${scriptdir}/block-${1}/02-inspect/eco*-data-*-filtered | head -1` LEN_W_HEADER=`wc -l "$in" | cut -d" " -f1` LEN=$(( $LEN_W_HEADER - $HEADER_SIZE))