c# - Fastest way to get directory data in .NET -
c# - Fastest way to get directory data in .NET -
this question has reply here:
is there faster way scan through directory recursively in .net? 8 answersi'm working on file synchronization service synchronize files between 2 folders on different machines. need find fast way enumerate directory , pull next info it:
a info construction or structures of file paths , sub-directory paths within directory, includes lastly write time each file or sub-directory. for each sub-directory found @ level below current directory, same above.so far, have come this:
static void main(string[] args) { list<tuple<string, datetime>> files = new list<tuple<string, datetime>>(); list<tuple<string, datetime>> directories = new list<tuple<string, datetime>>(); stopwatch watch = new stopwatch(); while (true) { watch.start(); while (!checkfolderrecursivesinglethreaded("c:\\", out files, out directories)) { // can assume intents , purposes drive c exist , have access it, cause sleep not called. thread.sleep(1000); } watch.stop(); console.writeline(watch.elapsedmilliseconds); watch.reset(); // information. thread.sleep(1000); } } static bool checkfolderrecursivesinglethreaded(string path, out list<tuple<string, datetime>> files, out list<tuple<string, datetime>> directories) { seek { directoryinfo directoryinformation = new directoryinfo(path); list<tuple<string, datetime>> filelist = new list<tuple<string, datetime>>(); foreach (fileinfo file in directoryinformation.getfiles()) { filelist.add(new tuple<string, datetime>(file.fullname, file.lastwritetimeutc)); } list<tuple<string, datetime>> directorylist = new list<tuple<string, datetime>>(); foreach (directoryinfo directory in directoryinformation.getdirectories()) { // check reparsepoint flag, indicate symbolic link. if (!directory.attributes.hasflag(fileattributes.reparsepoint)) { directorylist.add(new tuple<string, datetime>(directory.fullname, directory.lastwritetimeutc)); list<tuple<string, datetime>> directoryfiles; list<tuple<string, datetime>> directoryfolders; if (checkfolderrecursivesinglethreaded(directory.fullname, out directoryfiles, out directoryfolders)) { filelist.addrange(directoryfiles); directorylist.addrange(directoryfolders); } } } files = filelist; directories = directorylist; homecoming true; } grab { files = null; directories = null; homecoming false; } }
performance-wise, takes 22 seconds (regardless of running in release or debug mode without debugger attached) enumerate through c:\ drive , produce list of 549,254 files , 83,235 folders has access to, can faster? open suggestions, msvc++ suggestions.
edit: 12 seconds linq's asparallel because of multithreading (must tested in release mode). note parallelizes c:\ sub-folders, recursive calls made single-threaded implementation have above, otherwise take long time parallelize folders time!
static bool checkfolderparallelled(string path, out list<tuple<string, datetime>> files, out list<tuple<string, datetime>> directories) { seek { directoryinfo directoryinformation = new directoryinfo(path); list<tuple<string, datetime>> filelist = new list<tuple<string, datetime>>(); foreach (fileinfo file in directoryinformation.getfiles()) { filelist.add(new tuple<string, datetime>(file.fullname, file.lastwritetimeutc)); } list<tuple<string, datetime>> directorylist = new list<tuple<string, datetime>>(); directoryinformation.getdirectories().asparallel().forall(directory => { // check reparsepoint flag, indicate symbolic link. if (!directory.attributes.hasflag(fileattributes.reparsepoint)) { directorylist.add(new tuple<string, datetime>(directory.fullname, directory.lastwritetimeutc)); list<tuple<string, datetime>> directoryfiles; list<tuple<string, datetime>> directoryfolders; if (checkfolderrecursivesinglethreaded(directory.fullname, out directoryfiles, out directoryfolders)) { filelist.addrange(directoryfiles); directorylist.addrange(directoryfolders); } } }); files = filelist; directories = directorylist; homecoming true; } grab { files = null; directories = null; homecoming false; } }
edit: still 21 seconds using alexei's linked solution's accepted reply mark gravell. non-recursive technique not fastest (probably cost of keeping queue datatype live expensive cost of pushing , popping calls method on stack):
static bool checkfoldernonrecursive(string path, out list<tuple<string, datetime>> files, out list<tuple<string, datetime>> directories) { seek { list<tuple<string, datetime>> filelist = new list<tuple<string, datetime>>(); list<tuple<string, datetime>> directorylist = new list<tuple<string, datetime>>(); concurrentqueue<directoryinfo> pendingsearches = new concurrentqueue<directoryinfo>(); pendingsearches.enqueue(new directoryinfo(path)); directoryinfo pendingdirectory; while (pendingsearches.count > 0) { if (pendingsearches.trydequeue(out pendingdirectory)) { seek { foreach (fileinfo file in pendingdirectory.getfiles()) { filelist.add(new tuple<string, datetime>(file.fullname, file.lastwritetimeutc)); } foreach (directoryinfo directory in pendingdirectory.getdirectories()) { // check reparsepoint flag, indicate symbolic link. if (!directory.attributes.hasflag(fileattributes.reparsepoint)) { directorylist.add(new tuple<string, datetime>(directory.fullname, directory.lastwritetimeutc)); pendingsearches.enqueue(directory); } } } grab { } // ignore directories no access rights. } } files = filelist; directories = directorylist; homecoming true; } grab { files = null; directories = null; homecoming false; } }
edit: question open-ended towards .net, because there may faster way msvc++ libraries boost, have yet come across faster method. if can beat c# method faster c drive enumerator in c++ pulls same data, first of kudos doing faster, sec of interested see it, 3rd of help lot of people out (not myself). got far boost until realized next method took around 200,000 ms, much, much longer code i've posted above:
#include "stdafx.h" #include <iostream> #include <windows.h> #include <boost/filesystem.hpp> #include <boost/foreach.hpp> #include <boost/timer.hpp> namespace fs = boost::filesystem; bool iteratedirectory(const wchar_t *directory); int _tmain(int argc, _tchar* argv[]) { boost::timer timer = boost::timer(); while (true) { timer.restart(); // l makes wide, since iteratedirectory takes wchar_t. // r makes raw string literal, tells compiler parse string as-is, not escape characters , fancy tricks. iteratedirectory(lr"(c:\)"); std::cout << "elapsed time: " << timer.elapsed() * 1000 << " ms" << std::endl; sleep(1000); } homecoming 0; } // iteratedirectory takes wchar_t because path.c_str() returns wchar_t whether using unicode or multibyte. bool iteratedirectory(const wchar_t *directory) { if (boost::filesystem::exists(directory)) { fs::directory_iterator it(directory), eod; boost_foreach(fs::path path, std::make_pair(it, eod)) { seek { if (is_regular_file(path)) { //std::cout << path << ", lastly write time: " << last_write_time(path) << '.' << std::endl; } if (is_directory(path)) { //std::cout << path << ", lastly write time: " << last_write_time(path) << '.' << std::endl; // path.c_str() returns wchar_t, whether using unicode or multibyte. because of multi-language back upwards within of windows operating scheme , file structure. iteratedirectory(path.c_str()); } } grab (...) { } // ignore directories don't have access to. } homecoming true; } homecoming false; }
edit: using pinvoke findfirstfile , findnextfile took 6 seconds iterate entire c drive (thanks duplicate link , sam saffron's answer). but...can faster?
[dllimport("kernel32.dll", charset = charset.unicode, setlasterror = true)] public static extern intptr findfirstfilew(string lpfilename, out win32_find_dataw lpfindfiledata); [dllimport("kernel32.dll", charset = charset.unicode)] public static extern bool findnextfile(intptr hfindfile, out win32_find_dataw lpfindfiledata); [dllimport("kernel32.dll")] public static extern bool findclose(intptr hfindfile); [structlayout(layoutkind.sequential, charset = charset.unicode)] public struct win32_find_dataw { public fileattributes dwfileattributes; internal system.runtime.interopservices.comtypes.filetime ftcreationtime; internal system.runtime.interopservices.comtypes.filetime ftlastaccesstime; internal system.runtime.interopservices.comtypes.filetime ftlastwritetime; public int nfilesizehigh; public int nfilesizelow; public int dwreserved0; public int dwreserved1; [marshalas(unmanagedtype.byvaltstr, sizeconst = 260)] public string cfilename; [marshalas(unmanagedtype.byvaltstr, sizeconst = 14)] public string calternatefilename; } static intptr invalid_handle_value = new intptr(-1); static bool findnextfilepinvokerecursive(string path, out list<tuple<string, datetime>> files, out list<tuple<string, datetime>> directories) { list<tuple<string, datetime>> filelist = new list<tuple<string, datetime>>(); list<tuple<string, datetime>> directorylist = new list<tuple<string, datetime>>(); win32_find_dataw finddata; intptr findhandle = invalid_handle_value; list<tuple<string, datetime>> info = new list<tuple<string,datetime>>(); seek { findhandle = findfirstfilew(path + @"\*", out finddata); if (findhandle != invalid_handle_value) { { if (finddata.cfilename == "." || finddata.cfilename == "..") continue; string fullpath = path + (path.endswith("\\") ? string.empty : "\\") + finddata.cfilename; // check if directory , not symbolic link since symbolic links lead repeated files , folders infinite loops. if (finddata.dwfileattributes.hasflag(fileattributes.directory) && !finddata.dwfileattributes.hasflag(fileattributes.reparsepoint)) { directorylist.add(new tuple<string, datetime>(fullpath, finddata.ftlastwritetime.todatetime())); list<tuple<string, datetime>> subdirectoryfilelist = new list<tuple<string, datetime>>(); list<tuple<string, datetime>> subdirectorydirectorylist = new list<tuple<string, datetime>>(); if (findnextfilepinvokerecursive(fullpath, out subdirectoryfilelist, out subdirectorydirectorylist)) { filelist.addrange(subdirectoryfilelist); directorylist.addrange(subdirectorydirectorylist); } } else if (!finddata.dwfileattributes.hasflag(fileattributes.directory)) { filelist.add(new tuple<string, datetime>(fullpath, finddata.ftlastwritetime.todatetime())); } } while (findnextfile(findhandle, out finddata)); } } grab (exception exception) { console.writeline("caught exception while trying enumerate directory. {0}", exception.tostring()); if (findhandle != invalid_handle_value) findclose(findhandle); files = null; directories = null; homecoming false; } if (findhandle != invalid_handle_value) findclose(findhandle); files = filelist; directories = directorylist; homecoming true; } public static class filetimeextensions { public static datetime todatetime(this system.runtime.interopservices.comtypes.filetime filetime) { long highbits = filetime.dwhighdatetime; highbits = highbits << 32; homecoming datetime.fromfiletimeutc(highbits + (long)filetime.dwlowdatetime); } }
edit: yes, can faster. using techniques parallelize target folder's subdirectory recursions, can 4 seconds using above findnextfilepinvokerecursive method. that's 4 seconds iterate entire c drive info need. can see in process monitor, eat 30% cpu , 1% disk @ most, little odd me, not sure why @ moment, maybe linked list traversal style causes pretty negligible. ideally should eat 100% cpu @ least, might depend on number , depth of subfolders parallelize on. but can faster?!
static bool findnextfilepinvokerecursiveparalleled(string path, out list<tuple<string, datetime>> files, out list<tuple<string, datetime>> directories) { list<tuple<string, datetime>> filelist = new list<tuple<string, datetime>>(); list<tuple<string, datetime>> directorylist = new list<tuple<string, datetime>>(); win32_find_dataw finddata; intptr findhandle = invalid_handle_value; list<tuple<string, datetime>> info = new list<tuple<string, datetime>>(); seek { findhandle = findfirstfilew(path + @"\*", out finddata); if (findhandle != invalid_handle_value) { { if (finddata.cfilename == "." || finddata.cfilename == "..") continue; string fullpath = path + (path.endswith("\\") ? string.empty : "\\") + finddata.cfilename; // check if directory , not symbolic link since symbolic links lead repeated files , folders infinite loops. if (finddata.dwfileattributes.hasflag(fileattributes.directory) && !finddata.dwfileattributes.hasflag(fileattributes.reparsepoint)) { directorylist.add(new tuple<string, datetime>(fullpath, finddata.ftlastwritetime.todatetime())); } else if (!finddata.dwfileattributes.hasflag(fileattributes.directory)) { filelist.add(new tuple<string, datetime>(fullpath, finddata.ftlastwritetime.todatetime())); } } while (findnextfile(findhandle, out finddata)); directorylist.asparallel().forall(x => { list<tuple<string, datetime>> subdirectoryfilelist = new list<tuple<string, datetime>>(); list<tuple<string, datetime>> subdirectorydirectorylist = new list<tuple<string, datetime>>(); if (findnextfilepinvokerecursive(x.item1, out subdirectoryfilelist, out subdirectorydirectorylist)) { filelist.addrange(subdirectoryfilelist); directorylist.addrange(subdirectorydirectorylist); } }); } } grab (exception exception) { console.writeline("caught exception while trying enumerate directory. {0}", exception.tostring()); if (findhandle != invalid_handle_value) findclose(findhandle); files = null; directories = null; homecoming false; } if (findhandle != invalid_handle_value) findclose(findhandle); files = filelist; directories = directorylist; homecoming true; }
edit: forgot add together concurrency locks when using parallels, otherwise might grab exception. removed tuples , went fileinformation/directoryinformation class purposes. shaved off .5 seconds. 3.5 seconds enumerate c: drive.
[dllimport("kernel32.dll", charset = charset.unicode, setlasterror = true)] public static extern intptr findfirstfilew(string lpfilename, out win32_find_dataw lpfindfiledata); [dllimport("kernel32.dll", charset = charset.unicode)] public static extern bool findnextfile(intptr hfindfile, out win32_find_dataw lpfindfiledata); [dllimport("kernel32.dll")] public static extern bool findclose(intptr hfindfile); [structlayout(layoutkind.sequential, charset = charset.unicode)] public struct win32_find_dataw { public fileattributes dwfileattributes; internal system.runtime.interopservices.comtypes.filetime ftcreationtime; internal system.runtime.interopservices.comtypes.filetime ftlastaccesstime; internal system.runtime.interopservices.comtypes.filetime ftlastwritetime; public int nfilesizehigh; public int nfilesizelow; public int dwreserved0; public int dwreserved1; [marshalas(unmanagedtype.byvaltstr, sizeconst = 260)] public string cfilename; [marshalas(unmanagedtype.byvaltstr, sizeconst = 14)] public string calternatefilename; } static intptr invalid_handle_value = new intptr(-1); static bool findnextfilepinvokerecursive(string path, out list<fileinformation> files, out list<directoryinformation> directories) { list<fileinformation> filelist = new list<fileinformation>(); list<directoryinformation> directorylist = new list<directoryinformation>(); win32_find_dataw finddata; intptr findhandle = invalid_handle_value; list<tuple<string, datetime>> info = new list<tuple<string, datetime>>(); seek { findhandle = findfirstfilew(path + @"\*", out finddata); if (findhandle != invalid_handle_value) { { // skip current directory , parent directory symbols returned. if (finddata.cfilename != "." && finddata.cfilename != "..") { string fullpath = path + @"\" + finddata.cfilename; // check if directory , not symbolic link since symbolic links lead repeated files , folders infinite loops. if (finddata.dwfileattributes.hasflag(fileattributes.directory) && !finddata.dwfileattributes.hasflag(fileattributes.reparsepoint)) { directorylist.add(new directoryinformation { fullpath = fullpath, lastwritetime = finddata.ftlastwritetime.todatetime() }); list<fileinformation> subdirectoryfilelist = new list<fileinformation>(); list<directoryinformation> subdirectorydirectorylist = new list<directoryinformation>(); if (findnextfilepinvokerecursive(fullpath, out subdirectoryfilelist, out subdirectorydirectorylist)) { filelist.addrange(subdirectoryfilelist); directorylist.addrange(subdirectorydirectorylist); } } else if (!finddata.dwfileattributes.hasflag(fileattributes.directory)) { filelist.add(new fileinformation { fullpath = fullpath, lastwritetime = finddata.ftlastwritetime.todatetime() }); } } } while (findnextfile(findhandle, out finddata)); } } grab (exception exception) { console.writeline("caught exception while trying enumerate directory. {0}", exception.tostring()); if (findhandle != invalid_handle_value) findclose(findhandle); files = null; directories = null; homecoming false; } if (findhandle != invalid_handle_value) findclose(findhandle); files = filelist; directories = directorylist; homecoming true; } static bool findnextfilepinvokerecursiveparalleled(string path, out list<fileinformation> files, out list<directoryinformation> directories) { list<fileinformation> filelist = new list<fileinformation>(); object filelistlock = new object(); list<directoryinformation> directorylist = new list<directoryinformation>(); object directorylistlock = new object(); win32_find_dataw finddata; intptr findhandle = invalid_handle_value; list<tuple<string, datetime>> info = new list<tuple<string, datetime>>(); seek { path = path.endswith(@"\") ? path : path + @"\"; findhandle = findfirstfilew(path + @"*", out finddata); if (findhandle != invalid_handle_value) { { // skip current directory , parent directory symbols returned. if (finddata.cfilename != "." && finddata.cfilename != "..") { string fullpath = path + finddata.cfilename; // check if directory , not symbolic link since symbolic links lead repeated files , folders infinite loops. if (finddata.dwfileattributes.hasflag(fileattributes.directory) && !finddata.dwfileattributes.hasflag(fileattributes.reparsepoint)) { directorylist.add(new directoryinformation { fullpath = fullpath, lastwritetime = finddata.ftlastwritetime.todatetime() }); } else if (!finddata.dwfileattributes.hasflag(fileattributes.directory)) { filelist.add(new fileinformation { fullpath = fullpath, lastwritetime = finddata.ftlastwritetime.todatetime() }); } } } while (findnextfile(findhandle, out finddata)); directorylist.asparallel().forall(x => { list<fileinformation> subdirectoryfilelist = new list<fileinformation>(); list<directoryinformation> subdirectorydirectorylist = new list<directoryinformation>(); if (findnextfilepinvokerecursive(x.fullpath, out subdirectoryfilelist, out subdirectorydirectorylist)) { lock (filelistlock) { filelist.addrange(subdirectoryfilelist); } lock (directorylistlock) { directorylist.addrange(subdirectorydirectorylist); } } }); } } grab (exception exception) { console.writeline("caught exception while trying enumerate directory. {0}", exception.tostring()); if (findhandle != invalid_handle_value) findclose(findhandle); files = null; directories = null; homecoming false; } if (findhandle != invalid_handle_value) findclose(findhandle); files = filelist; directories = directorylist; homecoming true; } public class fileinformation { public string fullpath; public datetime lastwritetime; } public class directoryinformation { public string fullpath; public datetime lastwritetime; }
edit: b.k. asking conversion datetime filetime:
public static class filetimeextensions { public static datetime todatetime(this system.runtime.interopservices.comtypes.filetime time) { ulong high = (ulong)time.dwhighdatetime; ulong low = (ulong)time.dwlowdatetime; long filetime = (long)((high << 32) + low); homecoming datetime.fromfiletimeutc(filetime); } }
use linq , parallel tasks
var stuff = dir.getfiles("*.*", system.io.searchoption.alldirectories); parallel.foreach(stuff, p=>{ //do things in parrallel.. }); //or var q = stuff.asparallel().where(x => p(x)).orderby(x => k(x)).select(x => f(x)); foreach (var e in q) a(e);
c# .net windows visual-c++ boost
Comments
Post a Comment