public byte[] HashImage(string fileName) { using (var image = new Bitmap(fileName)) { var sha256 = SHA256.Create(); var rect = new Rectangle(0, 0, image.Width, image.Height); var data = image.LockBits(rect, ImageLockMode.ReadOnly, image.PixelFormat); var dataPtr = data.Scan0; var totalBytes = (int)Math.Abs(data.Stride) * data.Height; var rawData = new byte[totalBytes]; System.Runtime.InteropServices.Marshal.Copy(dataPtr, rawData, 0, totalBytes); image.UnlockBits(data); return sha256.ComputeHash(rawData); } } private Tuple<int, string> GetIndexedImage(string fileName) { var baseFileName = Path.GetFileNameWithoutExtension(fileName); int index; if (int.TryParse(baseFileName, out index)) { return Tuple.Create(index, fileName); } return null; } private string HashToString(byte[] hash) { var builder = new StringBuilder(); foreach (var b in hash) { builder.AppendFormat("{0:x2}", b); } return builder.ToString(); }
HashImage() gets the SHA256 Hash of the image as bytes.
GetFileIndex() returns a Tuple of index and file name.
HashToString() is a string builder that will convert the hashes to the string.
private void button1_Click(object sender, EventArgs e) { string original = @"C:\Users\user\Documents\CaptchaCollection\"; var equivalentImages = Directory.GetFiles(original) .Select(f => GetIndexedImage(f)) // build tuples (index, fileName) or null if parsing failed .Where(t => t != null) // ignore all invalid ones .OrderBy(t => t.Item1) // order by index .Select(t => Tuple.Create(HashImage(t.Item2), t.Item2)) // create new tuple (hash, fileName) .GroupBy(t => t.Item1); // group by Hash // print groups foreach (var group in equivalentImages) { Console.WriteLine("All images with hash: {0}", HashToString(group.Key)); foreach (var t in group) { Console.WriteLine("\t{0}", t.Item2); } } }
Now this is like a main method when I click the button I start to get the hashes of the image files.
However, now the main problem is that it seems like when I collect the
output of the hashes from the files, they are not grouped but instead they are printed in order. Instead they are scattered all over the output file.
How do I group any group and duplicated elements with the same hashes as each other?